diff -urpN -X /home/fletch/.diff.exclude 000-bk3/Documentation/basic_profiling.txt 900-mjb3/Documentation/basic_profiling.txt
--- 000-bk3/Documentation/basic_profiling.txt	Wed Dec 31 16:00:00 1969
+++ 900-mjb3/Documentation/basic_profiling.txt	Sat Mar  8 22:03:41 2003
@@ -0,0 +1,48 @@
+These instructions are deliberately very basic. If you want something clever,
+go read the real docs ;-) Please don't add more stuff, but feel free to
+correct my mistakes ;-) (mbligh@aracnet.com)
+Thanks to John Levon, Dave Hansen, et al. for help writing this.
+
+<test> is the thing you're trying to measure.
+Make sure you have the correct System.map / vmlinux referenced!
+IMHO it's easier to use "make install" for linux and hack /sbin/installkernel
+to copy config files, System.map, and vmlinux to /boot.
+
+Readprofile
+-----------
+You need a fixed readprofile command for 2.5 ... either get hold of
+a current version from:
+http://www.kernel.org/pub/linux/utils/util-linux/
+or get a readprofile binary fixed for 2.5 / akpm's 2.5 patch from
+ftp://ftp.kernel.org/pub/linux/kernel/people/mbligh/tools/readprofile/
+
+Add "profile=2" to the kernel command line.
+
+clear		readprofile -r
+dump output	readprofile -m /boot/System.map > captured_profile
+
+Oprofile
+--------
+Get the source (I use 0.5) from http://oprofile.sourceforge.net/
+and add "idle=poll" to the kernel command line.
+Configure with CONFIG_PROFILING=y and CONFIG_OPROFILE=y & reboot on the new
+kernel.
+./configure --with-kernel-support
+make install
+
+One-time setup (pick the appropriate one for your CPU):
+	P3		opcontrol --setup --vmlinux=/boot/vmlinux \
+			  --ctr0-event=CPU_CLK_UNHALTED --ctr0-count=100000
+	Athlon/x86-64	opcontrol --setup --vmlinux=/boot/vmlinux \
+			  --ctr0-event=RETIRED_INSNS --ctr0-count=100000
+	P4		opcontrol --setup --vmlinux=/boot/vmlinux \
+			  --ctr0-event=GLOBAL_POWER_EVENTS \
+			  --ctr0-unit-mask=1 --ctr0-count=100000
+
+start daemon	opcontrol --start-daemon
+clear		opcontrol --reset
+start		opcontrol --start
+
+stop		opcontrol --stop
+dump output	oprofpp -dl -i /boot/vmlinux > output_file
+
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/Documentation/filesystems/proc.txt 900-mjb3/Documentation/filesystems/proc.txt
--- 000-bk3/Documentation/filesystems/proc.txt	Thu Feb 13 11:08:01 2003
+++ 900-mjb3/Documentation/filesystems/proc.txt	Sat Mar  8 22:04:05 2003
@@ -37,6 +37,7 @@ Table of Contents
  2.8  /proc/sys/net/ipv4 - IPV4 settings
  2.9  Appletalk
  2.10 IPX
+ 2.11 /proc/sys/sched - scheduler tunables
 
 ------------------------------------------------------------------------------
 Preface
@@ -1662,6 +1663,104 @@ IPX.
 The /proc/net/ipx_route table holds a list of IPX routes. For each route it
 gives the destination network, the router node (or Directly) and the network
 address of the router (or Connected) for internal networks.
+
+2.11 /proc/sys/sched - scheduler tunables
+-----------------------------------------
+
+Useful knobs for tuning the scheduler live in /proc/sys/sched.
+
+child_penalty
+-------------
+
+Percentage of the parent's sleep_avg that children inherit. sleep_avg is
+a running average of the time a process spends sleeping. Tasks with high
+sleep_avg values are considered interactive and given a higher dynamic
+priority and a larger timeslice. You typically want this set to a value
+just under 100.
+
+exit_weight
+-----------
+
+When a CPU hog task exits, its parent's sleep_avg is reduced by a factor of
+exit_weight against the exiting task's sleep_avg.
+
+interactive_delta
+-----------------
+
+If a task is "interactive" it is reinserted into the active array after it
+has expired its timeslice, instead of being inserted into the expired array.
+How "interactive" a task must be in order to be deemed interactive is a
+function of its nice value. This interactive limit is scaled linearly by nice
+value and is offset by the interactive_delta.
+
+max_sleep_avg
+-------------
+
+max_sleep_avg is the largest value (in ms) stored for a task's running sleep
+average. The larger this value, the longer a task needs to sleep to be
+considered interactive (the maximum interactive bonus is a function of
+max_sleep_avg).
+
+max_timeslice
+-------------
+
+Maximum timeslice, in milliseconds. This is the value given to tasks of the
+highest dynamic priority.
+
+min_timeslice
+-------------
+
+Minimum timeslice, in milliseconds. This is the value given to tasks of the
+lowest dynamic priority. Every task gets at least this slice of the processor
+per array switch.
+
+parent_penalty
+--------------
+
+Percentage of the parent's sleep_avg that it retains across a fork().
+sleep_avg is a running average of the time a process spends sleeping. Tasks
+with high sleep_avg values are considered interactive and given a higher
+dynamic priority and a larger timeslice. Normally, this value is 100 and thus
+tasks retain their sleep_avg on fork. If you want to punish interactive
+tasks for forking, set this below 100.
+
+prio_bonus_ratio
+----------------
+
+Middle percentage of the priority range that tasks can receive as a dynamic
+priority bonus. The default value of 25% ensures that nice values at the
+extremes are still enforced. For example, a nice +19 interactive task will
+never be able to preempt a nice 0 CPU hog. Setting this higher will increase
+the size of the priority range that tasks can receive as a bonus. Setting
+this lower will decrease this range, making the interactivity bonus less
+apparent and user nice values more applicable.
+
+starvation_limit
+----------------
+
+Sufficiently interactive tasks are reinserted into the active array when they
+run out of timeslice. Normally, tasks are inserted into the expired array.
+Reinserting interactive tasks into the active array allows them to remain
+runnable, which is important to interactive performance. This could starve
+expired tasks, however, since the interactive tasks could prevent the array
+switch. To keep the tasks on the expired array from starving for too long,
+starvation_limit is the longest time (in ms) we will let the expired array
+starve at the expense of reinserting interactive tasks back into the active
+array. Higher values here give more preference to running interactive tasks,
+at the expense of expired tasks. Lower values provide more fair scheduling
+behavior, at the expense of interactivity.
+
+idle_node_rebalance_ratio
+-------------------------
+
+On NUMA machines, we normally rebalance within nodes, but we also rebalance
+globally every N idle rebalance ticks, where N = idle_node_rebalance_ratio.
+
+busy_node_rebalance_ratio
+-------------------------
+
+On NUMA machines, we normally rebalance within nodes, but we also rebalance
+globally every N busy rebalance ticks, where N = busy_node_rebalance_ratio.
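+
+All of these tunables are plain files, so they can be read and written with
+the usual shell tools. A minimal example (the values shown are illustrative
+only, not recommendations):
+
+	# cat /proc/sys/sched/min_timeslice
+	10
+	# echo 300 > /proc/sys/sched/max_timeslice
+	# cat /proc/sys/sched/max_timeslice
+	300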
 
 ------------------------------------------------------------------------------
 Summary
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/Documentation/i386/gdb-serial.txt 900-mjb3/Documentation/i386/gdb-serial.txt
--- 000-bk3/Documentation/i386/gdb-serial.txt	Wed Dec 31 16:00:00 1969
+++ 900-mjb3/Documentation/i386/gdb-serial.txt	Sat Mar  8 22:04:46 2003
@@ -0,0 +1,386 @@
+Version
+=======
+
+This version of the gdbstub package was developed and tested on
+kernel version 2.3.48. It will not install on a 2.2 kernel. It may
+not work on earlier versions of 2.3 kernels. It is possible that
+it will continue to work on later versions of 2.3 and then
+versions of 2.4 (I hope).
+
+
+Debugging Setup
+===============
+
+Designate one machine as the "development" machine. This is the
+machine on which you run your compiles and which has your source
+code for the kernel. Designate a second machine as the "target"
+machine. This is the machine that will run your experimental
+kernel.
+
+The two machines will be connected together via a serial line out
+one or the other of the COM ports of the PC. You will need a modem
+eliminator and the appropriate cables.
+
+On the DEVELOPMENT machine you need to apply the patch for the gdb
+hooks. You have probably already done that if you are reading this
+file.
+
+On your DEVELOPMENT machine, go to your kernel source directory and
+do "make menuconfig". Go down to the kernel hacking menu item and
+open it up. Enable the kernel gdb stub code by selecting that item.
+
+Save and exit the menuconfig program. Then do "make clean" and
+"make bzImage" (or whatever target you want to make). This gets
+the kernel compiled with the "-g" option set -- necessary for
+debugging.
+
+You have just built the kernel on your DEVELOPMENT machine that you
+intend to run on your TARGET machine.
+
+To install this new kernel, use the following installation procedure.
+Remember, you are on the DEVELOPMENT machine patching the kernel source
+for the kernel that you intend to run on the TARGET machine.
+
+Copy this kernel to your target machine using your usual procedures.
+I usually arrange to copy development:/usr/src/linux/arch/i386/boot/zImage
+to /vmlinuz on the TARGET machine via a LAN based NFS access. That is,
+I run the cp command on the target and copy from the development machine
+via the LAN. Run lilo on the new kernel on the target machine so that it
+will boot! Then boot the kernel on the target machine.
+
+There is a utility program named "gdbstart" in the
+development:/usr/src/linux/arch/i386/kernel directory.
+You should copy this program over to your target machine, probably into
+/sbin. This utility program is run on the target machine to
+activate the kernel hooks for the debugger. It is invoked as follows:
+
+	gdbstart [-s speed] [-t tty-dev]
+	defaults:  /dev/ttyS0 with speed unmodified by gdbstart
+
+Don't run the program just yet. We'll get to that in a bit.
+
+Decide on which tty port you want the machines to communicate on, then
+cable them up back-to-back using the null modem. COM1 is /dev/ttyS0
+and COM2 is /dev/ttyS1.
+
+On the DEVELOPMENT machine, create a file called .gdbinit in the
+directory /usr/src/linux.
+An example .gdbinit file looks like this:
+
+	define rmt
+	set remotebaud 38400
+	target remote /dev/ttyS0
+	end
+
+Assuming that you added my gdbinit stuff to your .gdbinit, edit .gdbinit
+and find the section that looks like this:
+
+	define rmt
+	set remotebaud 38400
+	target remote /dev/ttyS0
+	end
+
+Change the "target" definition so that it specifies the tty port that
+you intend to use. Change the "remotebaud" definition to match the
+data rate that you are going to use for the com line.
+
+On the TARGET machine I find it helpful to create a shell script file
+named "debug" in the root home directory with the following contents:
+
+	gdbstart -s 38400 -t /dev/ttyS0 <<EOF
+	EOF
+
+This runs the gdbstart program and gives it the carriage return that
+it prompts for. This sets the data rate from the target machine's side.
+
+You are now ready to try it out.
+
+On your TARGET machine, freshly rebooted with your gdbstub-equipped
+kernel, type "debug" in the root home directory. The system will appear
+to hang with some messages on the screen from the debug stub. What
+it is doing is waiting for contact from the development machine.
+
+On your DEVELOPMENT machine, cd /usr/src/linux and enter "gdb vmlinux".
+When gdb gets the symbols loaded and prompts you, enter "rmt" (that's
+the macro from the .gdbinit file that you just edited). If everything
+is working correctly you should see gdb print out a few lines indicating
+that a breakpoint has been taken. It will actually show a line of
+code in the target kernel inside the gdbstub activation code.
+
+The gdb interaction should look something like this:
+
+	linux-dev:/usr/src/linux# gdb vmlinux
+	GDB is free software and you are welcome to distribute copies of it
+	 under certain conditions; type "show copying" to see the conditions.
+	There is absolutely no warranty for GDB; type "show warranty" for details.
+	GDB 4.15.1 (i486-slackware-linux),
+	Copyright 1995 Free Software Foundation, Inc...
+	(gdb) rmt
+	breakpoint () at i386-stub.c:750
+	750     }
+	(gdb)
+
+You can now use whatever gdb commands you like to set breakpoints.
+Enter "continue" to start your target machine executing again. At this
+point the target system will run at full speed until it encounters
+your breakpoint or gets a segment violation in the kernel, or whatever.
+
+
+Triggering gdbstub at Kernel Boot Time
+======================================
+
+The gdbstub patch now has the ability for gdb to connect to the kernel during
+bootup (as opposed to waiting for the system to come all the way up and then
+running the gdbstart program on the target machine). This new functionality
+was added by Scott Foehner at SGI.
+
+To force a kernel that has been compiled with gdbstub to pause during the boot
+process and wait for a connection from gdb, the parameter "gdb" should be
+passed to the kernel. This can be done by typing "gdb" after the name of the
+kernel on the LILO command line. The patch defaults to using ttyS1 at a baud
+rate of 38400. These parameters can be changed with "gdbttyS=<tty number>"
+and "gdbbaud=<baud rate>" on the command line.
+
+Example:
+
+	LILO boot: linux gdb gdbttyS=1 gdbbaud=38400
+
+Note that this command is entered on the TARGET machine as it is booting
+the kernel that was compiled on the DEVELOPMENT machine.
+
+An alternate approach is to place a line in the /etc/lilo.conf file on
+your TARGET machine. Under the heading for the kernel that you intend
+to boot, place a line that looks like this:
+
+	append = "gdb gdbttyS=1 gdbbaud=38400"
+
+This will cause the kernel to enter the gdbstub automatically at boot
+time.
+
+BE SURE to run "lilo" after changing the /etc/lilo.conf file.
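+
+For instance, a complete lilo.conf stanza might look like the following
+(the image path, label, and root device are placeholders -- substitute
+your own):
+
+	image = /boot/vmlinuz-gdb
+		label  = gdb-kernel
+		root   = /dev/hda1
+		append = "gdb gdbttyS=1 gdbbaud=38400"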
+
+
+The "gdbstart" Program
+======================
+
+This utility program is used to set up the com port and data rate
+for the connection from the target system to the development system.
+Its usage has been described above.
+
+This version of the patch uses the same tty ioctl for kernel versions
+2.0.30 onwards. Thus, the gdbstart utility does not need to be re-compiled
+to install the patch in a later version of the kernel. The ioctl added
+to the kernel for this purpose is far enough "off the end" of existing
+ioctls (as of 2.1.120) that it should not interfere with any new kernel
+tty ioctls for quite some time (famous last words).
+
+The source for the gdbstart program resides in the arch/i386/kernel
+directory.
+
+
+Debugging hints
+===============
+
+You can break into the target machine at any time from the development
+machine by typing ^C. If the target machine has interrupts enabled
+this will stop it in the kernel and enter the debugger.
+
+There is unfortunately no way of breaking into the kernel if it is
+in a loop with interrupts disabled, so if this happens to you then
+you need to place exploratory breakpoints or printk's into the kernel
+to find out where it is looping.
+
+There is a copy of an e-mail in the kgdb distribution directory which
+describes how to create an NMI on an ISA bus machine using a paper
+clip. I have a sophisticated version of this made by wiring a push
+button switch into a PC104/ISA bus adapter card. The adapter card
+nicely furnishes wire wrap pins for all the ISA bus signals.
+
+When you are done debugging the kernel on the target machine it is
+a good idea to leave it in a running state. This makes reboots
+faster, bypassing the fsck. So do a gdb "continue" as the last gdb
+command if this is possible. To terminate gdb itself on the development
+machine and leave the target machine running, type ^Z to suspend gdb
+and then kill it with "kill %1" or something similar.
+
+
+If gdbstub Does Not Work
+========================
+
+If it doesn't work, you will have to troubleshoot it. Do the easy things
+first like double checking your cabling and data rates. You might
+try some non-kernel based programs to see if the back-to-back connection
+works properly. Just something simple like cat /etc/hosts >/dev/ttyS0
+on one machine and cat /dev/ttyS0 on the other will tell you if you
+can send data from one machine to the other. There is no point in tearing
+out your hair in the kernel if the line doesn't work.
+
+All of the real action takes place in the file
+/usr/src/linux/arch/i386/kernel/gdbstub.c. That is the code on the target
+machine that interacts with gdb on the development machine. In gdb you can
+turn on a debug switch with the following command:
+
+	set remotedebug
+
+This will print out the protocol messages that gdb is exchanging with
+the target machine.
+
+Another place to look is /usr/src/linux/drivers/char/gdbserial.c.
+That is the code that talks to the serial port on the target side.
+There might be a problem there.
+
+If you are really desperate you can use printk debugging in the
+gdbstub code in the target kernel until you get it working. In particular,
+there is a global variable in /usr/src/linux/arch/i386/kernel/gdbstub.c
+named "remote_debug". Compile your kernel with this set to 1, rather
+than 0, and the debug stub will print out lots of stuff as it does
+what it does.
+
+
+Debugging Loadable Modules
+==========================
+
+This technique comes courtesy of Edouard Parmelan
+
+When you run gdb, enter the command
+
+	source gdbinit-modules
+
+This will read in a file of gdb macros that was installed in your
+kernel source directory when kgdb was installed. This file implements
+the following commands:
+
+mod-list
+	Lists the loaded modules in the form <module-address> <module-name>
+
+mod-print-symbols <module-address>
+	Prints all the symbols in the indicated module.
+
+mod-add-symbols <module-address> <object-file-path-name>
+	Loads the symbols from the object file and associates them
+	with the indicated module.
+
+After you have loaded the module that you want to debug, use the command
+mod-list to find the <module-address> of your module. Then use that
+address in the mod-add-symbols command to load your module's symbols.
+From that point onward you can debug your module as if it were a part
+of the kernel.
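+
+For example, a session might look like this (the module name, address,
+and object path below are purely illustrative):
+
+	(gdb) source gdbinit-modules
+	(gdb) mod-list
+	0xd0a06000 mymod
+	(gdb) mod-add-symbols 0xd0a06000 /usr/src/modules/mymod/mymod.o
+	(gdb) break mymod_open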
+
+The file gdbinit-modules also contains a command named mod-add-lis as
+an example of how to construct a command of your own to load your
+favorite module. The idea is to "can" the pathname of the module
+in the command so you don't have to type so much.
+
+Threads
+=======
+
+Each process in a target machine is seen as a gdb thread. gdb thread-related
+commands (info threads, thread n) can be used.
+
+ia-32 hardware breakpoints
+==========================
+
+The gdb stub contains support for hardware breakpoints using the debugging
+features of ia-32 (x86) processors. These breakpoints do not need code
+modification; they use the debugging registers. 4 hardware breakpoints are
+available in ia-32 processors.
+
+Each hardware breakpoint can be of one of the following three types:
+
+1. Execution breakpoint - An execution breakpoint is triggered when the code
+   at the breakpoint address is executed.
+
+   As a limited number of hardware breakpoints is available, it is advisable
+   to use software breakpoints (the break command) instead of execution
+   hardware breakpoints, unless modification of code is to be avoided.
+
+2. Write breakpoint - A write breakpoint is triggered when the memory
+   location at the breakpoint address is written.
+
+   A write breakpoint can be placed for data of variable length. The length
+   of a write breakpoint indicates the length of the datatype to be watched.
+   Length is 1 for 1-byte data, 2 for 2-byte data, 3 for 4-byte data.
+
+3. Access breakpoint - An access breakpoint is triggered when the memory
+   location at the breakpoint address is either read or written.
+
+   Access breakpoints also have lengths, similar to write breakpoints.
+
+IO breakpoints in ia-32 are not supported.
+
+Since the gdb stub at present does not use the protocol used by gdb for
+hardware breakpoints, hardware breakpoints are accessed through gdb macros.
+The gdb macros for hardware breakpoints are described below.
+
+hwebrk - Places an execution breakpoint
+	hwebrk breakpointno address
+hwwbrk - Places a write breakpoint
+	hwwbrk breakpointno length address
+hwabrk - Places an access breakpoint
+	hwabrk breakpointno length address
+hwrmbrk - Removes a breakpoint
+	hwrmbrk breakpointno
+exinfo - Tells whether a software or hardware breakpoint has occurred.
+	Prints the number of the hardware breakpoint if a hardware
+	breakpoint has occurred.
+
+The arguments required by these commands are as follows:
+breakpointno - 0 to 3
+length       - 1 to 3
+address      - memory location in hex digits (without 0x), e.g. c015e9bc
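+
+A typical session might look like the following -- the address here is only
+an example (take the address of the variable you care about from System.map
+or from gdb itself):
+
+	(gdb) hwwbrk 0 3 c015e9bc
+	(gdb) continue
+	...
+	(gdb) exinfo
+	(gdb) hwrmbrk 0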
+
+MP support
+==========
+
+When a breakpoint occurs, or the user issues a break (Ctrl+C) to the gdb
+client, all the processors are forced to enter the debugger. The current
+thread corresponds to the thread running on the processor where the
+breakpoint occurred. Threads running on the other processor(s) appear
+similar to other non-running threads in the 'info threads' output.
+
+The ia-32 hardware debugging registers on all processors are set to the
+same values. Hence any hardware breakpoint may occur on any processor.
+
+gdb troubleshooting
+===================
+
+1. gdb hangs
+Kill it, restart gdb, and connect to the target machine.
+
+2. gdb cannot connect to the target machine (after killing a gdb and
+restarting another)
+If the target machine was not inside the debugger when you killed gdb,
+gdb cannot connect because the target machine won't respond. In this case
+echo a "Ctrl+C" (ascii 3) into the serial line, e.g.:
+	echo -e "\003" > /dev/ttyS1
+This forces the target machine into the debugger, after which you can
+connect.
+
+3. gdb cannot connect even after echoing Ctrl+C into the serial line
+Try changing the serial line settings min to 1 and time to 0, e.g.:
+	stty min 1 time 0 < /dev/ttyS1
+Try echoing again.
+
+Check the serial line speed and set it to the correct value if required,
+e.g.:
+	stty ispeed 115200 ospeed 115200 < /dev/ttyS1
+
+Final Items
+===========
+
+I picked up this code from Dave Grothe and enhanced it.
+
+If you make some really cool modification to this stuff, or if you
+fix a bug, please let me know.
+
+Amit S. Kale
+
+(First kgdb by David Grothe)
+
+(modified by Tigran Aivazian)
+	Putting gdbstub into the kernel config menu.
+
+(modified by Scott Foehner)
+	Hooks for entering gdbstub at boot time.
+
+(modified by Amit S. Kale)
+	Threads, ia-32 hw debugging, mp support, console support,
+	nmi watchdog handling.
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/Documentation/sysrq.txt 900-mjb3/Documentation/sysrq.txt
--- 000-bk3/Documentation/sysrq.txt	Tue Feb 25 23:03:43 2003
+++ 900-mjb3/Documentation/sysrq.txt	Sat Mar  8 22:04:46 2003
@@ -73,6 +73,8 @@ On other - If you know of the key combos
 'l'     - Send a SIGKILL to all processes, INCLUDING init. (Your system
           will be non-functional after this.)
 
+'g'     - Enter the kernel debugger (if configured and supported).
+
 'h'     - Will display help ( actually any other key than those
           listed above will display help.
but 'h' is easy to remember :-) diff -urpN -X /home/fletch/.diff.exclude 000-bk3/Makefile 900-mjb3/Makefile --- 000-bk3/Makefile Sat Mar 8 10:16:02 2003 +++ 900-mjb3/Makefile Wed Mar 12 19:47:55 2003 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 5 SUBLEVEL = 64 -EXTRAVERSION = bk3 +EXTRAVERSION = -bk3-mjb3 # *DOCUMENTATION* # To see a list of typical targets execute "make help" @@ -281,6 +281,10 @@ ifneq ($(KBUILD_BUILTIN),1) KBUILD_BUILTIN := 1 endif endif +endif + +ifdef CONFIG_X86_REMOTE_DEBUG +CFLAGS += -g endif # diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/Kconfig 900-mjb3/arch/i386/Kconfig --- 000-bk3/arch/i386/Kconfig Wed Mar 5 07:36:57 2003 +++ 900-mjb3/arch/i386/Kconfig Wed Mar 12 19:47:37 2003 @@ -382,6 +382,11 @@ config X86_SSE2 depends on MK8 || MPENTIUM4 default y +config X86_CMOV + bool + depends on M686 || MPENTIUMII || MPENTIUMIII || MPENTIUM4 || MK8 || MCRUSOE + default y + config HUGETLB_PAGE bool "Huge TLB Page Support" help @@ -488,7 +493,7 @@ config NR_CPUS # Common NUMA Features config NUMA bool "Numa Memory Allocation Support" - depends on (HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT && ACPI && !ACPI_HT_ONLY))) + depends on (HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT && ACPI && !ACPI_HT_ONLY))) || X86_PC config DISCONTIGMEM bool @@ -679,6 +684,44 @@ config HIGHMEM64G endchoice +choice + help + On i386, a process can only virtually address 4GB of memory. This + lets you select how much of that virtual space you would like to + devoted to userspace, and how much to the kernel. + + Some userspace programs would like to address as much as possible and + have few demands of the kernel other than it get out of the way. These + users may opt to use the 3.5GB option to give their userspace program + as much room as possible. Due to alignment issues imposed by PAE, + the "3.5GB" option is unavailable if "64GB" high memory support is + enabled. + + Other users (especially those who use PAE) may be running out of + ZONE_NORMAL memory. Those users may benefit from increasing the + kernel's virtual address space size by taking it away from userspace, + which may not need all of its space. An indicator that this is + happening is when /proc/Meminfo's "LowFree:" is a small percentage of + "LowTotal:" while "HighFree:" is very large. + + If unsure, say "3GB" + prompt "User address space size" + default 1GB + +config 05GB + bool "3.5 GB" + depends on !HIGHMEM64G + +config 1GB + bool "3 GB" + +config 2GB + bool "2 GB" + +config 3GB + bool "1 GB" +endchoice + config HIGHMEM bool depends on HIGHMEM64G || HIGHMEM4G @@ -698,6 +741,16 @@ config HIGHPTE low memory. Setting this option will put user-space page table entries in high memory. +config 4K_STACK + bool "Use smaller 4k per-task stacks" + help + This option will shrink the kernel's per-task stack from 8k to + 4k. This will greatly increase your chance of overflowing it. + But, if you use the per-cpu interrupt stacks as well, your chances + go way down. Also try the CONFIG_X86_STACK_CHECK overflow + detection. It is much more reliable than the currently in-kernel + version. + config MATH_EMULATION bool "Math emulation" ---help--- @@ -757,6 +810,25 @@ config MTRR See for more information. +choice + help + This is unrelated to your processor's speed. This variable alters + how often the system is asked to generate timer interrupts. A larger + value can lead to a more responsive system, but also causes extra + overhead from the increased number of context switches. + + If in doubt, leave it at the default of 1000. 
+ + prompt "Kernel HZ" + default 1000HZ + +config 100HZ + bool "100 Hz" + +config 1000HZ + bool "1000 Hz" +endchoice + config HAVE_DEC_LOCK bool depends on (SMP || PREEMPT) && X86_CMPXCHG @@ -1445,12 +1517,54 @@ source "arch/i386/oprofile/Kconfig" menu "Kernel hacking" +config CRASH_DUMP + tristate "Crash dump support (EXPERIMENTAL)" + depends on EXPERIMENTAL + default n + ---help--- + Say Y here to enable saving an image of system memory when a panic + or other error occurs. Dumps can also be forced with the SysRq+d + key if MAGIC_SYSRQ is enabled. + +config CRASH_DUMP_BLOCKDEV + tristate "Crash dump block device driver" + depends on CRASH_DUMP + help + Say Y to allow saving crash dumps directly to a disk device. + +config CRASH_DUMP_NETDEV + tristate "Crash dump network device driver" + depends on CRASH_DUMP + help + Say Y to allow saving crash dumps over a network device. + +config CRASH_DUMP_COMPRESS_RLE + tristate "Crash dump RLE compression" + depends on CRASH_DUMP + help + Say Y to allow saving dumps with Run Length Encoding compression. + +config CRASH_DUMP_COMPRESS_GZIP + tristate "Crash dump GZIP compression" + depends on CRASH_DUMP + help + Say Y to allow saving dumps with Gnu Zip compression. + config DEBUG_KERNEL bool "Kernel debugging" help Say Y here if you are developing drivers or trying to debug and identify kernel problems. +config KPROBES + bool "Kprobes" + depends on DEBUG_KERNEL + help + Kprobes allows you to trap at almost any kernel address, using + register_kprobe(), and providing a callback function. This is useful + for kernel debugging, non-intrusive instrumentation and testing. If + in doubt, say "N". + config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL @@ -1463,6 +1577,17 @@ config DEBUG_SLAB allocation as well as poisoning memory on free to catch use of freed memory. +config X86_REMOTE_DEBUG + bool "KGDB: Remote (serial) kernel debugging with gdb" + +config KGDB_THREAD + bool "KGDB: Thread analysis" + depends on X86_REMOTE_DEBUG + +config GDB_CONSOLE + bool "KGDB: Console messages through gdb" + depends on X86_REMOTE_DEBUG + config DEBUG_IOVIRT bool "Memory mapped I/O debugging" depends on DEBUG_KERNEL @@ -1488,6 +1613,26 @@ config MAGIC_SYSRQ keys are documented in . Don't say Y unless you really know what this hack does. +config EARLY_PRINTK + bool "Early console support" + default n + depends on DEBUG_KERNEL + help + Write kernel log output directly into the VGA buffer or serial port. + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. For normal operation + it is not recommended because it looks ugly and doesn't cooperate + with klogd/syslogd or the X server.You should normally N here, + unless you want to debug such a crash. + + Syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. + config DEBUG_SPINLOCK bool "Spinlock debugging" depends on DEBUG_KERNEL @@ -1497,6 +1642,15 @@ config DEBUG_SPINLOCK best used in conjunction with the NMI watchdog so that spinlock deadlocks are also debuggable. 
+config SPINLINE + bool "Spinlock inlining" + depends on DEBUG_KERNEL + help + This will change spinlocks from out of line to inline, making them + account cost to the callers in readprofile, rather than the lock + itself (as ".text.lock.filename"). This can be helpful for finding + the callers of locks. + config DEBUG_HIGHMEM bool "Highmem debugging" depends on DEBUG_KERNEL && HIGHMEM @@ -1518,12 +1672,33 @@ config DEBUG_SPINLOCK_SLEEP noisy if they are called with a spinlock held. config FRAME_POINTER - bool "Compile the kernel with frame pointers" + bool + default y if X86_REMOTE_DEBUG + default n if !X86_REMOTE_DEBUG help If you say Y here the resulting kernel image will be slightly larger and slower, but it will give very useful debugging information. If you don't debug the kernel, you can say N, but we may not be able to solve problems without frame pointers. + +config X86_STACK_CHECK + bool "Detect stack overflows" + depends on FRAME_POINTER + help + Say Y here to have the kernel attempt to detect when the per-task + kernel stack overflows. This is much more robust checking than + the above overflow check, which will only occasionally detect + an overflow. The level of guarantee here is much greater. + + Some older versions of gcc don't handle the -p option correctly. + Kernprof is affected by the same problem, which is described here: + http://oss.sgi.com/projects/kernprof/faq.html#Q9 + + Basically, if you get oopses in __free_pages_ok during boot when + you have this turned on, you need to fix gcc. The Redhat 2.96 + version and gcc-3.x seem to work. + + If not debugging a stack overflow problem, say N config X86_EXTRA_IRQS bool diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/Makefile 900-mjb3/arch/i386/Makefile --- 000-bk3/arch/i386/Makefile Sat Mar 8 10:16:03 2003 +++ 900-mjb3/arch/i386/Makefile Sat Mar 8 22:05:04 2003 @@ -76,6 +76,10 @@ mcore-$(CONFIG_X86_SUMMIT) := mach-defa # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default +ifdef CONFIG_X86_STACK_CHECK +CFLAGS += -p +endif + head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o libs-y += arch/i386/lib/ @@ -89,6 +93,7 @@ drivers-$(CONFIG_OPROFILE) += arch/i386 CFLAGS += $(mflags-y) AFLAGS += $(mflags-y) +AFLAGS_vmlinux.lds.o += -imacros $(TOPDIR)/include/asm-i386/page.h boot := arch/i386/boot diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/boot/Makefile 900-mjb3/arch/i386/boot/Makefile --- 000-bk3/arch/i386/boot/Makefile Sat Mar 8 10:16:03 2003 +++ 900-mjb3/arch/i386/boot/Makefile Sat Mar 8 22:05:10 2003 @@ -102,3 +102,4 @@ zlilo: $(BOOTIMAGE) install: $(BOOTIMAGE) sh $(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)" + if [ -f init/kerntypes.o ]; then cp init/kerntypes.o $(INSTALL_PATH)/Kerntypes; fi diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/boot/compressed/misc.c 900-mjb3/arch/i386/boot/compressed/misc.c --- 000-bk3/arch/i386/boot/compressed/misc.c Thu Jan 2 22:04:58 2003 +++ 900-mjb3/arch/i386/boot/compressed/misc.c Sat Mar 8 22:05:04 2003 @@ -377,3 +377,7 @@ asmlinkage int decompress_kernel(struct if (high_loaded) close_output_buffer_if_we_run_high(mv); return high_loaded; } + +/* We don't actually check for stack overflows this early. 
*/ +__asm__(".globl mcount ; mcount: ret\n"); + diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/Makefile 900-mjb3/arch/i386/kernel/Makefile --- 000-bk3/arch/i386/kernel/Makefile Wed Mar 5 07:36:57 2003 +++ 900-mjb3/arch/i386/kernel/Makefile Sat Mar 8 22:04:56 2003 @@ -17,6 +17,7 @@ obj-$(CONFIG_MCA) += mca.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_X86_REMOTE_DEBUG) += gdbstub.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o @@ -26,9 +27,18 @@ obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o suspend_asm.o obj-$(CONFIG_X86_NUMAQ) += numaq.o obj-$(CONFIG_EDD) += edd.o +obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_MODULES) += module.o obj-y += sysenter.o obj-$(CONFIG_ACPI_SRAT) += srat.o + +ifdef CONFIG_X86_REMOTE_DEBUG +GDBSTART=gdbstart +GDBCLEAN= -rm -f gdbstart /sbin/gdbstart +else +GDBSTART= +GDBCLEAN= +endif EXTRA_AFLAGS := -traditional diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/apic.c 900-mjb3/arch/i386/kernel/apic.c --- 000-bk3/arch/i386/kernel/apic.c Wed Mar 5 07:36:57 2003 +++ 900-mjb3/arch/i386/kernel/apic.c Sat Mar 8 22:05:03 2003 @@ -1053,7 +1053,8 @@ inline void smp_local_timer_interrupt(st * interrupt as well. Thus we cannot inline the local irq ... ] */ -void smp_apic_timer_interrupt(struct pt_regs regs) +struct pt_regs * IRQHANDLER(smp_apic_timer_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_apic_timer_interrupt(struct pt_regs* regs) { int cpu = smp_processor_id(); @@ -1073,14 +1074,16 @@ void smp_apic_timer_interrupt(struct pt_ * interrupt lock, which is the WrongThing (tm) to do. */ irq_enter(); - smp_local_timer_interrupt(®s); + smp_local_timer_interrupt(regs); irq_exit(); + return regs; } /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -asmlinkage void smp_spurious_interrupt(void) +struct pt_regs * IRQHANDLER(smp_spurious_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_spurious_interrupt(struct pt_regs* regs) { unsigned long v; @@ -1098,13 +1101,15 @@ asmlinkage void smp_spurious_interrupt(v printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", smp_processor_id()); irq_exit(); + return regs; } /* * This interrupt should never happen with our APIC/SMP architecture */ -asmlinkage void smp_error_interrupt(void) +struct pt_regs * IRQHANDLER(smp_error_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_error_interrupt(struct pt_regs* regs) { unsigned long v, v1; @@ -1129,6 +1134,7 @@ asmlinkage void smp_error_interrupt(void printk (KERN_INFO "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); irq_exit(); + return regs; } /* diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/cpu/mcheck/p4.c 900-mjb3/arch/i386/kernel/cpu/mcheck/p4.c --- 000-bk3/arch/i386/kernel/cpu/mcheck/p4.c Thu Jan 2 22:04:58 2003 +++ 900-mjb3/arch/i386/kernel/cpu/mcheck/p4.c Sat Mar 8 22:05:03 2003 @@ -61,11 +61,13 @@ static void intel_thermal_interrupt(stru /* Thermal interrupt handler for this CPU setup */ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; -asmlinkage void smp_thermal_interrupt(struct pt_regs regs) +struct pt_regs * IRQHANDLER(smp_thermal_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_thermal_interrupt(struct pt_regs* regs) { irq_enter(); vendor_thermal_interrupt(®s); irq_exit(); + return regs; } /* P4/Xeon Thermal 
regulation detect and init */ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/entry.S 900-mjb3/arch/i386/kernel/entry.S --- 000-bk3/arch/i386/kernel/entry.S Wed Mar 5 07:36:57 2003 +++ 900-mjb3/arch/i386/kernel/entry.S Sat Mar 8 22:05:04 2003 @@ -154,7 +154,7 @@ do_lcall: movl %eax,EFLAGS(%ebp) # movl %edx,EIP(%ebp) # Now we move them to their "normal" places movl %ecx,CS(%ebp) # - andl $-8192, %ebp # GET_THREAD_INFO + GET_THREAD_INFO_WITH_ESP(%ebp) # GET_THREAD_INFO movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution domain call *4(%edx) # Call the lcall7 handler for the domain addl $4, %esp @@ -218,7 +218,7 @@ need_resched: jz restore_all movl $PREEMPT_ACTIVE,TI_PRE_COUNT(%ebp) sti - call schedule + call user_schedule movl $0,TI_PRE_COUNT(%ebp) cli jmp need_resched @@ -298,7 +298,7 @@ work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: - call schedule + call user_schedule cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret @@ -386,17 +386,78 @@ ENTRY(irq_entries_start) vector=vector+1 .endr + +# lets play optimizing compiler... +#ifdef CONFIG_X86_CMOV +#define COND_MOVE cmovnz %esi,%esp; +#else +#define COND_MOVE \ + jz 1f; \ + mov %esi,%esp; \ +1: +#endif + +# These macros will switch you to, and from a per-cpu interrupt stack +# They take the pt_regs arg and move it from the normal place on the +# stack to %eax. Any handler function can retrieve it using regparm(1). +# The handlers are expected to return the stack to switch back to in +# the same register. +# +# This means that the irq handlers need to return their arg +# +# SWITCH_TO_IRQSTACK clobbers %ebx, %ecx, %edx, %esi +# old stack gets put in %eax + +.macro SWITCH_TO_IRQSTACK + GET_THREAD_INFO(%ebx); + movl TI_IRQ_STACK(%ebx),%ecx; + movl TI_TASK(%ebx),%edx; + movl %esp,%eax; + + # %ecx+THREAD_SIZE is next stack -4 keeps us in the right one + leal (THREAD_SIZE-4)(%ecx),%esi; + + # is there a valid irq_stack? + testl %ecx,%ecx; + COND_MOVE; + + # update the task pointer in the irq stack + GET_THREAD_INFO(%esi); + movl %edx,TI_TASK(%esi); + + # update the preempt count in the irq stack + movl TI_PRE_COUNT(%ebx),%ecx; + movl %ecx,TI_PRE_COUNT(%esi); +.endm + +# copy flags from the irq stack back into the task's thread_info +# %esi is saved over the irq handler call and contains the irq stack's +# thread_info pointer +# %eax was returned from the handler, as described above +# %ebx contains the original thread_info pointer + +.macro RESTORE_FROM_IRQSTACK + movl %eax,%esp; + movl TI_FLAGS(%esi),%eax; + movl $0,TI_FLAGS(%esi); + LOCK orl %eax,TI_FLAGS(%ebx); +.endm + ALIGN common_interrupt: SAVE_ALL + SWITCH_TO_IRQSTACK call do_IRQ + RESTORE_FROM_IRQSTACK jmp ret_from_intr #define BUILD_INTERRUPT(name, nr) \ ENTRY(name) \ pushl $nr-256; \ SAVE_ALL \ - call smp_/**/name; \ + SWITCH_TO_IRQSTACK; \ + call smp_/**/name; \ + RESTORE_FROM_IRQSTACK; \ jmp ret_from_intr; /* The include is where all of the SMP etc. 
interrupts come from */ @@ -459,9 +520,16 @@ device_not_available_emulate: jmp ret_from_exception ENTRY(debug) + pushl $-1 # mark this as an int + SAVE_ALL + movl %esp,%edx pushl $0 - pushl $do_debug - jmp error_code + pushl %edx + call do_debug + addl $8,%esp + testl %eax,%eax + jnz restore_all + jmp ret_from_exception ENTRY(nmi) pushl %eax @@ -474,9 +542,16 @@ ENTRY(nmi) RESTORE_ALL ENTRY(int3) + pushl $-1 # mark this as an int + SAVE_ALL + movl %esp,%edx pushl $0 - pushl $do_int3 - jmp error_code + pushl %edx + call do_int3 + addl $8,%esp + testl %eax,%eax + jnz restore_all + jmp ret_from_exception ENTRY(overflow) pushl $0 @@ -502,6 +577,31 @@ ENTRY(invalid_TSS) pushl $do_invalid_TSS jmp error_code +#ifdef CONFIG_KGDB_THREAD +ENTRY(kern_schedule) + pushl %ebp + movl %esp, %ebp + pushl %ss + pushl %ebp + pushfl + pushl %cs + pushl 4(%ebp) + pushl %eax + pushl %es + pushl %ds + pushl %eax + pushl (%ebp) + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + call kern_do_schedule + movl %ebp, %esp + pop %ebp + ret +#endif + ENTRY(segment_not_present) pushl $do_segment_not_present jmp error_code @@ -533,6 +633,61 @@ ENTRY(spurious_interrupt_bug) pushl $0 pushl $do_spurious_interrupt_bug jmp error_code + + +#ifdef CONFIG_X86_STACK_CHECK +.data + .globl stack_overflowed +stack_overflowed: + .long 0 +.text + +ENTRY(mcount) + push %eax + movl $(THREAD_SIZE - 1),%eax + andl %esp,%eax + cmpl $STACK_WARN,%eax /* more than half the stack is used*/ + jle 1f +2: + popl %eax + ret +1: + lock; btsl $0,stack_overflowed + jc 2b + + # switch to overflow stack + movl %esp,%eax + movl $(stack_overflow_stack + THREAD_SIZE - 4),%esp + + pushf + cli + pushl %eax + + # push eip then esp of error for stack_overflow_panic + pushl 4(%eax) + pushl %eax + + # update the task pointer and cpu in the overflow stack's thread_info. + GET_THREAD_INFO_WITH_ESP(%eax) + movl TI_TASK(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_TASK + movl TI_CPU(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_CPU + + call stack_overflow + + # pop off call arguments + addl $8,%esp + + popl %eax + popf + movl %eax,%esp + popl %eax + movl $0,stack_overflowed + ret + +#warning stack check enabled +#endif .data ENTRY(sys_call_table) diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/gdbstart.c 900-mjb3/arch/i386/kernel/gdbstart.c --- 000-bk3/arch/i386/kernel/gdbstart.c Wed Dec 31 16:00:00 1969 +++ 900-mjb3/arch/i386/kernel/gdbstart.c Sat Mar 8 22:04:47 2003 @@ -0,0 +1,147 @@ +/* + * This program opens a tty file and issues the GDB stub activating + * ioctl on it. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +char *tty_name = "/dev/ttyS0" ; /* COM1 port */ +int speed = 9600 ; /* default speed */ +struct termios save_ts ; /* original term struct */ + +void print_usage(void) +{ + printf("gdbstub [-s speed] [-t tty-dev]\n") ; + printf(" defaults: /dev/ttyS0 with speed unmodified by this program\n"); + +} /* print_usage */ + +void tty_err(char *msg) +{ + char buf[100] ; + + strcpy(buf, msg) ; + strcat(buf, ": ") ; + strcat(buf, tty_name) ; + perror(buf) ; + exit(1) ; + +} /* tty_err */ + + +void setup_term(int fd) +{ + struct termios ts ; + int speed_code ; + + if (tcgetattr(fd, &ts) < 0) tty_err("tcgetattr") ; + + save_ts = ts ; + switch (speed) + { + case 4800: + speed_code = B4800 ; + break ; + case 9600: + speed_code = B9600 ; + break ; + case 19200: + speed_code = B19200 ; + break ; + case 38400: + speed_code = B38400 ; + break ; + case 57600: + speed_code = B57600 ; + break ; + case 115200: + speed_code = B115200 ; + break ; + case 230400: + speed_code = B230400 ; + break ; + default: + printf("Invalid speed: %d\n", speed) ; + exit(1) ; + } + + ts.c_cflag = CS8 | CREAD | CLOCAL ; + if (cfsetospeed(&ts, speed_code) < 0) tty_err("cfsetospeed") ; + if (cfsetispeed(&ts, speed_code) < 0) tty_err("cfsetispeed") ; + + if (tcsetattr(fd, TCSANOW, &ts) < 0) tty_err("tcsetattr") ; + +} /* setup_term */ + +int main(int argc, char **argv) +{ + int opt ; + int fil ; + int rslt ; + + while ((opt = getopt(argc, argv, "hs:t:")) > 0) + { + switch (opt) + { + case 's': + speed = atol(optarg) ; + break ; + case 't': + tty_name = optarg ; + break ; + case ':': + printf("Invalid option\n") ; + break ; + case '?': + case 'h': + default: + print_usage() ; + return 1; + } + } + + fil = open(tty_name, O_RDWR) ; + if (fil < 0) + { + perror(tty_name) ; + return 1; + } + + + setup_term(fil) ; + + /* + * When we issue this ioctl, control will not return until + * the debugger running on the remote host machine says "go". + */ + printf("\nAbout to activate GDB stub in the kernel on %s\n", tty_name) ; + printf("Hit CR to continue, kill program to abort -- ") ; + getchar() ; + sync() ; + rslt = ioctl(fil, TIOCGDB, 0) ; + if (rslt < 0) + { + perror("TIOCGDB ioctl") ; + return 1; + } + + printf("\nGDB stub successfully activated\n") ; + + for (;;) + { + pause() ; + } + + if (tcsetattr(fil, TCSANOW, &save_ts) < 0) tty_err("tcsetattr") ; + + exit(0); +} /* main */ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/gdbstub.c 900-mjb3/arch/i386/kernel/gdbstub.c --- 000-bk3/arch/i386/kernel/gdbstub.c Wed Dec 31 16:00:00 1969 +++ 900-mjb3/arch/i386/kernel/gdbstub.c Sat Mar 8 22:04:47 2003 @@ -0,0 +1,1208 @@ +/* + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +/* + * Copyright (C) 2000-2001 VERITAS Software Corporation. 
+ */ +/**************************************************************************** + * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ + * + * Module name: remcom.c $ + * Revision: 1.34 $ + * Date: 91/03/09 12:29:49 $ + * Contributor: Lake Stevens Instrument Division$ + * + * Description: low level support for gdb debugger. $ + * + * Considerations: only works on target hardware $ + * + * Written by: Glenn Engel $ + * Updated by: Amit Kale + * ModuleState: Experimental $ + * + * NOTES: See Below $ + * + * Modified for 386 by Jim Kingdon, Cygnus Support. + * Origianl kgdb, compatibility with 2.1.xx kernel by David Grothe + * Integrated into 2.2.5 kernel by Tigran Aivazian + * thread support, + * support for multiple processors, + * support for ia-32(x86) hardware debugging, + * Console support, + * handling nmi watchdog + * Amit S. Kale ( akale@veritas.com ) + * + * + * To enable debugger support, two things need to happen. One, a + * call to set_debug_traps() is necessary in order to allow any breakpoints + * or error conditions to be properly intercepted and reported to gdb. + * Two, a breakpoint needs to be generated to begin communication. This + * is most easily accomplished by a call to breakpoint(). Breakpoint() + * simulates a breakpoint by executing an int 3. + * + ************* + * + * The following gdb commands are supported: + * + * command function Return value + * + * g return the value of the CPU registers hex data or ENN + * G set the value of the CPU registers OK or ENN + * + * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN + * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN + * + * c Resume at current address SNN ( signal NN) + * cAA..AA Continue at address AA..AA SNN + * + * s Step one instruction SNN + * sAA..AA Step one instruction from AA..AA SNN + * + * k kill + * + * ? What was the last sigval ? SNN (signal NN) + * + * All commands and responses are sent with a packet which includes a + * checksum. A packet consists of + * + * $#. + * + * where + * :: + * :: < two hex digits computed as modulo 256 sum of > + * + * When a packet is received, it is first acknowledged with either '+' or '-'. + * '+' indicates a successful transfer. '-' indicates a failed transfer. + * + * Example: + * + * Host: Reply: + * $m0,10#2a +$00010203040506070809101112131415#42 + * + ****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for linux pt_regs struct */ +#include +#ifdef CONFIG_GDB_CONSOLE +#include +#endif +#include + +/************************************************************************ + * + * external low-level support routines + */ +typedef void (*Function) (void); /* pointer to a function */ + +/* Thread reference */ +typedef unsigned char threadref[8]; + +extern int putDebugChar(int); /* write a single character */ +extern int getDebugChar(void); /* read and return a single char */ + +extern int pid_max; + +/************************************************************************/ +/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ +/* at least NUMREGBYTES*2 are needed for register packets */ +/* Longer buffer is needed to list all threads */ +#define BUFMAX 1024 + +static char initialized; /* boolean flag. != 0 means we've been initialized */ + +static const char hexchars[] = "0123456789abcdef"; + +/* Number of bytes of registers. 
*/ +#define NUMREGBYTES 64 +/* + * Note that this register image is in a different order than + * the register image that Linux produces at interrupt time. + * + * Linux's register image is defined by struct pt_regs in ptrace.h. + * Just why GDB uses a different order is a historical mystery. + */ +enum regnames { _EAX, /* 0 */ + _ECX, /* 1 */ + _EDX, /* 2 */ + _EBX, /* 3 */ + _ESP, /* 4 */ + _EBP, /* 5 */ + _ESI, /* 6 */ + _EDI, /* 7 */ + _PC /* 8 also known as eip */ , + _PS /* 9 also known as eflags */ , + _CS, /* 10 */ + _SS, /* 11 */ + _DS, /* 12 */ + _ES, /* 13 */ + _FS, /* 14 */ + _GS +}; /* 15 */ + +/*************************** ASSEMBLY CODE MACROS *************************/ +/* */ + +#define BREAKPOINT() asm(" int $3"); + +/* Put the error code here just in case the user cares. */ +int gdb_i386errcode; +/* Likewise, the vector number here (since GDB only gets the signal + number through the usual means, and that's not very specific). */ +int gdb_i386vector = -1; + +static spinlock_t slavecpulocks[KGDB_MAX_NO_CPUS]; +volatile int procindebug[KGDB_MAX_NO_CPUS]; + +#ifdef CONFIG_SMP +spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +spinlock_t kgdb_nmispinlock = SPIN_LOCK_UNLOCKED; +#else +unsigned kgdb_spinlock = 0; +unsigned kgdb_nmispinlock = 0; +#endif + +static void +kgdb_usercode(void) +{ +} + +int +hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return (ch - 'a' + 10); + if ((ch >= '0') && (ch <= '9')) + return (ch - '0'); + if ((ch >= 'A') && (ch <= 'F')) + return (ch - 'A' + 10); + return (-1); +} + +/* scan for the sequence $# */ +void +getpacket(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int i; + int count; + char ch; + + do { + /* wait around for the start character, ignore all other characters */ + while ((ch = (getDebugChar() & 0x7f)) != '$') ; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* now, read until a # or end of buffer is found */ + while (count < BUFMAX) { + ch = getDebugChar() & 0x7f; + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(getDebugChar() & 0x7f) << 4; + xmitcsum += hex(getDebugChar() & 0x7f); + + if (checksum != xmitcsum) + putDebugChar('-'); /* failed checksum */ + else { + putDebugChar('+'); /* successful transfer */ + /* if a sequence char is present, reply the sequence ID */ + if (buffer[2] == ':') { + putDebugChar(buffer[0]); + putDebugChar(buffer[1]); + /* remove sequence chars from buffer */ + count = strlen(buffer); + for (i = 3; i <= count; i++) + buffer[i - 3] = buffer[i]; + } + } + } + } while (checksum != xmitcsum); + +} + +/* send the packet in buffer. */ + +void +putpacket(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* $#. 
*/ + do { + putDebugChar('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + if (!putDebugChar(ch)) + return; + checksum += ch; + count += 1; + } + + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + + } while ((getDebugChar() & 0x7f) != '+'); + +} + +static char remcomInBuffer[BUFMAX]; +static char remcomOutBuffer[BUFMAX]; +static short error; + +static void +regs_to_gdb_regs(int *gdb_regs, struct pt_regs *regs) +{ + gdb_regs[_EAX] = regs->eax; + gdb_regs[_EBX] = regs->ebx; + gdb_regs[_ECX] = regs->ecx; + gdb_regs[_EDX] = regs->edx; + gdb_regs[_ESI] = regs->esi; + gdb_regs[_EDI] = regs->edi; + gdb_regs[_EBP] = regs->ebp; + gdb_regs[_DS] = regs->xds; + gdb_regs[_ES] = regs->xes; + gdb_regs[_PS] = regs->eflags; + gdb_regs[_CS] = regs->xcs; + gdb_regs[_PC] = regs->eip; + gdb_regs[_ESP] = (int) (®s->esp); + gdb_regs[_SS] = __KERNEL_DS; + gdb_regs[_FS] = 0xFFFF; + gdb_regs[_GS] = 0xFFFF; +} /* regs_to_gdb_regs */ + +static void +gdb_regs_to_regs(int *gdb_regs, struct pt_regs *regs) +{ + regs->eax = gdb_regs[_EAX]; + regs->ebx = gdb_regs[_EBX]; + regs->ecx = gdb_regs[_ECX]; + regs->edx = gdb_regs[_EDX]; + regs->esi = gdb_regs[_ESI]; + regs->edi = gdb_regs[_EDI]; + regs->ebp = gdb_regs[_EBP]; + regs->xds = gdb_regs[_DS]; + regs->xes = gdb_regs[_ES]; + regs->eflags = gdb_regs[_PS]; + regs->xcs = gdb_regs[_CS]; + regs->eip = gdb_regs[_PC]; +#if 0 /* can't change these */ + regs->esp = gdb_regs[_ESP]; + regs->xss = gdb_regs[_SS]; + regs->fs = gdb_regs[_FS]; + regs->gs = gdb_regs[_GS]; +#endif + +} /* gdb_regs_to_regs */ + +/* Indicate to caller of mem2hex or hex2mem that there has been an + error. */ +static volatile int kgdb_memerr = 0; +volatile int kgdb_memerr_expected = 0; +static volatile int kgdb_memerr_cnt = 0; +static int garbage_loc = -1; + +int +get_char(char *addr) +{ + return *addr; +} + +void +set_char(char *addr, int val) +{ + *addr = val; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* return a pointer to the last char put in buf (null) */ +/* If MAY_FAULT is non-zero, then we should set kgdb_memerr in response to + a fault; if zero treat a fault like any other fault in the stub. 
*/ +char * +mem2hex(char *mem, char *buf, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + } + for (i = 0; i < count; i++) { + + ch = get_char(mem++); + + if (may_fault && kgdb_memerr) { + *buf = 0; /* truncate buffer */ + return (buf); + } + *buf++ = hexchars[ch >> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (may_fault) + kgdb_memerr_expected = 0; + return (buf); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return a pointer to the character AFTER the last byte written */ +char * +hex2mem(char *buf, char *mem, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + } + for (i = 0; i < count; i++) { + ch = hex(*buf++) << 4; + ch = ch + hex(*buf++); + set_char(mem++, ch); + + if (may_fault && kgdb_memerr) { + return (mem); + } + } + if (may_fault) + kgdb_memerr_expected = 0; + return (mem); +} + +/**********************************************/ +/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ +/* RETURN NUMBER OF CHARS PROCESSED */ +/**********************************************/ +int +hexToInt(char **ptr, int *intValue) +{ + int numChars = 0; + int hexValue; + + *intValue = 0; + + while (**ptr) { + hexValue = hex(**ptr); + if (hexValue >= 0) { + *intValue = (*intValue << 4) | hexValue; + numChars++; + } else + break; + + (*ptr)++; + } + + return (numChars); +} + +#ifdef CONFIG_KGDB_THREAD +static int +stubhex(int ch) +{ + if (ch >= 'a' && ch <= 'f') + return ch - 'a' + 10; + if (ch >= '0' && ch <= '9') + return ch - '0'; + if (ch >= 'A' && ch <= 'F') + return ch - 'A' + 10; + return -1; +} + +static int +stub_unpack_int(char *buff, int fieldlength) +{ + int nibble; + int retval = 0; + + while (fieldlength) { + nibble = stubhex(*buff++); + retval |= nibble; + fieldlength--; + if (fieldlength) + retval = retval << 4; + } + return retval; +} +#endif + +static char * +pack_hex_byte(char *pkt, int byte) +{ + *pkt++ = hexchars[(byte >> 4) & 0xf]; + *pkt++ = hexchars[(byte & 0xf)]; + return pkt; +} + +#define BUF_THREAD_ID_SIZE 16 + +#ifdef CONFIG_KGDB_THREAD +static char * +pack_threadid(char *pkt, threadref * id) +{ + char *limit; + unsigned char *altid; + + altid = (unsigned char *) id; + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *altid++); + return pkt; +} + +static char * +unpack_byte(char *buf, int *value) +{ + *value = stub_unpack_int(buf, 2); + return buf + 2; +} + +static char * +unpack_threadid(char *inbuf, threadref * id) +{ + char *altref; + char *limit = inbuf + BUF_THREAD_ID_SIZE; + int x, y; + + altref = (char *) id; + + while (inbuf < limit) { + x = stubhex(*inbuf++); + y = stubhex(*inbuf++); + *altref++ = (x << 4) | y; + } + return inbuf; +} +#endif + +void +int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; + } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} + +#ifdef CONFIG_KGDB_THREAD +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + unsigned char *scan; + + scan = (char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} + +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } +#if 0 + 
	thread = init_tasks[0];
+	do {
+		if (thread->pid == pid) {
+			return thread;
+		}
+		thread = thread->next_task;
+	} while (thread != init_tasks[0]);
+#endif
+	return NULL;
+}
+#endif
+
+struct hw_breakpoint {
+	unsigned enabled;
+	unsigned type;
+	unsigned len;
+	unsigned addr;
+} breakinfo[4] = { {
+enabled:0}, {
+enabled:0}, {
+enabled:0}, {
+enabled:0}};
+
+void
+correct_hw_break(void)
+{
+	int breakno;
+	int correctit;
+	int breakbit;
+	unsigned dr7;
+
+	asm volatile ("movl %%db7, %0\n":"=r" (dr7)
+		      :);
+	do {
+		unsigned addr0, addr1, addr2, addr3;
+		asm volatile ("movl %%db0, %0\n"
+			      "movl %%db1, %1\n"
+			      "movl %%db2, %2\n"
+			      "movl %%db3, %3\n":"=r" (addr0), "=r"(addr1),
+			      "=r"(addr2), "=r"(addr3):);
+	} while (0);
+	correctit = 0;
+	for (breakno = 0; breakno < 4; breakno++) {
+		breakbit = 2 << (breakno << 1);
+		if (!(dr7 & breakbit) && breakinfo[breakno].enabled) {
+			correctit = 1;
+			dr7 |= breakbit;
+			dr7 &= ~(0xf0000 << (breakno << 2));
+			dr7 |= (((breakinfo[breakno].len << 2) |
+				 breakinfo[breakno].type) << 16) <<
+			    (breakno << 2);
+			switch (breakno) {
+			case 0:
+				asm volatile ("movl %0, %%dr0\n"::"r"
+					      (breakinfo[breakno].addr));
+				break;
+
+			case 1:
+				asm volatile ("movl %0, %%dr1\n"::"r"
+					      (breakinfo[breakno].addr));
+				break;
+
+			case 2:
+				asm volatile ("movl %0, %%dr2\n"::"r"
+					      (breakinfo[breakno].addr));
+				break;
+
+			case 3:
+				asm volatile ("movl %0, %%dr3\n"::"r"
+					      (breakinfo[breakno].addr));
+				break;
+			}
+		} else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) {
+			correctit = 1;
+			dr7 &= ~breakbit;
+			dr7 &= ~(0xf0000 << (breakno << 2));
+		}
+	}
+	if (correctit) {
+		asm volatile ("movl %0, %%db7\n"::"r" (dr7));
+	}
+}
+
+int
+remove_hw_break(unsigned breakno)
+{
+	if (!breakinfo[breakno].enabled) {
+		return -1;
+	}
+	breakinfo[breakno].enabled = 0;
+	return 0;
+}
+
+int
+set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr)
+{
+	if (breakinfo[breakno].enabled) {
+		return -1;
+	}
+	breakinfo[breakno].enabled = 1;
+	breakinfo[breakno].type = type;
+	breakinfo[breakno].len = len;
+	breakinfo[breakno].addr = addr;
+	return 0;
+}
+
+void
+gdb_wait(void *arg)
+{
+	unsigned long flags;
+	int processor;
+
+	local_irq_save(flags);
+	processor = smp_processor_id();
+	procindebug[processor] = 1;
+	current->thread.kgdbregs = arg;
+	spin_lock(slavecpulocks + processor);
+	correct_hw_break();
+	procindebug[processor] = 0;
+	local_irq_restore(flags);
+}
+
+void
+printexceptioninfo(int exceptionNo, int errorcode, char *buffer)
+{
+	unsigned dr6;
+	int i;
+	switch (exceptionNo) {
+	case 1:		/* debug exception */
+		break;
+	case 3:		/* breakpoint */
+		sprintf(buffer, "Software breakpoint");
+		return;
+	default:
+		sprintf(buffer, "Details not available");
+		return;
+	}
+	asm volatile ("movl %%db6, %0\n":"=r" (dr6)
+		      :);
+	if (dr6 & 0x4000) {
+		sprintf(buffer, "Single step");
+		return;
+	}
+	for (i = 0; i < 4; ++i) {
+		if (dr6 & (1 << i)) {
+			sprintf(buffer, "Hardware breakpoint %d", i);
+			return;
+		}
+	}
+	sprintf(buffer, "Unknown trap");
+	return;
+}
+
+/*
+ * This function does all command processing for interfacing to gdb.
+ *
+ * NOTE:  The INT nn instruction leaves the state of the interrupt
+ *        enable flag UNCHANGED.  That means that when this routine
+ *        is entered via a breakpoint (INT 3) instruction from code
+ *        that has interrupts enabled, then interrupts will STILL BE
+ *        enabled when this routine is entered.
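+ *
+ *        For orientation, a typical exchange with gdb looks like this
+ *        on the wire (remote serial protocol framing; the address is
+ *        made up for the example):
+ *
+ *            gdb:    $mc0100000,10#hh     read 0x10 bytes at c0100000
+ *            stub:   $8b15a0...#hh        hex-encoded memory contents
+ *
+ *        with #hh the two-hex-digit checksum that getpacket() and
+ *        putpacket() verify and append.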
+ *
+ *        The first thing that we do here is disable interrupts so as
+ *        to prevent recursive entries and bothersome serial interrupts
+ *        while we are trying to run the serial port in polled mode.
+ *
+ * For kernel version 2.1.xx the cli() actually gets a spin lock so
+ * it is always necessary to do a restore_flags before returning
+ * so as to let go of that lock.
+ */
+int
+handle_exception(int exceptionVector,
+		 int signo, int err_code, struct pt_regs *linux_regs)
+{
+	struct task_struct *usethread = NULL;
+	int addr, length;
+	int breakno, breaktype;
+	char *ptr;
+	int newPC;
+	unsigned long flags = ~0UL;
+	int gdb_regs[NUMREGBYTES / 4];
+	int i;
+	int dr6;
+	int reboot = 0;
+#ifdef CONFIG_KGDB_THREAD
+	int nothreads;
+	int maxthreads;
+	int threadid;
+	threadref thref;
+	struct task_struct *thread = NULL;
+#endif
+#define regs (*linux_regs)
+
+	/*
+	 * If the entry is not from the kernel then return to the Linux
+	 * trap handler and let it process the interrupt normally.
+	 */
+	if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->xcs)) {
+		return (0);
+	}
+
+	if (kgdb_memerr_expected) {
+		/*
+		 * This fault occurred because of the get_char or set_char
+		 * routines.  These two routines use either eax or edx to
+		 * indirectly reference the location in memory that they
+		 * are working with.  For a page fault, when we return
+		 * the instruction will be retried, so we have to make
+		 * sure that these registers point to valid memory.
+		 */
+		kgdb_memerr = 1;	/* set mem error flag */
+		kgdb_memerr_expected = 0;
+		kgdb_memerr_cnt++;	/* helps in debugging */
+		regs.eax = (long) &garbage_loc;	/* make valid address */
+		regs.edx = (long) &garbage_loc;	/* make valid address */
+		return (0);
+	}
+#ifdef CONFIG_SMP
+	if (!spin_is_locked(&kgdb_nmispinlock))
+#else
+	if (!kgdb_nmispinlock)
+#endif
+	{
+
+		/* Get kgdb spinlock */
+#ifdef CONFIG_SMP
+		_raw_spin_lock(&kgdb_spinlock);
+#else
+		kgdb_spinlock = 1;
+#endif
+
+		local_irq_save(flags);
+
+		/* Disable hardware debugging while we are in kgdb */
+		__asm__("movl %0,%%db7":	/* no output */
+			:"r"(0));
+
+		for (i = 0; i < NR_CPUS; i++) {
+			spin_lock_init(&slavecpulocks[i]);
+			_raw_spin_lock(&slavecpulocks[i]);
+		}
+
+		if (num_online_cpus() > 1) {
+			/* Force other cpus in debugger */
+			if (smp_call_function(gdb_wait, NULL, 0, 99) != 0) {
+				return (1);
+			}
+		}
+
+		procindebug[smp_processor_id()] = 1;
+	}
+
+	gdb_i386vector = exceptionVector;
+	gdb_i386errcode = err_code;
+
+	/* reply to host that an exception has occurred */
+	remcomOutBuffer[0] = 'S';
+	remcomOutBuffer[1] = hexchars[signo >> 4];
+	remcomOutBuffer[2] = hexchars[signo % 16];
+	remcomOutBuffer[3] = 0;
+
+	putpacket(remcomOutBuffer);
+
+	while (1) {
+		error = 0;
+		remcomOutBuffer[0] = 0;
+		getpacket(remcomInBuffer);
+		switch (remcomInBuffer[0]) {
+		case '?':
+			remcomOutBuffer[0] = 'S';
+			remcomOutBuffer[1] = hexchars[signo >> 4];
+			remcomOutBuffer[2] = hexchars[signo % 16];
+			remcomOutBuffer[3] = 0;
+			break;
+		case 'g':	/* return the value of the CPU registers */
+			if (!usethread || usethread == current) {
+				regs_to_gdb_regs(gdb_regs, &regs);
+			} else {
+				memset(gdb_regs, 0, NUMREGBYTES);
+				if (usethread->thread.kgdbregs) {
+					kgdb_memerr_expected = 1;
+					kgdb_memerr = 0;
+					get_char((char *) usethread->thread.
+						 kgdbregs);
+					kgdb_memerr_expected = 0;
+					if (kgdb_memerr) {
+						gdb_regs[_PC] =
+						    (int) kgdb_usercode;
+					} else {
+						regs_to_gdb_regs(gdb_regs,
+								 usethread->thread.
+								 kgdbregs);
+					}
+				} else {
+					gdb_regs[_PC] = (int) kgdb_usercode;
+				}
+			}
+			mem2hex((char *) gdb_regs, remcomOutBuffer, NUMREGBYTES,
+				0);
+			break;
+		case 'G':	/* set the value of the CPU registers - return OK */
+			hex2mem(&remcomInBuffer[1], (char *) gdb_regs,
+				NUMREGBYTES, 0);
+			if (!usethread || usethread == current) {
+				gdb_regs_to_regs(gdb_regs, &regs);
+				strcpy(remcomOutBuffer, "OK");
+			} else {
+				strcpy(remcomOutBuffer, "E00");
+			}
+			break;
+
+			/* mAA..AA,LLLL  Read LLLL bytes at address AA..AA */
+		case 'm':
+			/* TRY TO READ %x,%x.  IF SUCCEED, SET PTR = 0 */
+			ptr = &remcomInBuffer[1];
+			if (hexToInt(&ptr, &addr))
+				if (*(ptr++) == ',')
+					if (hexToInt(&ptr, &length)) {
+						ptr = 0;
+						mem2hex((char *) addr,
+							remcomOutBuffer, length,
+							1);
+						if (kgdb_memerr) {
+							strcpy(remcomOutBuffer,
+							       "E03");
+						}
+					}
+
+			if (ptr) {
+				strcpy(remcomOutBuffer, "E01");
+			}
+			break;
+
+			/* MAA..AA,LLLL: Write LLLL bytes at address AA..AA return OK */
+		case 'M':
+			/* TRY TO READ '%x,%x:'.  IF SUCCEED, SET PTR = 0 */
+			ptr = &remcomInBuffer[1];
+			if (hexToInt(&ptr, &addr))
+				if (*(ptr++) == ',')
+					if (hexToInt(&ptr, &length))
+						if (*(ptr++) == ':') {
+							hex2mem(ptr,
+								(char *) addr,
+								length, 1);
+
+							if (kgdb_memerr) {
+								strcpy
+								    (remcomOutBuffer,
+								     "E03");
+							} else {
+								strcpy
+								    (remcomOutBuffer,
+								     "OK");
+							}
+
+							ptr = 0;
+						}
+			if (ptr) {
+				strcpy(remcomOutBuffer, "E02");
+			}
+			break;
+
+			/* cAA..AA    Continue at address AA..AA(optional) */
+			/* sAA..AA    Step one instruction from AA..AA(optional) */
+		case 'c':
+		case 's':
+#ifdef CONFIG_SMP
+			if (spin_is_locked(&kgdb_nmispinlock))
+#else
+			if (kgdb_nmispinlock)
+#endif
+			{
+				strcpy(remcomOutBuffer, "E01");
+				break;
+			}
+
+			/* try to read optional parameter, pc unchanged if no parm */
+			ptr = &remcomInBuffer[1];
+			if (hexToInt(&ptr, &addr)) {
+				regs.eip = addr;
+			}
+
+			newPC = regs.eip;
+
+			/* clear the trace bit */
+			regs.eflags &= 0xfffffeff;
+
+			/* set the trace bit if we're stepping */
+			if (remcomInBuffer[0] == 's')
+				regs.eflags |= 0x100;
+
+			asm volatile ("movl %%db6, %0\n":"=r" (dr6)
+				      :);
+			if (!(dr6 & 0x4000)) {
+				for (breakno = 0; breakno < 4; ++breakno) {
+					if (dr6 & (1 << breakno)) {
+						if (breakinfo[breakno].type ==
+						    0) {
+							/* Set restore flag */
+							regs.eflags |= 0x10000;
+							break;
+						}
+					}
+				}
+			}
+			correct_hw_break();
+			asm volatile ("movl %0, %%db6\n"::"r" (0));
+			for (i = 0; i < NR_CPUS; i++) {
+				_raw_spin_unlock(&slavecpulocks[i]);
+			}
+
+			procindebug[smp_processor_id()] = 0;
+			/* Release kgdb spinlock */
+#ifdef CONFIG_SMP
+			_raw_spin_unlock(&kgdb_spinlock);
+#else
+			kgdb_spinlock = 0;
+#endif
+			if (flags != ~0UL)
+				local_irq_restore(flags);
+			return (0);
+
+			/* kill the program */
+		case 'k':
+			break;
+
+			/* query */
+		case 'q':
+			switch (remcomInBuffer[1]) {
+#ifdef CONFIG_KGDB_THREAD
+			case 'L':
+				/* List threads */
+				unpack_byte(remcomInBuffer + 3, &maxthreads);
+				unpack_threadid(remcomInBuffer + 5, &thref);
+
+				remcomOutBuffer[0] = 'q';
+				remcomOutBuffer[1] = 'M';
+				remcomOutBuffer[4] = '0';
+				pack_threadid(remcomOutBuffer + 5, &thref);
+
+				threadid = threadref_to_int(&thref);
+				for (nothreads = 0;
+				     nothreads < maxthreads
+				     && threadid < pid_max; threadid++) {
+					thread = getthread(threadid);
+					if (thread) {
+						int_to_threadref(&thref,
+								 threadid);
+						pack_threadid(remcomOutBuffer +
+							      21 +
+							      nothreads * 16,
+							      &thref);
+						nothreads++;
+					}
+				}
+				if (threadid == pid_max) {
+					remcomOutBuffer[4] = '1';
+				}
+				pack_hex_byte(remcomOutBuffer + 2, nothreads);
+				remcomOutBuffer[21 + nothreads * 16] = '\0';
+				break;
+
+			case 'C':
+				/* Current thread id */
+				remcomOutBuffer[0] = 'Q';
+				remcomOutBuffer[1] = 'C';
+				threadid = current->pid;
+				int_to_threadref(&thref, threadid);
+				pack_threadid(remcomOutBuffer + 2, &thref);
+				remcomOutBuffer[18] = '\0';
+				break;
+#endif
+
+			case 'E':
+				/* Print exception info */
+				printexceptioninfo(exceptionVector, err_code,
+						   remcomOutBuffer);
+				break;
+			}
+			break;
+
+#ifdef CONFIG_KGDB_THREAD
+			/* task related */
+		case 'H':
+			switch (remcomInBuffer[1]) {
+			case 'g':
+				ptr = &remcomInBuffer[2];
+				hexToInt(&ptr, &threadid);
+				thread = getthread(threadid);
+				if (!thread) {
+					remcomOutBuffer[0] = 'E';
+					remcomOutBuffer[1] = '\0';
+					break;
+				}
+				usethread = thread;
+				/* fall through */
+			case 'c':
+				remcomOutBuffer[0] = 'O';
+				remcomOutBuffer[1] = 'K';
+				remcomOutBuffer[2] = '\0';
+				break;
+			}
+			break;
+
+			/* Query thread status */
+		case 'T':
+			ptr = &remcomInBuffer[1];
+			hexToInt(&ptr, &threadid);
+			thread = getthread(threadid);
+			if (thread) {
+				remcomOutBuffer[0] = 'O';
+				remcomOutBuffer[1] = 'K';
+				remcomOutBuffer[2] = '\0';
+			} else {
+				remcomOutBuffer[0] = 'E';
+				remcomOutBuffer[1] = '\0';
+			}
+			break;
+#endif
+
+		case 'r':
+			reboot = 1;
+			strcpy(remcomOutBuffer, "OK");
+			break;
+		case 'Y':
+			ptr = &remcomInBuffer[1];
+			hexToInt(&ptr, &breakno);
+			ptr++;
+			hexToInt(&ptr, &breaktype);
+			ptr++;
+			hexToInt(&ptr, &length);
+			ptr++;
+			hexToInt(&ptr, &addr);
+			if (set_hw_break
+			    (breakno & 0x3, breaktype & 0x3, length & 0x3, addr)
+			    == 0) {
+				strcpy(remcomOutBuffer, "OK");
+			} else {
+				strcpy(remcomOutBuffer, "ERROR");
+			}
+			break;
+
+			/* Remove hardware breakpoint */
+		case 'y':
+			ptr = &remcomInBuffer[1];
+			hexToInt(&ptr, &breakno);
+			if (remove_hw_break(breakno & 0x3) == 0) {
+				strcpy(remcomOutBuffer, "OK");
+			} else {
+				strcpy(remcomOutBuffer, "ERROR");
+			}
+			break;
+
+		}		/* switch */
+
+		/* reply to the request */
+		putpacket(remcomOutBuffer);
+		if (reboot == 1) {
+			static long no_idt[2];
+			__asm__ __volatile__("lidt %0"::"m"(no_idt));
+			__asm__ __volatile__("int3");
+		}
+	}
+}
+
+/* this function is used to set up exception handlers for tracing and
+   breakpoints */
+void
+set_debug_traps(void)
+{
+	/*
+	 * linux_debug_hook is defined in traps.c.  We store a pointer
+	 * to our own exception handler into it.
+	 */
+	linux_debug_hook = handle_exception;
+
+	/*
+	 * In case GDB is started before us, ack any packets (presumably
+	 * "$?#xx") sitting there.  */
+	putDebugChar('+');
+
+	initialized = 1;
+}
+
+/* This function will generate a breakpoint exception.  It is used at the
+   beginning of a program to sync up with a debugger and can be used
+   otherwise as a quick means to stop program execution and "break" into
+   the debugger.
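+
+   A minimal usage sketch (hypothetical call site; any kernel code that
+   runs after the stub has been installed could do this):
+
+       set_debug_traps();    install the stub as linux_debug_hook
+       breakpoint();         an attached gdb gets control here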
*/ + +void +breakpoint(void) +{ + if (initialized) + BREAKPOINT(); +} + +#ifdef CONFIG_GDB_CONSOLE +char gdbconbuf[BUFMAX]; + +void +gdb_console_write(struct console *co, const char *s, unsigned count) +{ + int i; + int wcount; + char *bufptr; + + if (!gdb_initialized) { + return; + } + gdbconbuf[0] = 'O'; + bufptr = gdbconbuf + 1; + while (count > 0) { + if ((count << 1) > (BUFMAX - 2)) { + wcount = (BUFMAX - 2) >> 1; + } else { + wcount = count; + } + count -= wcount; + for (i = 0; i < wcount; i++) { + bufptr = pack_hex_byte(bufptr, s[i]); + } + *bufptr = '\0'; + s += wcount; + + putpacket(gdbconbuf); + + } +} +#endif +static int __init +kgdb_opt_gdb(char *dummy) +{ + gdb_enter = 1; + return 1; +} +static int __init +kgdb_opt_gdbttyS(char *str) +{ + gdb_ttyS = simple_strtoul(str, NULL, 10); + return 1; +} +static int __init +kgdb_opt_gdbbaud(char *str) +{ + gdb_baud = simple_strtoul(str, NULL, 10); + return 1; +} + +/* + * Sequence of these lines has to be maintained because gdb option is a prefix + * of the other two options + */ + +__setup("gdbttyS=", kgdb_opt_gdbttyS); +__setup("gdbbaud=", kgdb_opt_gdbbaud); +__setup("gdb", kgdb_opt_gdb); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/head.S 900-mjb3/arch/i386/kernel/head.S --- 000-bk3/arch/i386/kernel/head.S Tue Feb 25 23:03:43 2003 +++ 900-mjb3/arch/i386/kernel/head.S Sat Mar 8 22:05:02 2003 @@ -16,6 +16,7 @@ #include #include #include +#include #define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F @@ -325,7 +326,7 @@ rp_sidt: ret ENTRY(stack_start) - .long init_thread_union+8192 + .long init_thread_union+THREAD_SIZE .long __BOOT_DS /* This is the default interrupt "handler" :-) */ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/i386_ksyms.c 900-mjb3/arch/i386/kernel/i386_ksyms.c --- 000-bk3/arch/i386/kernel/i386_ksyms.c Wed Mar 5 07:36:57 2003 +++ 900-mjb3/arch/i386/kernel/i386_ksyms.c Sat Mar 8 22:05:10 2003 @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -33,6 +34,8 @@ #include #include #include +#include +#include extern void dump_thread(struct pt_regs *, struct user *); extern spinlock_t rtc_lock; @@ -68,6 +71,7 @@ EXPORT_SYMBOL(EISA_bus); EXPORT_SYMBOL(MCA_bus); #ifdef CONFIG_DISCONTIGMEM EXPORT_SYMBOL(node_data); +EXPORT_SYMBOL(physnode_map); #endif #ifdef CONFIG_X86_NUMAQ EXPORT_SYMBOL(xquad_portio); @@ -147,6 +151,20 @@ EXPORT_SYMBOL(smp_num_siblings); EXPORT_SYMBOL(cpu_sibling_map); #endif +#ifdef CONFIG_X86_REMOTE_DEBUG +void __this_fixmap_does_not_exist(void) +{ + BUG(); +} +EXPORT_SYMBOL(__this_fixmap_does_not_exist); + +void __br_lock_usage_bug(void) +{ + BUG(); +} +EXPORT_SYMBOL(__br_lock_usage_bug); +#endif + #ifdef CONFIG_SMP EXPORT_SYMBOL(cpu_data); EXPORT_SYMBOL(cpu_online_map); @@ -184,6 +202,7 @@ EXPORT_SYMBOL(rtc_lock); EXPORT_SYMBOL_GPL(set_nmi_callback); EXPORT_SYMBOL_GPL(unset_nmi_callback); +EXPORT_SYMBOL_GPL(die_chain); #undef memcpy #undef memset @@ -212,4 +231,26 @@ EXPORT_SYMBOL(kmap_atomic_to_page); #ifdef CONFIG_EDD_MODULE EXPORT_SYMBOL(edd); EXPORT_SYMBOL(eddnr); +#endif + +#ifdef CONFIG_CRASH_DUMP_MODULE +#ifdef CONFIG_SMP +extern irq_desc_t irq_desc[NR_IRQS]; +extern unsigned long irq_affinity[NR_IRQS]; +extern void stop_this_cpu(void *); +EXPORT_SYMBOL(irq_desc); +EXPORT_SYMBOL(irq_affinity); +EXPORT_SYMBOL(stop_this_cpu); +EXPORT_SYMBOL(dump_send_ipi); +#endif +extern int pfn_is_ram(unsigned long); +EXPORT_SYMBOL(pfn_is_ram); +#ifdef ARCH_HAS_NMI_WATCHDOG +EXPORT_SYMBOL(touch_nmi_watchdog); +#endif +#endif + +#ifdef 
CONFIG_X86_STACK_CHECK +extern void mcount(void); +EXPORT_SYMBOL(mcount); #endif diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/init_task.c 900-mjb3/arch/i386/kernel/init_task.c --- 000-bk3/arch/i386/kernel/init_task.c Thu Feb 13 11:08:02 2003 +++ 900-mjb3/arch/i386/kernel/init_task.c Sat Mar 8 22:05:04 2003 @@ -14,6 +14,14 @@ static struct signal_struct init_signals static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +union thread_union init_irq_union + __attribute__((__section__(".data.init_task"))); + +#ifdef CONFIG_X86_STACK_CHECK +union thread_union stack_overflow_stack + __attribute__((__section__(".data.init_task"))); +#endif + /* * Initial thread structure. * diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/io_apic.c 900-mjb3/arch/i386/kernel/io_apic.c --- 000-bk3/arch/i386/kernel/io_apic.c Sat Mar 8 10:16:03 2003 +++ 900-mjb3/arch/i386/kernel/io_apic.c Sat Mar 8 22:03:43 2003 @@ -116,40 +116,84 @@ static void __init replace_pin_at_irq(un } } -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_pin_list *entry = irq_2_pin + irq; \ - \ - for (;;) { \ - unsigned int reg; \ - pin = entry->pin; \ - if (pin == -1) \ - break; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ - if (!entry->next) \ - break; \ - entry = irq_2_pin + entry->next; \ - } \ - FINAL; \ -} - -#define DO_ACTION(name,R,ACTION, FINAL) \ - \ - static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION, FINAL) - -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ -DO_ACTION( __mask_and_edge, 0, = (reg & 0xffff7fff) | 0x00010000, ) - /* mask = 1, trigger = 0 */ -DO_ACTION( __unmask_and_level, 0, = (reg & 0xfffeffff) | 0x00008000, ) - /* mask = 0, trigger = 1 */ +/* mask = 1 */ +static void __mask_IO_APIC_irq (unsigned int irq) +{ + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + for (;;) { + unsigned int reg; + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + io_apic_modify(entry->apic, 0x10 + pin*2, reg |= 0x00010000); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + io_apic_sync(entry->apic); +} + +/* mask = 0 */ +static void __unmask_IO_APIC_irq (unsigned int irq) +{ + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + for (;;) { + unsigned int reg; + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + io_apic_modify(entry->apic, 0x10 + pin*2, reg &= 0xfffeffff); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +/* mask = 1, trigger = 0 */ +static void __mask_and_edge_IO_APIC_irq (unsigned int irq) +{ + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + for (;;) { + unsigned int reg; + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg = (reg & 0xffff7fff) | 0x00010000; + io_apic_modify(entry->apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +/* mask = 0, trigger = 1 */ +static void __unmask_and_level_IO_APIC_irq (unsigned int irq) +{ + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + for (;;) { + unsigned int reg; + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg = (reg & 0xfffeffff) | 
0x00008000;
+		io_apic_modify(entry->apic, 0x10 + pin*2, reg);
+		if (!entry->next)
+			break;
+		entry = irq_2_pin + entry->next;
+	}
+}
 
 static void mask_IO_APIC_irq (unsigned int irq)
 {
@@ -197,13 +241,23 @@ static void clear_IO_APIC (void)
 static void set_ioapic_affinity (unsigned int irq, unsigned long mask)
 {
 	unsigned long flags;
+	int pin;
+	struct irq_pin_list *entry = irq_2_pin + irq;
 
 	/*
 	 * Only the first 8 bits are valid.
 	 */
 	mask = mask << 24;
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__DO_ACTION(1, = mask, )
+	for (;;) {
+		pin = entry->pin;
+		if (pin == -1)
+			break;
+		io_apic_write(entry->apic, 0x10 + 1 + pin*2, mask);
+		if (!entry->next)
+			break;
+		entry = irq_2_pin + entry->next;
+	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/irq.c 900-mjb3/arch/i386/kernel/irq.c
--- 000-bk3/arch/i386/kernel/irq.c	Wed Mar 5 07:36:57 2003
+++ 900-mjb3/arch/i386/kernel/irq.c	Sat Mar 8 22:05:03 2003
@@ -311,7 +311,8 @@ void enable_irq(unsigned int irq)
  * SMP cross-CPU interrupts have their own specific
  * handlers).
  */
-asmlinkage unsigned int do_IRQ(struct pt_regs regs)
+struct pt_regs * IRQHANDLER(do_IRQ(struct pt_regs *regs));
+struct pt_regs * do_IRQ(struct pt_regs *regs)
 {
 	/*
 	 * We ack quickly, we don't want the irq controller
@@ -323,7 +324,7 @@ asmlinkage unsigned int do_IRQ(struct pt
 	 * 0 return value means that this irq is already being
 	 * handled by some other CPU. (or is disabled)
 	 */
-	int irq = regs.orig_eax & 0xff; /* high bits used in ret_from_ code  */
+	int irq = regs->orig_eax & 0xff; /* high bits used in ret_from_ code  */
 	int cpu = smp_processor_id();
 	irq_desc_t *desc = irq_desc + irq;
 	struct irqaction * action;
@@ -388,7 +389,7 @@ asmlinkage unsigned int do_IRQ(struct pt
 	 */
 	for (;;) {
 		spin_unlock(&desc->lock);
-		handle_IRQ_event(irq, &regs, action);
+		handle_IRQ_event(irq, regs, action);
 		spin_lock(&desc->lock);
 
 		if (likely(!(desc->status & IRQ_PENDING)))
@@ -407,7 +408,7 @@ out:
 
 	irq_exit();
 
-	return 1;
+	return regs;
 }
 
 /**
@@ -865,8 +866,9 @@ static int irq_affinity_write_proc (stru
 		return -EINVAL;
 
 	irq_affinity[irq] = new_value;
+#ifndef CONFIG_X86_SUMMIT
 	irq_desc[irq].handler->set_affinity(irq, new_value);
-
+#endif
 	return full_count;
 }
 
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/kprobes.c 900-mjb3/arch/i386/kernel/kprobes.c
--- 000-bk3/arch/i386/kernel/kprobes.c	Wed Dec 31 16:00:00 1969
+++ 900-mjb3/arch/i386/kernel/kprobes.c	Sat Mar 8 22:04:56 2003
@@ -0,0 +1,160 @@
+/*
+ * Support for kernel probes.
+ * (C) 2002 Vamsi Krishna S.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+/* kprobe_status settings */
+#define KPROBE_HIT_ACTIVE	0x00000001
+#define KPROBE_HIT_SS		0x00000002
+
+static struct kprobe *current_kprobe;
+static unsigned long kprobe_status, kprobe_old_eflags, kprobe_saved_eflags;
+
+/*
+ * returns non-zero if opcode modifies the interrupt flag.
+ */
+static inline int is_IF_modifier(u8 opcode)
+{
+	switch(opcode) {
+	case 0xfa:	/* cli */
+	case 0xfb:	/* sti */
+	case 0xcf:	/* iret/iretd */
+	case 0x9d:	/* popf/popfd */
+		return 1;
+	}
+	return 0;
+}
+
+static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+	*p->addr = p->opcode;
+	regs->eip = (unsigned long)p->addr;
+}
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate and they
+ * remain disabled throughout this function.
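+ *
+ * For context, a probe would normally be installed through the generic
+ * kprobes layer (which is not part of this file), along these lines
+ * (a sketch only, with my_pre_handler standing in for a real callback):
+ *
+ *	static struct kprobe kp = {
+ *		.addr = (u8 *) some_kernel_function,
+ *		.pre_handler = my_pre_handler,
+ *	};
+ *	register_kprobe(&kp);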
+ */
+int kprobe_handler(struct pt_regs *regs)
+{
+	struct kprobe *p;
+	int ret = 0;
+	u8 *addr = (u8 *)(regs->eip-1);
+
+	/* We're in an interrupt, but this is clear and BUG()-safe. */
+	preempt_disable();
+
+	/* Check we're not actually recursing */
+	if (kprobe_running()) {
+		/* We *are* holding lock here, so this is safe.
+		   Disarm the probe we just hit, and ignore it. */
+		p = get_kprobe(addr);
+		if (p) {
+			disarm_kprobe(p, regs);
+			ret = 1;
+		}
+		/* If it's not ours, can't be delete race, (we hold lock). */
+		goto no_kprobe;
+	}
+
+	lock_kprobes();
+	p = get_kprobe(addr);
+	if (!p) {
+		unlock_kprobes();
+		/* Unregistered (on another cpu) after this hit?  Ignore */
+		if (*addr != BREAKPOINT_INSTRUCTION)
+			ret = 1;
+		/* Not one of ours: let kernel handle it */
+		goto no_kprobe;
+	}
+
+	kprobe_status = KPROBE_HIT_ACTIVE;
+	current_kprobe = p;
+	kprobe_saved_eflags = kprobe_old_eflags
+	    = (regs->eflags & (TF_MASK|IF_MASK));
+	if (is_IF_modifier(p->opcode))
+		kprobe_saved_eflags &= ~IF_MASK;
+
+	p->pre_handler(p, regs);
+
+	regs->eflags |= TF_MASK;
+	regs->eflags &= ~IF_MASK;
+
+	/* We hold lock, now we remove breakpoint and single step. */
+	disarm_kprobe(p, regs);
+	kprobe_status = KPROBE_HIT_SS;
+	return 1;
+
+no_kprobe:
+	preempt_enable_no_resched();
+	return ret;
+}
+
+static void rearm_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+	regs->eflags &= ~TF_MASK;
+	*p->addr = BREAKPOINT_INSTRUCTION;
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate and they
+ * remain disabled throughout this function.  And we hold kprobe lock.
+ */
+int post_kprobe_handler(struct pt_regs *regs)
+{
+	if (!kprobe_running())
+		return 0;
+
+	if (current_kprobe->post_handler)
+		current_kprobe->post_handler(current_kprobe, regs, 0);
+
+	/*
+	 * We singlestepped with interrupts disabled.  So, the result on
+	 * the stack would be incorrect for "pushfl" instruction.
+	 * Note that regs->esp is actually the top of the stack when the
+	 * trap occurs in kernel space.
+	 */
+	if (current_kprobe->opcode == 0x9c) {	/* pushfl */
+		regs->esp &= ~(TF_MASK | IF_MASK);
+		regs->esp |= kprobe_old_eflags;
+	}
+
+	rearm_kprobe(current_kprobe, regs);
+	regs->eflags |= kprobe_saved_eflags;
+
+	unlock_kprobes();
+	preempt_enable_no_resched();
+
+	/*
+	 * if somebody else is singlestepping across a probe point, eflags
+	 * will have TF set, in which case, continue the remaining processing
+	 * of do_debug, as if this is not a probe hit.
+	 */
+	if (regs->eflags & TF_MASK)
+		return 0;
+
+	return 1;
+}
+
+/* Interrupts disabled, kprobe_lock held.
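+ * Reached from the trap handlers when a fault (for instance a page
+ * fault taken while single-stepping the probed instruction) hits with
+ * a probe active; see the kprobe_fault_handler() calls wired into
+ * do_page_fault() and do_general_protection() elsewhere in this patch.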
*/ +int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + if (current_kprobe->fault_handler + && current_kprobe->fault_handler(current_kprobe, regs, trapnr)) + return 1; + + if (kprobe_status & KPROBE_HIT_SS) { + rearm_kprobe(current_kprobe, regs); + regs->eflags |= kprobe_old_eflags; + + unlock_kprobes(); + preempt_enable_no_resched(); + } + return 0; +} diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/nmi.c 900-mjb3/arch/i386/kernel/nmi.c --- 000-bk3/arch/i386/kernel/nmi.c Wed Mar 5 07:36:57 2003 +++ 900-mjb3/arch/i386/kernel/nmi.c Sat Mar 8 22:05:10 2003 @@ -20,11 +20,27 @@ #include #include #include +#include +#include #include #include #include +#ifdef CONFIG_X86_REMOTE_DEBUG +extern gdb_debug_hook * linux_debug_hook; +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ + { \ + if (linux_debug_hook != (gdb_debug_hook *) NULL && !user_mode(regs)) \ + { \ + (*linux_debug_hook)(trapnr, signr, error_code, regs) ; \ + after; \ + } \ + } +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + unsigned int nmi_watchdog = NMI_NONE; static unsigned int nmi_hz = HZ; unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ @@ -361,12 +377,59 @@ void nmi_watchdog_tick (struct pt_regs * sum = irq_stat[cpu].apic_timer_irqs; if (last_irq_sums[cpu] == sum) { +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_spinlock)) +#else + if (kgdb_spinlock) +#endif + { + /* We are inside kgdb, this isn't a stuck cpu */ + alert_counter[cpu] = 0; + } else { +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_nmispinlock)) +#else + if (kgdb_nmispinlock) +#endif + { + if (!procindebug[cpu]) { + procindebug[cpu] = 1; + current->thread.kgdbregs = regs; + while (1) { + /* nothing */ + } + } + return; + } + } +#endif /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... 
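			 * (the watchdog ticks nmi_hz times a second, so
			 * the alert_counter == 5*nmi_hz test below works
			 * out to roughly five seconds of no progress)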
*/ alert_counter[cpu]++; if (alert_counter[cpu] == 5*nmi_hz) { +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_SMP + if (spin_trylock(&kgdb_nmispinlock)) +#else + kgdb_nmispinlock = 1; +#endif + { + procindebug[cpu] = 1; + CHK_REMOTE_DEBUG(2,SIGBUS,0,regs,) + } +#ifdef CONFIG_SMP + else { + procindebug[cpu] = 1; + current->thread.kgdbregs = regs; + while (1) { + /* nothing */ + } + } +#endif +#endif spin_lock(&nmi_print_lock); /* * We are in trouble anyway, lets at least try @@ -375,6 +438,7 @@ void nmi_watchdog_tick (struct pt_regs * bust_spinlocks(1); printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip); show_registers(regs); + notify_die(DIE_WATCHDOG, "nmi_watchdog", regs, 0); printk("console shuts up ...\n"); console_silent(); spin_unlock(&nmi_print_lock); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/numaq.c 900-mjb3/arch/i386/kernel/numaq.c --- 000-bk3/arch/i386/kernel/numaq.c Wed Mar 5 07:36:57 2003 +++ 900-mjb3/arch/i386/kernel/numaq.c Sat Mar 8 22:03:39 2003 @@ -31,8 +31,7 @@ #include /* These are needed before the pgdat's are created */ -unsigned long node_start_pfn[MAX_NUMNODES]; -unsigned long node_end_pfn[MAX_NUMNODES]; +extern long node_start_pfn[], node_end_pfn[]; #define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) @@ -64,26 +63,6 @@ static void __init smp_dump_qct(void) } } } - -/* - * ----------------------------------------- - * - * functions related to physnode_map - * - * ----------------------------------------- - */ -/* - * physnode_map keeps track of the physical memory layout of the - * numaq nodes on a 256Mb break (each element of the array will - * represent 256Mb of memory and will be marked by the node id. so, - * if the first gig is on node 0, and the second gig is on node 1 - * physnode_map will contain: - * physnode_map[0-3] = 0; - * physnode_map[4-7] = 1; - * physnode_map[8- ] = -1; - */ -int physnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1}; -EXPORT_SYMBOL(physnode_map); /* * for each node mark the regions diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/process.c 900-mjb3/arch/i386/kernel/process.c --- 000-bk3/arch/i386/kernel/process.c Tue Feb 25 23:03:43 2003 +++ 900-mjb3/arch/i386/kernel/process.c Sat Mar 8 22:05:04 2003 @@ -160,7 +160,25 @@ static int __init idle_setup (char *str) __setup("idle=", idle_setup); -void show_regs(struct pt_regs * regs) +void stack_overflow(unsigned long esp, unsigned long eip) +{ + int panicing = ((esp&(THREAD_SIZE-1)) <= STACK_PANIC); + + printk( "esp: 0x%lx masked: 0x%lx STACK_PANIC:0x%x %d %d\n", + esp, (esp&(THREAD_SIZE-1)), STACK_PANIC, (((esp&(THREAD_SIZE-1)) <= STACK_PANIC)), panicing ); + + if (panicing) + print_symbol("stack overflow from %s\n", eip); + else + print_symbol("excessive stack use from %s\n", eip); + printk("esp: %p\n", (void*)esp); + show_trace((void*)esp); + + if (panicing) + panic("stack overflow\n"); +} + +asmlinkage void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; @@ -437,6 +455,7 @@ struct task_struct * __switch_to(struct /* never put a printk in __switch_to... 
printk() calls wake_up*() indirectly */ + next_p->thread_info->irq_stack = prev_p->thread_info->irq_stack; unlazy_fpu(prev_p); /* diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/reboot.c 900-mjb3/arch/i386/kernel/reboot.c --- 000-bk3/arch/i386/kernel/reboot.c Sun Nov 17 20:29:30 2002 +++ 900-mjb3/arch/i386/kernel/reboot.c Sat Mar 8 22:05:10 2003 @@ -8,6 +8,7 @@ #include #include #include +#include /* * Power off function, if any @@ -256,7 +257,8 @@ void machine_restart(char * __unused) * Stop all CPUs and turn off local APICs and the IO-APIC, so * other OSs see a clean IRQ state. */ - smp_send_stop(); + if (notify_die(DIE_STOP,"cpustop",0,0) != NOTIFY_BAD) + smp_send_stop(); disable_IO_APIC(); #endif diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/smp.c 900-mjb3/arch/i386/kernel/smp.c --- 000-bk3/arch/i386/kernel/smp.c Wed Mar 5 07:36:57 2003 +++ 900-mjb3/arch/i386/kernel/smp.c Sat Mar 8 22:05:10 2003 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -144,6 +145,13 @@ static inline void __send_IPI_shortcut(u */ cfg = __prepare_ICR(shortcut, vector); + if (vector == DUMP_VECTOR) { + /* + * Setup DUMP IPI to be delivered as an NMI + */ + cfg = (cfg&~APIC_VECTOR_MASK)|APIC_DM_NMI; + } + /* * Send the IPI. The write to APIC_ICR fires this off. */ @@ -305,7 +313,8 @@ static inline void leave_mm (unsigned lo * 2) Leave the mm if we are in the lazy tlb mode. */ -asmlinkage void smp_invalidate_interrupt (void) +struct pt_regs * IRQHANDLER(smp_invalidate_interrupt(struct pt_regs *regs)); +struct pt_regs * smp_invalidate_interrupt(struct pt_regs *regs) { unsigned long cpu; @@ -336,6 +345,7 @@ asmlinkage void smp_invalidate_interrupt out: put_cpu_no_resched(); + return regs; } static void flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, @@ -450,6 +460,11 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, 0, 1, 1); } +void dump_send_ipi(void) +{ + send_IPI_allbutself(DUMP_VECTOR); +} + /* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing @@ -509,10 +524,17 @@ int smp_call_function (void (*func) (voi { struct call_data_struct data; int cpus = num_online_cpus()-1; + int count = 0; + int gdb; - if (!cpus) + if (cpus <= 0) return 0; + gdb = 0; + if (wait == 99) { + wait = 0; + gdb = 1; + } data.func = func; data.info = info; atomic_set(&data.started, 0); @@ -527,18 +549,33 @@ int smp_call_function (void (*func) (voi send_IPI_allbutself(CALL_FUNCTION_VECTOR); /* Wait for response */ - while (atomic_read(&data.started) != cpus) + while (atomic_read(&data.started) != cpus) { + if (gdb) { + if (count++ == 2000000) { + printk("%s: timeout\n", __FUNCTION__); + break; + } + if (count == 1000000) { + printk("looks bad\n"); + printk("cpus=%d, started=%d\n", cpus, + atomic_read(&data.started)); + } + if (count > 1000000) + udelay(1); + } barrier(); + } if (wait) while (atomic_read(&data.finished) != cpus) barrier(); + spin_unlock(&call_lock); return 0; } -static void stop_this_cpu (void * dummy) +void stop_this_cpu (void * dummy) { /* * Remove this CPU: @@ -569,14 +606,17 @@ void smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. 
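 *
 * Note the convention this patch introduces: IRQ and IPI handlers
 * take and return a struct pt_regs pointer (see the IRQHANDLER()
 * declarations), which lets the low-level entry code run them on the
 * per-cpu interrupt stack and hand the register frame back on return.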
*/ -asmlinkage void smp_reschedule_interrupt(void) +struct pt_regs * IRQHANDLER(smp_reschedule_interrupt(struct pt_regs *regs)); +struct pt_regs * smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); + return regs; } -asmlinkage void smp_call_function_interrupt(void) +struct pt_regs * IRQHANDLER(smp_call_function_interrupt(struct pt_regs *regs)); +struct pt_regs * smp_call_function_interrupt(struct pt_regs *regs) { - void (*func) (void *info) = call_data->func; + void (*func) (void *info, struct pt_regs *) = (void (*)(void *, struct pt_regs*))call_data->func; void *info = call_data->info; int wait = call_data->wait; @@ -591,12 +631,12 @@ asmlinkage void smp_call_function_interr * At this point the info structure may be out of scope unless wait==1 */ irq_enter(); - (*func)(info); + (*func)(info, regs); irq_exit(); if (wait) { mb(); atomic_inc(&call_data->finished); } + return regs; } - diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/smpboot.c 900-mjb3/arch/i386/kernel/smpboot.c --- 000-bk3/arch/i386/kernel/smpboot.c Wed Mar 5 07:36:57 2003 +++ 900-mjb3/arch/i386/kernel/smpboot.c Sat Mar 8 22:05:10 2003 @@ -48,6 +48,7 @@ #include #include #include +#include #include #include "smpboot_hooks.h" @@ -62,7 +63,7 @@ int smp_num_siblings = 1; int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */ /* Bitmask of currently online CPUs */ -unsigned long cpu_online_map; +unsigned long cpu_online_map = 1; static volatile unsigned long cpu_callin_map; volatile unsigned long cpu_callout_map; @@ -71,6 +72,11 @@ static unsigned long smp_commenced_mask; /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +/* Per CPU interrupt stacks */ +extern union thread_union init_irq_union; +union thread_union *irq_stacks[NR_CPUS] __cacheline_aligned = + { &init_irq_union, }; + /* Set when the idlers are all forked */ int smp_threads_ready; @@ -412,6 +418,8 @@ void __init smp_callin(void) */ smp_store_cpu_info(cpuid); + notify_die(DIE_CPUINIT, "cpuinit", NULL, 0); + disable_APIC_timer(); local_irq_disable(); /* @@ -770,6 +778,28 @@ wakeup_secondary_cpu(int phys_apicid, un } #endif /* WAKE_SECONDARY_VIA_INIT */ +static void __init setup_irq_stack(struct task_struct *p, int cpu) +{ + unsigned long stk; + + stk = __get_free_pages(GFP_KERNEL, THREAD_ORDER); + if (!stk) + panic("I can't seem to allocate my irq stack. Oh well, giving up."); + + irq_stacks[cpu] = (void *)stk; + memset(irq_stacks[cpu], 0, THREAD_SIZE); + irq_stacks[cpu]->thread_info.cpu = cpu; + irq_stacks[cpu]->thread_info.preempt_count = 1; + /* interrupts are not preemptable */ + p->thread_info->irq_stack = &irq_stacks[cpu]->thread_info; + + /* If we want to make the irq stack more than one unit + * deep, we can chain then off of the irq_stack pointer + * here. + */ +} + + extern unsigned long cpu_initialized; static int __init do_boot_cpu(int apicid) @@ -794,6 +824,8 @@ static int __init do_boot_cpu(int apicid if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); + setup_irq_stack(idle, cpu); + /* * We remove it from the pidhash and the runqueue * once we got the process: @@ -966,6 +998,7 @@ static void __init smp_boot_cpus(unsigne if (APIC_init_uniprocessor()) printk(KERN_NOTICE "Local APIC not detected." 
" Using dummy APIC emulation.\n"); + map_cpu_to_logical_apicid(); return; } diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/srat.c 900-mjb3/arch/i386/kernel/srat.c --- 000-bk3/arch/i386/kernel/srat.c Wed Mar 5 07:36:57 2003 +++ 900-mjb3/arch/i386/kernel/srat.c Sat Mar 8 22:03:38 2003 @@ -57,8 +57,7 @@ static int num_memory_chunks; /* total static int zholes_size_init; static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES]; -unsigned long node_start_pfn[MAX_NUMNODES]; -unsigned long node_end_pfn[MAX_NUMNODES]; +extern unsigned long node_start_pfn[], node_end_pfn[]; extern void * boot_ioremap(unsigned long, unsigned long); @@ -182,30 +181,19 @@ static __init void chunk_to_zones(unsign } } -/* - * physnode_map keeps track of the physical memory layout of the - * numaq nodes on a 256Mb break (each element of the array will - * represent 256Mb of memory and will be marked by the node id. so, - * if the first gig is on node 0, and the second gig is on node 1 - * physnode_map will contain: - * physnode_map[0-3] = 0; - * physnode_map[4-7] = 1; - * physnode_map[8- ] = -1; - */ -int pfnnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1}; -EXPORT_SYMBOL(pfnnode_map); - -static void __init initialize_pfnnode_map(void) +static void __init initialize_physnode_map(void) { - unsigned long topofchunk, cur = 0; int i; - - for (i = 0; i < num_memory_chunks; i++) { - cur = node_memory_chunk[i].start_pfn; - topofchunk = node_memory_chunk[i].end_pfn; - while (cur < topofchunk) { - pfnnode_map[PFN_TO_ELEMENT(cur)] = node_memory_chunk[i].nid; - cur ++; + unsigned long pfn; + struct node_memory_chunk_s *nmcp; + + /* Run the list of memory chunks and fill in the phymap. */ + nmcp = node_memory_chunk; + for (i = num_memory_chunks; --i >= 0; nmcp++) { + for (pfn = nmcp->start_pfn; pfn <= nmcp->end_pfn; + pfn += PAGES_PER_ELEMENT) + { + physnode_map[pfn / PAGES_PER_ELEMENT] = (int)nmcp->nid; } } } @@ -272,7 +260,7 @@ static int __init acpi20_parse_srat(stru for (i = 0; i < num_memory_chunks; i++) node_memory_chunk[i].nid = pxm_to_nid_map[node_memory_chunk[i].pxm]; - initialize_pfnnode_map(); + initialize_physnode_map(); printk("pxm bitmap: "); for (i = 0; i < sizeof(pxm_bitmap); i++) { diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/kernel/traps.c 900-mjb3/arch/i386/kernel/traps.c --- 000-bk3/arch/i386/kernel/traps.c Tue Feb 25 23:03:43 2003 +++ 900-mjb3/arch/i386/kernel/traps.c Sat Mar 8 22:05:10 2003 @@ -25,6 +25,7 @@ #include #include #include +#include #ifdef CONFIG_EISA #include @@ -42,6 +43,7 @@ #include #include #include +#include #include #include @@ -51,6 +53,24 @@ #include #include +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + +#ifdef CONFIG_X86_REMOTE_DEBUG +gdb_debug_hook * linux_debug_hook; +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ + { \ + if (linux_debug_hook != (gdb_debug_hook *) NULL && !user_mode(regs)) \ + { \ + (*linux_debug_hook)(trapnr, signr, error_code, regs) ; \ + after; \ + } \ + } +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + asmlinkage int system_call(void); asmlinkage void lcall7(void); asmlinkage void lcall27(void); @@ -244,17 +264,25 @@ bug: } spinlock_t die_lock = SPIN_LOCK_UNLOCKED; +struct notifier_block *die_chain; void die(const char * str, struct pt_regs * regs, long err) { console_verbose(); + if (notify_die(DIE_DIE, str, regs, err) == NOTIFY_BAD) + goto exit_segv; + spin_lock_irq(&die_lock); bust_spinlocks(1); handle_BUG(regs); printk("%s: %04lx\n", str, err & 0xffff); + 
CHK_REMOTE_DEBUG(1,SIGTRAP,err,regs,) show_registers(regs); bust_spinlocks(0); spin_unlock_irq(&die_lock); + + notify_die(DIE_OOPS, str, regs, err); + exit_segv: do_exit(SIGSEGV); } @@ -312,6 +340,7 @@ static inline void do_trap(int trapnr, i #define DO_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,)\ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -329,7 +358,9 @@ asmlinkage void do_##name(struct pt_regs #define DO_VM86_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,return)\ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ + return; \ } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ @@ -344,7 +375,6 @@ asmlinkage void do_##name(struct pt_regs } DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) -DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip) @@ -358,6 +388,13 @@ asmlinkage void do_general_protection(st { if (regs->eflags & VM_MASK) goto gp_in_vm86; + + if (kprobe_running() && kprobe_fault_handler(regs, 13)) + return; + + if (notify_die(DIE_PROTFAULT, "general protection", regs, error_code) + == NOTIFY_BAD) + return; if (!(regs->xcs & 3)) goto gp_in_kernel; @@ -372,8 +409,10 @@ gp_in_vm86: return; gp_in_kernel: - if (!fixup_exception(regs)) + if (!fixup_exception(regs)) { + CHK_REMOTE_DEBUG(13,SIGSEGV,error_code,regs,); die("general protection fault", regs, error_code); + } } static void mem_parity_error(unsigned char reason, struct pt_regs * regs) @@ -402,6 +441,7 @@ static void io_check_error(unsigned char outb(reason, 0x61); } + static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) { #ifdef CONFIG_MCA @@ -436,10 +476,14 @@ static void default_do_nmi(struct pt_reg unknown_nmi_error(reason, regs); return; } + + if (notify_die(DIE_NMI, "nmi", regs, reason) == NOTIFY_BAD) + return; if (reason & 0x80) mem_parity_error(reason, regs); if (reason & 0x40) io_check_error(reason, regs); + /* * Reassert NMI in case it became active meanwhile * as it's edge-triggered. @@ -482,6 +526,20 @@ void unset_nmi_callback(void) nmi_callback = dummy_nmi_callback; } +asmlinkage int do_int3(struct pt_regs *regs, long error_code) +{ + /* This is an interrupt gate, because kprobes wants interrupts + disabled. Normal trap handlers don't. */ + if (kprobe_handler(regs)) + return 1; + if (notify_die(DIE_INT3, "int3", regs, error_code) == NOTIFY_BAD) + return 1; + + restore_interrupts(regs); + do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); + return 0; +} + /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore @@ -504,7 +562,7 @@ void unset_nmi_callback(void) * find every occurrence of the TF bit that could be saved away even * by user code) */ -asmlinkage void do_debug(struct pt_regs * regs, long error_code) +asmlinkage int do_debug(struct pt_regs * regs, long error_code) { unsigned int condition; struct task_struct *tsk = current; @@ -516,6 +574,18 @@ asmlinkage void do_debug(struct pt_regs if (regs->eflags & X86_EFLAGS_IF) local_irq_enable(); + if (post_kprobe_handler(regs)) + return 1; + + /* Interrupts not disabled for normal trap handling. 
*/
+	restore_interrupts(regs);
+
+	if (notify_die(DIE_DEBUG, "debug", regs, error_code) == NOTIFY_BAD)
+		return 1;
+
 	/* Mask out spurious debug traps due to lazy DR7 setting */
 	if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
 		if (!tsk->thread.debugreg[7])
@@ -539,8 +609,10 @@ asmlinkage void do_debug(struct pt_regs
 	 * allowing programs to debug themselves without the ptrace()
 	 * interface.
 	 */
+#ifndef CONFIG_X86_REMOTE_DEBUG
 		if ((regs->xcs & 3) == 0)
 			goto clear_TF_reenable;
+#endif
 		if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE)
 			goto clear_TF;
 	}
@@ -552,11 +624,13 @@
 	info.si_errno = 0;
 	info.si_code = TRAP_BRKPT;
 
-	/* If this is a kernel mode trap, save the user PC on entry to
-	 * the kernel, that's what the debugger can make sense of.
-	 */
-	info.si_addr = ((regs->xcs & 3) == 0) ? (void *)tsk->thread.eip :
-						(void *)regs->eip;
+
+	/* If this is a kernel mode trap, we need to reset db7 to allow us
+	 * to continue sanely */
+	if ((regs->xcs & 3) == 0)
+		goto clear_dr7;
+
+	info.si_addr = (void *)regs->eip;
 	force_sig_info(SIGTRAP, &info, tsk);
 
 	/* Disable additional traps. They'll be re-enabled when
@@ -566,17 +640,20 @@ clear_dr7:
 	__asm__("movl %0,%%db7"
 		: /* no output */
 		: "r" (0));
-	return;
+	CHK_REMOTE_DEBUG(1,SIGTRAP,error_code,regs,)
+	return 0;
 
 debug_vm86:
 	handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
-	return;
+	return 0;
 
+#ifndef CONFIG_X86_REMOTE_DEBUG
 clear_TF_reenable:
+#endif
 	set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
 clear_TF:
 	regs->eflags &= ~TF_MASK;
-	return;
+	return 0;
 }
 
 /*
@@ -740,6 +817,11 @@ asmlinkage void math_state_restore(struc
 	struct task_struct *tsk = current;
 
 	clts();		/* Allow maths ops (or we recurse) */
+	if (kprobe_running() && kprobe_fault_handler(&regs, 7))
+		return;
+	if (notify_die(DIE_FPUTRAP, "fpu", &regs, 0) == NOTIFY_BAD)
+		return;
+
 	if (!tsk->used_math)
 		init_fpu(tsk);
 	restore_fpu(tsk);
@@ -772,6 +854,39 @@ void __init trap_init_f00f_bug(void)
 }
 #endif
 
+static inline void get_current_regs(struct pt_regs *regs)
+{
+	__asm__ __volatile__("movl %%ebx,%0" : "=m"(regs->ebx));
+	__asm__ __volatile__("movl %%ecx,%0" : "=m"(regs->ecx));
+	__asm__ __volatile__("movl %%edx,%0" : "=m"(regs->edx));
+	__asm__ __volatile__("movl %%esi,%0" : "=m"(regs->esi));
+	__asm__ __volatile__("movl %%edi,%0" : "=m"(regs->edi));
+	__asm__ __volatile__("movl %%ebp,%0" : "=m"(regs->ebp));
+	__asm__ __volatile__("movl %%eax,%0" : "=m"(regs->eax));
+	__asm__ __volatile__("movl %%esp,%0" : "=m"(regs->esp));
+	__asm__ __volatile__("movw %%ss, %%ax;" :"=a"(regs->xss));
+	__asm__ __volatile__("movw %%cs, %%ax;" :"=a"(regs->xcs));
+	__asm__ __volatile__("movw %%ds, %%ax;" :"=a"(regs->xds));
+	__asm__ __volatile__("movw %%es, %%ax;" :"=a"(regs->xes));
+	__asm__ __volatile__("pushfl; popl %0" :"=m"(regs->eflags));
+	regs->eip = (unsigned long)current_text_addr();
+}
+
+static int panic_event(struct notifier_block *this,
+		       unsigned long event, void *ptr)
+{
+	struct pt_regs regs;
+	get_current_regs(&regs);
+
+	return notify_die(DIE_PANIC, (const char *)ptr, &regs, event);
+}
+
+extern struct notifier_block *panic_notifier_list;
+static struct notifier_block panic_block = {
+	.notifier_call = panic_event,
+};
+
 #define _set_gate(gate_addr,type,dpl,addr,seg) \
 do { \
   int __d0, __d1; \
@@ -838,9 +953,9 @@ void __init trap_init(void)
 #endif
 
 	set_trap_gate(0,&divide_error);
-	set_intr_gate(1,&debug);
+	_set_gate(idt_table+1,14,3,&debug,__KERNEL_CS);	/* debug trap for kprobes */
 	set_intr_gate(2,&nmi);
-	set_system_gate(3,&int3);	/* int3-5 can be called from all */
+	_set_gate(idt_table+3,14,3,&int3,__KERNEL_CS);	/* int3-5 can be called from all */
 	set_system_gate(4,&overflow);
 	set_system_gate(5,&bounds);
 	set_trap_gate(6,&invalid_op);
@@ -869,10 +984,14 @@ void __init trap_init(void)
 	set_call_gate(&default_ldt[0],lcall7);
 	set_call_gate(&default_ldt[4],lcall27);
 
+	notify_die(DIE_TRAPINIT, "traps initialized", 0, 0);
 	/*
 	 * Should be a barrier for any external CPU state.
 	 */
 	cpu_init();
 
 	trap_init_hook();
+
+	notifier_chain_register(&panic_notifier_list, &panic_block);
 }
+
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/mm/discontig.c 900-mjb3/arch/i386/mm/discontig.c
--- 000-bk3/arch/i386/mm/discontig.c	Wed Mar 5 07:36:57 2003
+++ 900-mjb3/arch/i386/mm/discontig.c	Sat Mar 8 22:03:39 2003
@@ -36,11 +36,36 @@
 struct pglist_data *node_data[MAX_NUMNODES];
 bootmem_data_t node0_bdata;
 
+/*
+ * numa interface - we expect the numa architecture specific code to have
+ * populated the following initialisation.
+ *
+ * 1) numnodes       - the total number of nodes configured in the system
+ * 2) physnode_map   - the mapping between a pfn and owning node
+ * 3) node_start_pfn - the starting page frame number for a node
+ * 4) node_end_pfn   - the ending page frame number for a node
+ */
+
+/*
+ * physnode_map keeps track of the physical memory layout of a generic
+ * numa node on a 256Mb break (each element of the array will
+ * represent 256Mb of memory and will be marked by the node id.  so,
+ * if the first gig is on node 0, and the second gig is on node 1
+ * physnode_map will contain:
+ *
+ *     physnode_map[0-3] = 0;
+ *     physnode_map[4-7] = 1;
+ *     physnode_map[8- ] = -1;
+ */
+u8 physnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1};
+
+unsigned long node_start_pfn[MAX_NUMNODES];
+unsigned long node_end_pfn[MAX_NUMNODES];
+
 extern unsigned long find_max_low_pfn(void);
 extern void find_max_pfn(void);
 extern void one_highpage_init(struct page *, int, int);
 
-extern unsigned long node_start_pfn[], node_end_pfn[];
 extern struct e820map e820;
 extern char _end;
 extern unsigned long highend_pfn, highstart_pfn;
@@ -55,6 +80,36 @@ unsigned long node_remap_size[MAX_NUMNOD
 unsigned long node_remap_offset[MAX_NUMNODES];
 void *node_remap_start_vaddr[MAX_NUMNODES];
 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+
+/*
+ * FLAT - support for basic PC memory model with discontig enabled, essentially
+ *        a single node with all available processors in it with a flat
+ *        memory map.
+ */
+void __init get_memcfg_numa_flat(void)
+{
+	int pfn;
+
+	printk("NUMA - single node, flat memory mode\n");
+
+	/* Run the memory configuration and find the top of memory. */
+	find_max_pfn();
+	node_start_pfn[0] = 0;
+	node_end_pfn[0] = max_pfn;
+
+	/* Fill in the physnode_map with our simplistic memory model,
+	 * all memory is in node 0.
+	 */
+	for (pfn = node_start_pfn[0]; pfn <= node_end_pfn[0];
+	     pfn += PAGES_PER_ELEMENT)
+	{
+		physnode_map[pfn / PAGES_PER_ELEMENT] = 0;
+	}
+
+	/* Indicate there is one node available.
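+	 *
+	 * A consumer later resolves a pfn to its node with a lookup of
+	 * the form below (a sketch; the actual helper is presumably
+	 * provided by the numa/mmzone headers):
+	 *
+	 *	nid = physnode_map[pfn / PAGES_PER_ELEMENT];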
*/ + node_set_online(0); + numnodes = 1; +} /* * Find the highest page frame number we have available for the node diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/mm/fault.c 900-mjb3/arch/i386/mm/fault.c --- 000-bk3/arch/i386/mm/fault.c Sat Mar 8 10:16:03 2003 +++ 900-mjb3/arch/i386/mm/fault.c Sat Mar 8 22:05:10 2003 @@ -2,6 +2,11 @@ * linux/arch/i386/mm/fault.c * * Copyright (C) 1995 Linus Torvalds + * + * Change History + * + * Tigran Aivazian Remote debugging support. + * */ #include @@ -20,12 +25,17 @@ #include #include /* For unblank_screen() */ #include +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif +#include #include #include #include #include #include +#include extern void die(const char *,struct pt_regs *,long); @@ -161,6 +171,15 @@ asmlinkage void do_page_fault(struct pt_ /* get the address */ __asm__("movl %%cr2,%0":"=r" (address)); + /* for many cases list will be empty, so optimize for that case */ + if (unlikely(die_chain != NULL) + && notify_die(DIE_PAGEFAULT, "page fault", regs, error_code) + == NOTIFY_BAD) + return; + + if (kprobe_running() && kprobe_fault_handler(regs, 14)) + return; + /* It's safe to allow irq's after cr2 has been saved */ if (regs->eflags & X86_EFLAGS_IF) local_irq_enable(); @@ -193,6 +212,15 @@ asmlinkage void do_page_fault(struct pt_ if (in_atomic() || !mm) goto no_context; +#ifdef CONFIG_X86_REMOTE_DEBUG + if (kgdb_memerr_expected) { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs) ; + return; /* return w/modified regs */ + } + } +#endif + down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -291,6 +319,19 @@ bad_area: force_sig_info(SIGSEGV, &info, tsk); return; } + +#ifdef CONFIG_X86_REMOTE_DEBUG + if (kgdb_memerr_expected) { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs); + return; /* Return with modified registers */ + } + } else { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs); + } + } +#endif #ifdef CONFIG_X86_F00F_BUG /* diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/mm/init.c 900-mjb3/arch/i386/mm/init.c --- 000-bk3/arch/i386/mm/init.c Sat Mar 8 10:16:03 2003 +++ 900-mjb3/arch/i386/mm/init.c Sat Mar 8 22:05:10 2003 @@ -164,7 +164,7 @@ static inline int page_kills_ppro(unsign return 0; } -static inline int page_is_ram(unsigned long pagenr) +int page_is_ram(unsigned long pagenr) { int i; @@ -184,6 +184,12 @@ static inline int page_is_ram(unsigned l return 1; } return 0; +} + +/* To enable modules to check if a page is in RAM */ +int pfn_is_ram(unsigned long pfn) +{ + return (page_is_ram(pfn)); } #if CONFIG_HIGHMEM diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/i386/vmlinux.lds.S 900-mjb3/arch/i386/vmlinux.lds.S --- 000-bk3/arch/i386/vmlinux.lds.S Sat Mar 8 10:16:03 2003 +++ 900-mjb3/arch/i386/vmlinux.lds.S Sat Mar 8 22:03:45 2003 @@ -10,7 +10,7 @@ ENTRY(startup_32) jiffies = jiffies_64; SECTIONS { - . = 0xC0000000 + 0x100000; + . 
= __PAGE_OFFSET + 0x100000; /* read-only */ _text = .; /* Text and read-only data */ .text : { diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/s390/boot/Makefile 900-mjb3/arch/s390/boot/Makefile --- 000-bk3/arch/s390/boot/Makefile Mon Dec 16 21:50:39 2002 +++ 900-mjb3/arch/s390/boot/Makefile Sat Mar 8 22:05:11 2003 @@ -17,4 +17,4 @@ $(obj)/listing: vmlinux FORCE install: $(CONFIGURE) $(obj)/image sh -x $(obj)/install.sh $(KERNELRELEASE) $(obj)/image \ - System.map Kerntypes "$(INSTALL_PATH)" + System.map init/kerntypes.o "$(INSTALL_PATH)" diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/s390/boot/install.sh 900-mjb3/arch/s390/boot/install.sh --- 000-bk3/arch/s390/boot/install.sh Sun Nov 17 20:29:48 2002 +++ 900-mjb3/arch/s390/boot/install.sh Sat Mar 8 22:05:11 2003 @@ -16,7 +16,8 @@ # $1 - kernel version # $2 - kernel image file # $3 - kernel map file -# $4 - default install path (blank if root directory) +# $4 - kernel type file +# $5 - default install path (blank if root directory) # # User may have a custom install script @@ -26,13 +27,22 @@ if [ -x /sbin/installkernel ]; then exec # Default install - same as make zlilo -if [ -f $4/vmlinuz ]; then - mv $4/vmlinuz $4/vmlinuz.old +if [ -f $5/vmlinuz ]; then + mv $5/vmlinuz $5/vmlinuz.old fi -if [ -f $4/System.map ]; then - mv $4/System.map $4/System.old +if [ -f $5/System.map ]; then + mv $5/System.map $5/System.old fi -cat $2 > $4/vmlinuz -cp $3 $4/System.map +if [ -f $5/Kerntypes ]; then + mv $5/Kerntypes $5/Kerntypes.old +fi + +cat $2 > $5/vmlinuz +cp $3 $5/System.map + +# copy the kernel type file if it exists +if [ -f $4 ]; then + cp $4 $5/Kerntypes +fi diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/s390x/boot/Makefile 900-mjb3/arch/s390x/boot/Makefile --- 000-bk3/arch/s390x/boot/Makefile Mon Dec 16 21:50:39 2002 +++ 900-mjb3/arch/s390x/boot/Makefile Sat Mar 8 22:05:11 2003 @@ -18,4 +18,4 @@ $(obj)/listing: vmlinux FORCE install: $(CONFIGURE) $(obj)/image sh -x $(obj)/install.sh $(KERNELRELEASE) $(obj)/image \ - System.map Kerntypes "$(INSTALL_PATH)" + System.map kerntypes.o "$(INSTALL_PATH)" diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/s390x/boot/install.sh 900-mjb3/arch/s390x/boot/install.sh --- 000-bk3/arch/s390x/boot/install.sh Sun Nov 17 20:29:57 2002 +++ 900-mjb3/arch/s390x/boot/install.sh Sat Mar 8 22:05:11 2003 @@ -16,7 +16,8 @@ # $1 - kernel version # $2 - kernel image file # $3 - kernel map file -# $4 - default install path (blank if root directory) +# $4 - kernel type file +# $5 - default install path (blank if root directory) # # User may have a custom install script @@ -26,13 +27,22 @@ if [ -x /sbin/installkernel ]; then exec # Default install - same as make zlilo -if [ -f $4/vmlinuz ]; then - mv $4/vmlinuz $4/vmlinuz.old +if [ -f $5/vmlinuz ]; then + mv $5/vmlinuz $5/vmlinuz.old fi -if [ -f $4/System.map ]; then - mv $4/System.map $4/System.old +if [ -f $5/System.map ]; then + mv $5/System.map $5/System.old fi -cat $2 > $4/vmlinuz -cp $3 $4/System.map +if [ -f $5/Kerntypes ]; then + mv $5/Kerntypes $5/Kerntypes.old +fi + +cat $2 > $5/vmlinuz +cp $3 $5/System.map + +# copy the kernel type file if it exists +if [ -f $4 ]; then + cp $4 $5/Kerntypes +fi diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/sparc64/kernel/rtrap.S 900-mjb3/arch/sparc64/kernel/rtrap.S --- 000-bk3/arch/sparc64/kernel/rtrap.S Sun Nov 17 20:29:45 2002 +++ 900-mjb3/arch/sparc64/kernel/rtrap.S Sat Mar 8 22:04:47 2003 @@ -33,7 +33,7 @@ __handle_softirq: ba,a,pt %xcc, __handle_softirq_continue nop __handle_preemption: - 
call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate ba,pt %xcc, __handle_preemption_continue wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate @@ -48,7 +48,7 @@ __handle_user_windows: be,pt %xcc, 1f nop - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate ldx [%g6 + TI_FLAGS], %l0 @@ -92,7 +92,7 @@ __handle_perfctrs: be,pt %xcc, 1f nop - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate ldx [%g6 + TI_FLAGS], %l0 @@ -271,7 +271,7 @@ to_kernel: brnz %l5, kern_fpucheck sethi %hi(PREEMPT_ACTIVE), %l6 stw %l6, [%g6 + TI_PRE_COUNT] - call schedule + call user_schedule nop ba,pt %xcc, rtrap stw %g0, [%g6 + TI_PRE_COUNT] diff -urpN -X /home/fletch/.diff.exclude 000-bk3/arch/x86_64/kernel/entry.S 900-mjb3/arch/x86_64/kernel/entry.S --- 000-bk3/arch/x86_64/kernel/entry.S Tue Feb 25 23:03:45 2003 +++ 900-mjb3/arch/x86_64/kernel/entry.S Sat Mar 8 22:04:47 2003 @@ -187,7 +187,7 @@ sysret_careful: jnc sysret_signal sti pushq %rdi - call schedule + call user_schedule popq %rdi jmp sysret_check @@ -256,7 +256,7 @@ int_careful: jnc int_very_careful sti pushq %rdi - call schedule + call user_schedule popq %rdi jmp int_with_check @@ -420,7 +420,7 @@ retint_careful: jnc retint_signal sti pushq %rdi - call schedule + call user_schedule popq %rdi GET_THREAD_INFO(%rcx) cli @@ -454,7 +454,7 @@ retint_kernel: jc retint_restore_args movl $PREEMPT_ACTIVE,threadinfo_preempt_count(%rcx) sti - call schedule + call user_schedule cli GET_THREAD_INFO(%rcx) movl $0,threadinfo_preempt_count(%rcx) diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/Makefile 900-mjb3/drivers/Makefile --- 000-bk3/drivers/Makefile Fri Feb 21 23:40:46 2003 +++ 900-mjb3/drivers/Makefile Sat Mar 8 22:05:11 2003 @@ -46,3 +46,4 @@ obj-$(CONFIG_ISDN_BOOL) += isdn/ obj-$(CONFIG_MCA) += mca/ obj-$(CONFIG_EISA) += eisa/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_CRASH_DUMP) += dump/ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/char/Makefile 900-mjb3/drivers/char/Makefile --- 000-bk3/drivers/char/Makefile Wed Mar 5 07:37:00 2003 +++ 900-mjb3/drivers/char/Makefile Sat Mar 8 22:04:47 2003 @@ -25,6 +25,7 @@ obj-$(CONFIG_COMPUTONE) += ip2.o ip2main obj-$(CONFIG_RISCOM8) += riscom8.o obj-$(CONFIG_ISI) += isicom.o obj-$(CONFIG_ESPSERIAL) += esp.o +obj-$(CONFIG_X86_REMOTE_DEBUG) += gdbserial.o obj-$(CONFIG_SYNCLINK) += synclink.o obj-$(CONFIG_SYNCLINKMP) += synclinkmp.o obj-$(CONFIG_N_HDLC) += n_hdlc.o diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/char/gdbserial.c 900-mjb3/drivers/char/gdbserial.c --- 000-bk3/drivers/char/gdbserial.c Wed Dec 31 16:00:00 1969 +++ 900-mjb3/drivers/char/gdbserial.c Sat Mar 8 22:04:47 2003 @@ -0,0 +1,274 @@ +/* + * Serial interface GDB stub + * + * Written (hacked together) by David Grothe (dave@gcom.com) + * + * Modified by Scott Foehner (sfoehner@engr.sgi.com) to allow connect + * on boot-up + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#undef PRNT /* define for debug printing */ + +#define GDB_BUF_SIZE 512 /* power of 2, please */ + +static char gdb_buf[GDB_BUF_SIZE]; +static int gdb_buf_in_inx; +static atomic_t gdb_buf_in_cnt; +static int gdb_buf_out_inx; + +extern void set_debug_traps(void); /* GDB routine */ +extern int gdb_serial_setup(int ttyS, int 
baud, int *port, int *irq);
+extern void shutdown_for_gdb(struct async_struct *info);
+				/* in serial.c */
+
+int gdb_irq;
+int gdb_port;
+int gdb_ttyS = 1;		/* Default: ttyS1 */
+int gdb_baud = 38400;
+int gdb_enter = 0;		/* Default: do not do gdb_hook on boot */
+int gdb_initialized = 0;
+
+static int initialized = -1;
+
+/*
+ * Get a byte from the hardware data buffer and return it
+ */
+static int
+read_data_bfr(void)
+{
+	if (inb(gdb_port + UART_LSR) & UART_LSR_DR)
+		return (inb(gdb_port + UART_RX));
+
+	return (-1);
+
+}				/* read_data_bfr */
+
+/*
+ * Get a char if available, return -1 if nothing available.
+ * Empty the receive buffer first, then look at the interface hardware.
+ */
+static int
+read_char(void)
+{
+	if (atomic_read(&gdb_buf_in_cnt) != 0) {	/* intr routine has q'd chars */
+		int chr;
+
+		chr = gdb_buf[gdb_buf_out_inx++];
+		gdb_buf_out_inx &= (GDB_BUF_SIZE - 1);
+		atomic_dec(&gdb_buf_in_cnt);
+		return (chr);
+	}
+
+	return (read_data_bfr());	/* read from hardware */
+
+}				/* read_char */
+
+/*
+ * Wait until the interface can accept a char, then write it.
+ */
+static void
+write_char(int chr)
+{
+	while (!(inb(gdb_port + UART_LSR) & UART_LSR_THRE)) ;
+
+	outb(chr, gdb_port + UART_TX);
+
+}				/* write_char */
+
+/*
+ * This is the receiver interrupt routine for the GDB stub.
+ * It will receive a limited number of characters of input
+ * from the gdb host machine and save them up in a buffer.
+ *
+ * When the gdb stub routine getDebugChar() is called it
+ * draws characters out of the buffer until it is empty and
+ * then reads directly from the serial port.
+ *
+ * We do not attempt to write chars from the interrupt routine
+ * since the stubs do all of that via putDebugChar() which
+ * writes one byte after waiting for the interface to become
+ * ready.
+ *
+ * The debug stubs like to run with interrupts disabled since,
+ * after all, they run as a consequence of a breakpoint in
+ * the kernel.
+ *
+ * Perhaps someone who knows more about the tty driver than I
+ * care to learn can make this work for any low level serial
+ * driver.
+ */
+static void
+gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+	int chr;
+	int iir;
+
+	do {
+		chr = read_data_bfr();
+		iir = inb(gdb_port + UART_IIR);
+#ifdef PRNT
+		printk("gdb_interrupt: chr=%02x '%c' after read iir=%02x\n",
+		       chr, chr > ' ' && chr < 0x7F ? chr : ' ', iir);
+#endif
+		if (chr < 0)
+			continue;
+
+		if (chr == 3) {	/* Ctrl-C means remote interrupt */
+			breakpoint();
+			continue;
+		}
+
+		if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) {	/* buffer overflow, clear it */
+			gdb_buf_in_inx = 0;
+			atomic_set(&gdb_buf_in_cnt, 0);
+			gdb_buf_out_inx = 0;
+			break;
+		}
+
+		gdb_buf[gdb_buf_in_inx++] = chr;
+		gdb_buf_in_inx &= (GDB_BUF_SIZE - 1);
+		atomic_inc(&gdb_buf_in_cnt);
+	}
+	while (iir & UART_IIR_RDI);
+
+}				/* gdb_interrupt */
+
+/*
+ * Just a NULL routine for testing.
+ */
+void
+gdb_null(void)
+{
+}				/* gdb_null */
+
+extern int serial8250_init(void);
+
+int
+gdb_hook(void)
+{
+	int retval;
+
+#ifdef CONFIG_SMP
+	if (NR_CPUS > KGDB_MAX_NO_CPUS) {
+		printk
+		    ("kgdb: too many cpus. "
Cannot enable debugger with more than 8 cpus\n"); + return (-1); + } +#endif + + /* + * Call first time just to get the ser ptr + */ + + serial8250_init(); + + if (gdb_serial_setup(gdb_ttyS, gdb_baud, &gdb_port, &gdb_irq)) { + printk("gdb_serial_setup() error"); + return (-1); + } + + retval = request_irq(gdb_irq, + gdb_interrupt, SA_INTERRUPT, "GDB-stub", NULL); + if (retval == 0) + initialized = 1; + else { + initialized = 0; + printk("gdb_hook: request_irq(irq=%d) failed: %d\n", gdb_irq, + retval); + } + + /* + * Call GDB routine to setup the exception vectors for the debugger + */ + set_debug_traps(); + + /* + * Call the breakpoint() routine in GDB to start the debugging + * session. + */ + printk("Waiting for connection from remote gdb... "); + breakpoint(); + gdb_null(); + + printk("Connected.\n"); + + gdb_initialized = 1; + return (0); + +} /* gdb_hook_interrupt2 */ + +/* + * getDebugChar + * + * This is a GDB stub routine. It waits for a character from the + * serial interface and then returns it. If there is no serial + * interface connection then it returns a bogus value which will + * almost certainly cause the system to hang. + */ +int +getDebugChar(void) +{ + volatile int chr; + +#ifdef PRNT + printk("getDebugChar: "); +#endif + + while ((chr = read_char()) < 0) + touch_nmi_watchdog(); + +#ifdef PRNT + printk("%c\n", chr > ' ' && chr < 0x7F ? chr : ' '); +#endif + return (chr); + +} /* getDebugChar */ + +/* + * putDebugChar + * + * This is a GDB stub routine. It waits until the interface is ready + * to transmit a char and then sends it. If there is no serial + * interface connection then it simply returns to its caller, having + * pretended to send the char. + */ +void +putDebugChar(int chr) +{ +#ifdef PRNT + printk("putDebugChar: chr=%02x '%c'\n", chr, + chr > ' ' && chr < 0x7F ? 
chr : ' '); +#endif + + write_char(chr); /* this routine will wait */ + +} /* putDebugChar */ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/char/sysrq.c 900-mjb3/drivers/char/sysrq.c --- 000-bk3/drivers/char/sysrq.c Tue Feb 25 23:03:46 2003 +++ 900-mjb3/drivers/char/sysrq.c Sat Mar 8 22:04:47 2003 @@ -107,6 +107,18 @@ static struct sysrq_key_op sysrq_reboot_ .action_msg = "Resetting", }; +#ifdef CONFIG_X86_REMOTE_DEBUG +static void sysrq_handle_gdb(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) { + int gdb_hook(void); + gdb_hook(); +} +static struct sysrq_key_op sysrq_gdb_op = { + handler: sysrq_handle_gdb, + help_msg: "Gdb", + action_msg: "Entering debugger", +}; +#endif /* SYNC SYSRQ HANDLERS BLOCK */ @@ -352,7 +364,11 @@ static struct sysrq_key_op *sysrq_key_ta /* d */ NULL, /* e */ &sysrq_term_op, /* f */ NULL, +#ifdef CONFIG_X86_REMOTE_DEBUG +/* g */ &sysrq_gdb_op, +#else /* CONFIG_X86_REMOTE_DEBUG */ /* g */ NULL, +#endif /* CONFIG_X86_REMOTE_DEBUG */ /* h */ NULL, /* i */ &sysrq_kill_op, /* j */ NULL, diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/char/tty_io.c 900-mjb3/drivers/char/tty_io.c --- 000-bk3/drivers/char/tty_io.c Sat Mar 8 10:16:04 2003 +++ 900-mjb3/drivers/char/tty_io.c Wed Mar 12 19:46:55 2003 @@ -91,6 +91,9 @@ #include #include #include +#ifdef CONFIG_GDB_CONSOLE +#include +#endif #include #include @@ -1034,7 +1037,9 @@ static void release_mem(struct tty_struc } o_tty->magic = 0; (*o_tty->driver.refcount)--; + file_list_lock(); list_del(&o_tty->tty_files); + file_list_unlock(); free_tty_struct(o_tty); } @@ -1046,7 +1051,9 @@ static void release_mem(struct tty_struc } tty->magic = 0; (*tty->driver.refcount)--; + file_list_lock(); list_del(&tty->tty_files); + file_list_unlock(); module_put(tty->driver.owner); free_tty_struct(tty); } @@ -2219,6 +2226,9 @@ void __init console_init(void) (*call)(); call++; } +#ifdef CONFIG_GDB_CONSOLE + gdb_console_init(); +#endif } static struct tty_driver dev_tty_driver, dev_syscons_driver; diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/dump/Makefile 900-mjb3/drivers/dump/Makefile --- 000-bk3/drivers/dump/Makefile Wed Dec 31 16:00:00 1969 +++ 900-mjb3/drivers/dump/Makefile Sat Mar 8 22:05:11 2003 @@ -0,0 +1,14 @@ +# +# Makefile for the dump device drivers. +# +export-objs := dump_setup.o + +dump-y := dump_setup.o dump_fmt.o dump_filters.o dump_scheme.o dump_execute.o +dump-$(CONFIG_X86) += dump_i386.o +dump-objs += $(dump-y) + +obj-$(CONFIG_CRASH_DUMP) += dump.o +obj-$(CONFIG_CRASH_DUMP_BLOCKDEV) += dump_blockdev.o +obj-$(CONFIG_CRASH_DUMP_NETDEV) += dump_netdev.o +obj-$(CONFIG_CRASH_DUMP_COMPRESS_RLE) += dump_rle.o +obj-$(CONFIG_CRASH_DUMP_COMPRESS_GZIP) += dump_gzip.o diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/dump/dump_blockdev.c 900-mjb3/drivers/dump/dump_blockdev.c --- 000-bk3/drivers/dump/dump_blockdev.c Wed Dec 31 16:00:00 1969 +++ 900-mjb3/drivers/dump/dump_blockdev.c Sat Mar 8 22:05:11 2003 @@ -0,0 +1,447 @@ +/* + * Implements the dump driver interface for saving a dump to + * a block device through the kernel's generic low level block i/o + * routines. + * + * Started: June 2002 - Mohamed Abbas + * Moved original lkcd kiobuf dump i/o code from dump_base.c + * to use generic dump device interfaces + * + * Sept 2002 - Bharata B. 
Rao + * Convert dump i/o to directly use bio instead of kiobuf for 2.5 + * + * Oct 2002 - Suparna Bhattacharya + * Rework to new dumpdev.h structures, implement open/close/ + * silence, misc fixes (blocknr removal, bio_add_page usage) + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "dump_methods.h" + +extern void *dump_page_buf; + +/* The end_io callback for dump i/o completion */ +static int +dump_bio_end_io(struct bio *bio, unsigned int bytes_done, int error) +{ + struct dump_blockdev *dump_bdev; + + if (bio->bi_size) { + /* some bytes still left to transfer */ + return 1; /* not complete */ + } + + dump_bdev = (struct dump_blockdev *)bio->bi_private; + if (error) { + printk("IO error while writing the dump, aborting\n"); + } + + dump_bdev->err = error; + + /* no wakeup needed, since caller polls for completion */ + return 0; +} + +/* Check if the dump bio is already mapped to the specified buffer */ +static int +dump_block_map_valid(struct dump_blockdev *dev, struct page *page, + int len) +{ + struct bio *bio = dev->bio; + + if (!bio->bi_vcnt) + return 0; /* first time, not mapped */ + + + if ((bio_page(bio) != page) || (len != bio->bi_vcnt << PAGE_SHIFT)) + return 0; /* buffer not mapped */ + + /* quick check to decide if we need to redo bio_add_page */ + if (bdev_get_queue(bio->bi_bdev)->merge_bvec_fn) + return 0; /* device may have other restrictions */ + + return 1; /* already mapped */ +} + +/* + * Set up the dump bio for i/o from the specified buffer + * Return value indicates whether the full buffer could be mapped or not + */ +static int +dump_block_map(struct dump_blockdev *dev, void *buf, int len) +{ + struct page *page = virt_to_page(buf); + struct bio *bio = dev->bio; + unsigned long bsize = 0; + + bio->bi_bdev = dev->bdev; + bio->bi_sector = (dev->start_offset + dev->ddev.curr_offset) >> 9; + bio->bi_idx = 0; /* reset index to the beginning */ + + if (dump_block_map_valid(dev, page, len)) { + /* already mapped and usable rightaway */ + bio->bi_size = len; /* reset size to the whole bio */ + } else { + /* need to map the bio */ + bio->bi_size = 0; + bio->bi_vcnt = 0; + bsize = bdev_hardsect_size(bio->bi_bdev); + + /* first a few sanity checks */ + if (len < bsize) { + printk("map: len less than hardsect size \n"); + return -EINVAL; + } + + if ((unsigned long)buf & bsize) { + printk("map: not aligned \n"); + return -EINVAL; + } + + /* assume contig. page aligned low mem buffer( no vmalloc) */ + if ((page_address(page) != buf) || (len & (PAGE_SIZE - 1))) { + printk("map: invalid buffer alignment!\n"); + return -EINVAL; + } + /* finally we can go ahead and map it */ + while (bio->bi_size < len) + if (bio_add_page(bio, page++, PAGE_SIZE, 0) == 0) { + break; + } + + bio->bi_end_io = dump_bio_end_io; + bio->bi_private = dev; + } + + if (bio->bi_size != len) { + printk("map: bio size = %d not enough for len = %d!\n", + bio->bi_size, len); + return -E2BIG; + } + return 0; +} + +static void +dump_free_bio(struct bio *bio) +{ + if (bio) + kfree(bio->bi_io_vec); + kfree(bio); +} + +/* + * Prepares the dump device so we can take a dump later. + * The caller is expected to have filled up the kdev_id field in the + * block dump dev structure. 
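+ *
+ * Everything needed at write time is preallocated here, because once
+ * the system has crashed no further memory allocation can be trusted.
+ * The bio_vec array is sized to cover one full dump buffer, i.e.
+ * DUMP_BUFFER_SIZE >> PAGE_SHIFT entries.  As a worked example (the
+ * buffer size here is illustrative, not a value this patch defines):
+ * a 64K dump buffer with 4K pages needs 64K / 4K = 16 bio_vecs, each
+ * mapped to one page of the buffer via bio_add_page().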
+ * + * At dump time when dump_block_write() is invoked it will be too + * late to recover, so as far as possible make sure obvious errors + * get caught right here and reported back to the caller. + */ +static int +dump_block_open(struct dump_dev *dev, unsigned long arg) +{ + struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); + struct block_device *bdev; + int retval = 0; + struct bio_vec *bvec; + + /* make sure this is a valid block device */ + if (!arg) { + retval = -EINVAL; + goto err; + } + + /* get a corresponding block_dev struct for this */ + bdev = bdget((dev_t)arg); + if (!bdev) { + retval = -ENODEV; + goto err; + } + + /* get the block device opened */ + if ((retval = blkdev_get(bdev, O_RDWR | O_LARGEFILE, 0, BDEV_RAW))) { + goto err1; + } + + if ((dump_bdev->bio = kmalloc(sizeof(struct bio), GFP_KERNEL)) + == NULL) { + printk("Cannot allocate bio\n"); + retval = -ENOMEM; + goto err2; + } + + bio_init(dump_bdev->bio); + + if ((bvec = kmalloc(sizeof(struct bio_vec) * + (DUMP_BUFFER_SIZE >> PAGE_SHIFT), GFP_KERNEL)) == NULL) { + retval = -ENOMEM; + goto err3; + } + + /* assign the new dump dev structure */ + dump_bdev->kdev_id = to_kdev_t((dev_t)arg); + dump_bdev->bdev = bdev; + + /* make a note of the limit */ + dump_bdev->limit = bdev->bd_inode->i_size; + + /* now make sure we can map the dump buffer */ + dump_bdev->bio->bi_io_vec = bvec; + dump_bdev->bio->bi_max_vecs = DUMP_BUFFER_SIZE >> PAGE_SHIFT; + + retval = dump_block_map(dump_bdev, dump_config.dumper->dump_buf, + DUMP_BUFFER_SIZE); + + if (retval) { + printk("open: dump_block_map failed, ret %d\n", retval); + goto err3; + } + + printk("Block device (%d,%d) successfully configured for dumping\n", + major(dump_bdev->kdev_id), + minor(dump_bdev->kdev_id)); + + + /* after opening the block device, return */ + return retval; + +err3: dump_free_bio(dump_bdev->bio); + dump_bdev->bio = NULL; +err2: if (bdev) blkdev_put(bdev, BDEV_RAW); + goto err; +err1: if (bdev) bdput(bdev); + dump_bdev->bdev = NULL; +err: return retval; +} + +/* + * Close the dump device and release associated resources + * Invoked when unconfiguring the dump device. + */ +static int +dump_block_release(struct dump_dev *dev) +{ + struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); + + /* release earlier bdev if present */ + if (dump_bdev->bdev) { + blkdev_put(dump_bdev->bdev, BDEV_RAW); + dump_bdev->bdev = NULL; + } + + dump_free_bio(dump_bdev->bio); + dump_bdev->bio = NULL; + + return 0; +} + + +/* + * Prepare the dump device for use (silence any ongoing activity + * and quiesce state) when the system crashes. + */ +static int +dump_block_silence(struct dump_dev *dev) +{ + struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); + + /* For now we assume we have the device to ourselves */ + /* Just a quick sanity check */ + if (!blk_queue_empty(bdev_get_queue(dump_bdev->bdev))) { + /* i/o in flight - safer to quit */ + return -EBUSY; + } + + /* + * Move to a softer level of silencing where no spin_lock_irqs + * are held on other cpus + */ + dump_silence_level = DUMP_SOFT_SPIN_CPUS; + + __dump_irq_enable(); + + printk("Dumping to block device (%d,%d) on CPU %d ...\n", + major(dump_bdev->kdev_id), minor(dump_bdev->kdev_id), + smp_processor_id()); + + return 0; +} + +/* + * Invoked when dumping is done. This is the time to put things back + * (i.e. undo the effects of dump_block_silence) so the device is + * available for normal use. 
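+ *
+ * For reference, the order in which the generic layer drives a dump
+ * target through these operations, configure time through crash time
+ * (a sketch only -- error handling elided, the wrapper names differ):
+ *
+ *	dev->ops->open(dev, arg);	configure: validate, premap buffer
+ *	dev->ops->silence(dev);		crash: quiesce, no sleeping beyond here
+ *	dev->ops->seek(dev, 0);
+ *	dev->ops->write(dev, buf, len);	repeated, one dump buffer at a time
+ *	dev->ops->resume(dev);		undo the silence
+ *	dev->ops->release(dev);		unconfigure: free bio, put bdev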
+ */
+static int
+dump_block_resume(struct dump_dev *dev)
+{
+	__dump_irq_restore();
+	return 0;
+}
+
+
+/*
+ * Seek to the specified offset in the dump device.
+ * Makes sure this is a valid offset, otherwise returns an error.
+ */
+static int
+dump_block_seek(struct dump_dev *dev, loff_t off)
+{
+	struct dump_blockdev *dump_bdev = DUMP_BDEV(dev);
+	loff_t offset = off + dump_bdev->start_offset;
+
+	if (offset & (PAGE_SIZE - 1)) {
+		printk("seek: non-page aligned\n");
+		return -EINVAL;
+	}
+
+	if (offset & (bdev_hardsect_size(dump_bdev->bdev) - 1)) {
+		printk("seek: not sector aligned\n");
+		return -EINVAL;
+	}
+
+	if (offset > dump_bdev->limit) {
+		printk("seek: not enough space left on device!\n");
+		return -ENOSPC;
+	}
+	dev->curr_offset = off;
+	return 0;
+}
+
+/*
+ * Write out a buffer after checking the device limitations,
+ * sector sizes, etc. Assumes the buffer is in directly mapped
+ * kernel address space (not vmalloc'ed).
+ *
+ * Returns: number of bytes written or -ERRNO.
+ */
+static int
+dump_block_write(struct dump_dev *dev, void *buf,
+	unsigned long len)
+{
+	struct dump_blockdev *dump_bdev = DUMP_BDEV(dev);
+	loff_t offset = dev->curr_offset + dump_bdev->start_offset;
+	int retval = -ENOSPC;
+
+	if (offset >= dump_bdev->limit) {
+		printk("write: not enough space left on device!\n");
+		goto out;
+	}
+
+	/* don't write more blocks than our max limit */
+	if (offset + len > dump_bdev->limit)
+		len = dump_bdev->limit - offset;
+
+	retval = dump_block_map(dump_bdev, buf, len);
+	if (retval) {
+		printk("write: dump_block_map failed! err %d\n", retval);
+		goto out;
+	}
+
+	/*
+	 * Write out the data to disk.
+	 * Assumes the entire buffer mapped to a single bio, which we can
+	 * submit and wait for io completion. In the future, may consider
+	 * increasing the dump buffer size and submitting multiple bios
+	 * for better throughput.
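+	 *
+	 * A minimal sketch of the submit-and-poll pattern this relies
+	 * on, as a hypothetical caller would use the write/ready pair
+	 * (the real loop lives in the generic write-out code, which is
+	 * not part of this file):
+	 *
+	 *	err = dev->ops->write(dev, buf, len);
+	 *	if (err < 0)
+	 *		return err;
+	 *	while ((err = dev->ops->ready(dev, buf)) == -EAGAIN)
+	 *		cpu_relax();
+	 *
+	 * where ready() below keeps unplugging the queue and polling
+	 * dump_bdev->err, which dump_bio_end_io() fills in at completion.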
+ */ + dump_bdev->err = -EAGAIN; + submit_bio(WRITE, dump_bdev->bio); + + dump_bdev->ddev.curr_offset += len; + retval = len; + out: + return retval; +} + +/* + * Name: dump_block_ready() + * Func: check if the last dump i/o is over and ready for next request + */ +static int +dump_block_ready(struct dump_dev *dev, void *buf) +{ + struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); + request_queue_t *q = bdev_get_queue(dump_bdev->bio->bi_bdev); + + /* check for io completion */ + if (dump_bdev->err == -EAGAIN) { + q->unplug_fn(q); + return -EAGAIN; + } + + if (dump_bdev->err) { + printk("dump i/o err\n"); + return dump_bdev->err; + } + + return 0; +} + + +struct dump_dev_ops dump_blockdev_ops = { + .open = dump_block_open, + .release = dump_block_release, + .silence = dump_block_silence, + .resume = dump_block_resume, + .seek = dump_block_seek, + .write = dump_block_write, + /* .read not implemented */ + .ready = dump_block_ready +}; + +static struct dump_blockdev default_dump_blockdev = { + .ddev = {.type_name = "blockdev", .ops = &dump_blockdev_ops, + .curr_offset = 0}, + /* + * leave enough room for the longest swap header possibly written + * written by mkswap (likely the largest page size supported by + * the arch + */ + .start_offset = DUMP_HEADER_OFFSET, + .err = 0 + /* assume the rest of the fields are zeroed by default */ +}; + +struct dump_blockdev *dump_blockdev = &default_dump_blockdev; + +static int __init +dump_blockdev_init(void) +{ + if (dump_register_device(&dump_blockdev->ddev) < 0) { + printk("block device driver registration failed\n"); + return -1; + } + + printk("block device driver for LKCD registered\n"); + return 0; +} + +static void __exit +dump_blockdev_cleanup(void) +{ + dump_unregister_device(&dump_blockdev->ddev); + printk("block device driver for LKCD unregistered\n"); +} + +MODULE_AUTHOR("LKCD Development Team "); +MODULE_DESCRIPTION("Block Dump Driver for Linux Kernel Crash Dump (LKCD)"); +MODULE_LICENSE("GPL"); + +module_init(dump_blockdev_init); +module_exit(dump_blockdev_cleanup); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/dump/dump_execute.c 900-mjb3/drivers/dump/dump_execute.c --- 000-bk3/drivers/dump/dump_execute.c Wed Dec 31 16:00:00 1969 +++ 900-mjb3/drivers/dump/dump_execute.c Sat Mar 8 22:05:11 2003 @@ -0,0 +1,124 @@ +/* + * The file has the common/generic dump execution code + * + * Started: Oct 2002 - Suparna Bhattacharya + * Split and rewrote high level dump execute code to make use + * of dump method interfaces. + * + * Derived from original code in dump_base.c created by + * Matt Robinson ) + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * Assumes dumper and dump config settings are in place + * (invokes corresponding dumper specific routines as applicable) + * + * This code is released under version 2 of the GNU GPL. 
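+ *
+ * In outline, the code below composes as follows (sketch):
+ *
+ *	dump_generic_execute(panic_str, regs)
+ *	    dump_configure_header(panic_str, regs)
+ *	    notifier_call_chain(&dump_notifier_list, DUMP_BEGIN, ...)
+ *	    dump_execute_savedump()
+ *	        dump_begin()		silence device, write dump header
+ *	        dump_sequencer()	iterate passes, save selected data
+ *	        dump_complete()		end marker, final header, resume
+ *	    notifier_call_chain(&dump_notifier_list, DUMP_END, ...)
+ *
+ * Other kernel code can watch for these events; an illustrative hook:
+ *
+ *	static int my_dump_event(struct notifier_block *nb,
+ *				 unsigned long event, void *data)
+ *	{
+ *		return NOTIFY_DONE;	react to DUMP_BEGIN / DUMP_END
+ *	}
+ *	static struct notifier_block my_dump_nb = {
+ *		.notifier_call = my_dump_event,
+ *	};
+ *
+ * registered with notifier_chain_register(&dump_notifier_list, &my_dump_nb).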
+ */ +#include +#include +#include +#include "dump_methods.h" + +extern int dump_device; + +struct notifier_block *dump_notifier_list; /* dump started/ended callback */ + +/* Dump progress indicator */ +void +dump_speedo(int i) +{ + static const char twiddle[4] = { '|', '\\', '-', '/' }; + printk("%c\b", twiddle[i&3]); +} + +/* Make the device ready and write out the header */ +int dump_begin(void) +{ + int err = 0; + + /* dump_dev = dump_config.dumper->dev; */ + dumper_reset(); + if ((err = dump_dev_silence())) { + /* quiesce failed, can't risk continuing */ + /* Todo/Future: switch to alternate dump scheme if possible */ + printk("dump silence dev failed ! error %d\n", err); + return err; + } + + pr_debug("Writing dump header\n"); + if ((err = dump_update_header())) { + printk("dump update header failed ! error %d\n", err); + dump_dev_resume(); + return err; + } + + dump_config.dumper->curr_offset = DUMP_BUFFER_SIZE; + + return 0; +} + +/* + * Write the dump terminator, a final header update and let go of + * exclusive use of the device for dump. + */ +int dump_complete(void) +{ + int ret = 0; + + if (dump_config.level != DUMP_LEVEL_HEADER) { + if ((ret = dump_update_end_marker())) { + printk("dump update end marker error %d\n", ret); + } + if ((ret = dump_update_header())) { + printk("dump update header error %d\n", ret); + } + } + ret = dump_dev_resume(); + + return ret; +} + +/* Saves all dump data */ +int dump_execute_savedump(void) +{ + int ret = 0; + + if ((ret = dump_begin())) { + return ret; + } + + if (dump_config.level != DUMP_LEVEL_HEADER) { + ret = dump_sequencer(); + } + dump_complete(); + + return ret; +} + +/* Does all the real work: Capture and save state */ +int dump_generic_execute(const char *panic_str, const struct pt_regs *regs) +{ + int ret = 0; + + if ((ret = dump_configure_header(panic_str, regs))) { + printk("dump config header failed ! error %d\n", ret); + return ret; + } + + /* tell interested parties that a dump is about to start */ + notifier_call_chain(&dump_notifier_list, DUMP_BEGIN, &dump_device); + + if (dump_config.level != DUMP_LEVEL_NONE) + ret = dump_execute_savedump(); + + pr_debug("dumped %ld blocks of %d bytes each\n", + dump_config.dumper->count, DUMP_BUFFER_SIZE); + + /* tell interested parties that a dump has completed */ + notifier_call_chain(&dump_notifier_list, DUMP_END, &dump_device); + + return ret; +} diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/dump/dump_filters.c 900-mjb3/drivers/dump/dump_filters.c --- 000-bk3/drivers/dump/dump_filters.c Wed Dec 31 16:00:00 1969 +++ 900-mjb3/drivers/dump/dump_filters.c Sat Mar 8 22:05:11 2003 @@ -0,0 +1,145 @@ +/* + * Default filters to select data to dump for various passes. + * + * Started: Oct 2002 - Suparna Bhattacharya + * Split and rewrote default dump selection logic to generic dump + * method interfaces + * Derived from a portion of dump_base.c created by + * Matt Robinson ) + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * Used during single-stage dumping and during stage 1 of the 2-stage scheme + * (Stage 2 of the 2-stage scheme uses the fully transparent filters + * i.e. passthru filters in dump_overlay.c) + * + * Future: Custom selective dump may involve a different set of filters. + * + * This code is released under version 2 of the GNU GPL. 
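+ *
+ * For comparison, a fully transparent (passthru) selector of the kind
+ * mentioned above for stage 2 is trivial; an illustrative version:
+ *
+ *	static int dump_filter_passthru(int pass, unsigned long loc,
+ *					unsigned long sz)
+ *	{
+ *		return 1;	select every range in this pass
+ *	}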
+ */ + +#include +#include +#include +#include +#include +#include "dump_methods.h" + + +/* Copied from mm/bootmem.c - FIXME */ +/* return the number of _pages_ that will be allocated for the boot bitmap */ +unsigned long dump_calc_bootmap_pages (void) +{ + unsigned long mapsize; + unsigned long pages = num_physpages; + + mapsize = (pages+7)/8; + mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; + mapsize >>= PAGE_SHIFT; + + return mapsize; +} + + +#define DUMP_PFN_SAFETY_MARGIN 1024 /* 4 MB */ +/* temporary */ +extern unsigned long min_low_pfn; + +/* to track all used (compound + zero order) pages */ +#define PageInuse(p) (PageCompound(p) || page_count(p)) + +int dump_low_page(struct page *p) +{ + return page_to_pfn(p) < min_low_pfn + dump_calc_bootmap_pages() + + 1 + DUMP_PFN_SAFETY_MARGIN; +} + +static inline int kernel_page(struct page *p) +{ + /* FIXME: Need to exclude hugetlb pages. Clue: reserved but inuse */ + return PageReserved(p) || (!PageLRU(p) && PageInuse(p)); +} + +static inline int user_page(struct page *p) +{ + return PageInuse(p) && (!PageReserved(p) && PageLRU(p)); +} + +static inline int unreferenced_page(struct page *p) +{ + return !PageInuse(p) && !PageReserved(p); +} + + +/* loc marks the beginning of a range of pages */ +int dump_filter_kernpages(int pass, unsigned long loc, unsigned long sz) +{ + struct page *page = (struct page *)loc; + /* if any of the pages is a kernel page, select this set */ + while (sz) { + if (dump_low_page(page) || kernel_page(page)) + return 1; + sz -= PAGE_SIZE; + page++; + } + return 0; +} + + +/* loc marks the beginning of a range of pages */ +int dump_filter_userpages(int pass, unsigned long loc, unsigned long sz) +{ + struct page *page = (struct page *)loc; + int ret = 0; + /* select if the set has any user page, and no kernel pages */ + while (sz) { + if (user_page(page) && !dump_low_page(page)) { + ret = 1; + } else if (kernel_page(page) || dump_low_page(page)) { + return 0; + } + page++; + sz -= PAGE_SIZE; + } + return ret; +} + + + +/* loc marks the beginning of a range of pages */ +int dump_filter_unusedpages(int pass, unsigned long loc, unsigned long sz) +{ + struct page *page = (struct page *)loc; + + /* select if the set does not have any used pages */ + while (sz) { + if (!unreferenced_page(page) || dump_low_page(page)) { + return 0; + } + page++; + sz -= PAGE_SIZE; + } + return 1; +} + +/* dummy: last (non-existent) pass */ +int dump_filter_none(int pass, unsigned long loc, unsigned long sz) +{ + return 0; +} + +/* TBD: resolve level bitmask ? */ +struct dump_data_filter dump_filter_table[] = { + { .name = "kern", .selector = dump_filter_kernpages, + .level_mask = DUMP_MASK_KERN}, + { .name = "user", .selector = dump_filter_userpages, + .level_mask = DUMP_MASK_USED}, + { .name = "unused", .selector = dump_filter_unusedpages, + .level_mask = DUMP_MASK_UNUSED}, + { .name = "none", .selector = dump_filter_none, + .level_mask = DUMP_MASK_REST}, + { .name = "", .selector = NULL, .level_mask = 0} +}; + diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/dump/dump_fmt.c 900-mjb3/drivers/dump/dump_fmt.c --- 000-bk3/drivers/dump/dump_fmt.c Wed Dec 31 16:00:00 1969 +++ 900-mjb3/drivers/dump/dump_fmt.c Sat Mar 8 22:05:11 2003 @@ -0,0 +1,395 @@ +/* + * Implements the routines which handle the format specific + * aspects of dump for the default dump format. 
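+ *
+ * Alternate formats plug into the same five operations; a skeleton
+ * (all names and stub bodies here are illustrative):
+ *
+ *	static struct dump_fmt_ops dump_fmt_raw_ops = {
+ *		.configure_header  = rawfmt_configure_header,
+ *		.update_header     = rawfmt_update_header,
+ *		.save_context      = rawfmt_save_context,
+ *		.add_data          = rawfmt_add_data,
+ *		.update_end_marker = rawfmt_update_end_marker,
+ *	};
+ *
+ *	static struct dump_fmt dump_fmt_raw = {
+ *		.name = "raw",
+ *		.ops  = &dump_fmt_raw_ops,
+ *	};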
+ * + * Used in single stage dumping and stage 1 of soft-boot based dumping + * Saves data in LKCD (lcrash) format + * + * Previously a part of dump_base.c + * + * Started: Oct 2002 - Suparna Bhattacharya + * Split off and reshuffled LKCD dump format code around generic + * dump method interfaces. + * + * Derived from original code created by + * Matt Robinson ) + * + * Contributions from SGI, IBM, HP, MCL, and others. + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2000 - 2002 TurboLinux, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "dump_methods.h" + +/* + * SYSTEM DUMP LAYOUT + * + * System dumps are currently the combination of a dump header and a set + * of data pages which contain the system memory. The layout of the dump + * (for full dumps) is as follows: + * + * +-----------------------------+ + * | generic dump header | + * +-----------------------------+ + * | architecture dump header | + * +-----------------------------+ + * | page header | + * +-----------------------------+ + * | page data | + * +-----------------------------+ + * | page header | + * +-----------------------------+ + * | page data | + * +-----------------------------+ + * | | | + * | | | + * | | | + * | | | + * | V | + * +-----------------------------+ + * | PAGE_END header | + * +-----------------------------+ + * + * There are two dump headers, the first which is architecture + * independent, and the other which is architecture dependent. This + * allows different architectures to dump different data structures + * which are specific to their chipset, CPU, etc. + * + * After the dump headers come a succession of dump page headers along + * with dump pages. The page header contains information about the page + * size, any flags associated with the page (whether it's compressed or + * not), and the address of the page. After the page header is the page + * data, which is either compressed (or not). Each page of data is + * dumped in succession, until the final dump header (PAGE_END) is + * placed at the end of the dump, assuming the dump device isn't out + * of space. + * + * This mechanism allows for multiple compression types, different + * types of data structures, different page ordering, etc., etc., etc. + * It's a very straightforward mechanism for dumping system memory. 
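+ *
+ * A reader can walk this layout with a loop of the following shape
+ * (sketch only: read_at() is a hypothetical helper, and the page
+ * data is assumed to start one full dump buffer past the dump
+ * header offset, since the headers are written out as one buffer):
+ *
+ *	loff_t off = DUMP_HEADER_OFFSET + DUMP_BUFFER_SIZE;
+ *	struct __dump_page dp;
+ *
+ *	for (;;) {
+ *		read_at(fd, off, &dp, sizeof(dp));
+ *		if (dp.dp_flags & DUMP_DH_END)
+ *			break;
+ *		off += sizeof(dp) + dp.dp_size;
+ *	}
+ *
+ * where dp_size is the (possibly compressed) length of the page data
+ * that follows each page header.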
+ */ + +struct __dump_header dump_header; /* the primary dump header */ +struct __dump_header_asm dump_header_asm; /* the arch-specific dump header */ + +/* + * Set up common header fields (mainly the arch indep section) + * Per-cpu state is handled by lcrash_save_context + */ +static int lcrash_init_dump_header(const char *panic_str) +{ + struct timeval dh_time; + /* make sure the dump header isn't TOO big */ + if ((sizeof(struct __dump_header) + + sizeof(struct __dump_header_asm)) > DUMP_BUFFER_SIZE) { + printk("lcrash_init_header(): combined " + "headers larger than DUMP_BUFFER_SIZE!\n"); + return -E2BIG; + } + + /* initialize the dump headers to zero */ + memset(&dump_header, 0, sizeof(dump_header)); + memset(&dump_header_asm, 0, sizeof(dump_header_asm)); + + /* configure dump header values */ + dump_header.dh_magic_number = DUMP_MAGIC_NUMBER; + dump_header.dh_version = DUMP_VERSION_NUMBER; + dump_header.dh_memory_start = PAGE_OFFSET; + dump_header.dh_memory_end = DUMP_MAGIC_NUMBER; + dump_header.dh_header_size = sizeof(struct __dump_header); + dump_header.dh_page_size = PAGE_SIZE; + dump_header.dh_dump_level = dump_config.level; + dump_header.dh_current_task = (unsigned long) current; + dump_header.dh_dump_compress = dump_config.dumper->compress-> + compress_type; + dump_header.dh_dump_flags = dump_config.flags; + dump_header.dh_dump_device = dump_config.dumper->dev->device_id; + +#if DUMP_DEBUG >= 6 + dump_header.dh_num_bytes = 0; +#endif + dump_header.dh_num_dump_pages = 0; + do_gettimeofday(&dh_time); + dump_header.dh_time.tv_sec = dh_time.tv_sec; + dump_header.dh_time.tv_usec = dh_time.tv_usec; + + memcpy((void *)&(dump_header.dh_utsname_sysname), + (const void *)&(system_utsname.sysname), __NEW_UTS_LEN + 1); + memcpy((void *)&(dump_header.dh_utsname_nodename), + (const void *)&(system_utsname.nodename), __NEW_UTS_LEN + 1); + memcpy((void *)&(dump_header.dh_utsname_release), + (const void *)&(system_utsname.release), __NEW_UTS_LEN + 1); + memcpy((void *)&(dump_header.dh_utsname_version), + (const void *)&(system_utsname.version), __NEW_UTS_LEN + 1); + memcpy((void *)&(dump_header.dh_utsname_machine), + (const void *)&(system_utsname.machine), __NEW_UTS_LEN + 1); + memcpy((void *)&(dump_header.dh_utsname_domainname), + (const void *)&(system_utsname.domainname), __NEW_UTS_LEN + 1); + + if (panic_str) { + memcpy((void *)&(dump_header.dh_panic_string), + (const void *)panic_str, DUMP_PANIC_LEN); + } + + dump_header_asm.dha_magic_number = DUMP_ASM_MAGIC_NUMBER; + dump_header_asm.dha_version = DUMP_ASM_VERSION_NUMBER; + dump_header_asm.dha_header_size = sizeof(dump_header_asm); + + dump_header_asm.dha_smp_num_cpus = num_online_cpus(); + pr_debug("smp_num_cpus in header %d\n", + dump_header_asm.dha_smp_num_cpus); + + dump_header_asm.dha_dumping_cpu = smp_processor_id(); + + return 0; +} + + +int dump_lcrash_configure_header(const char *panic_str, + const struct pt_regs *regs) +{ + int retval = 0; + + if ((retval = lcrash_init_dump_header(panic_str))) + return retval; + + /* capture register states for all processors */ + dump_save_this_cpu(regs); + __dump_save_other_cpus(); /* side effect:silence cpus */ + + /* configure architecture-specific dump header values */ + if ((retval = __dump_configure_header(regs))) + return retval; + + dump_config.dumper->header_dirty++; + return 0; +} + +/* save register and task context */ +void dump_lcrash_save_context(int cpu, const struct pt_regs *regs, + struct task_struct *tsk) +{ + dump_header_asm.dha_smp_current_task[cpu] = (uint32_t) tsk; + + 
__dump_save_regs(&dump_header_asm.dha_smp_regs[cpu], regs);
+
+	/* take a snapshot of the stack */
+	/* doing this enables us to tolerate slight drifts on this cpu */
+	if (dump_header_asm.dha_stack[cpu]) {
+		memcpy((void *)dump_header_asm.dha_stack[cpu],
+			tsk->thread_info, THREAD_SIZE);
+	}
+	dump_header_asm.dha_stack_ptr[cpu] = (uint32_t)(tsk->thread_info);
+}
+
+/* write out the header */
+int dump_write_header(void)
+{
+	int retval = 0, size;
+	void *buf = dump_config.dumper->dump_buf;
+
+	/* accounts for DUMP_HEADER_OFFSET if applicable */
+	if ((retval = dump_dev_seek(0))) {
+		printk("Unable to seek to dump header offset: %d\n",
+			retval);
+		return retval;
+	}
+
+	memcpy(buf, (void *)&dump_header, sizeof(dump_header));
+	size = sizeof(dump_header);
+	memcpy(buf + size, (void *)&dump_header_asm, sizeof(dump_header_asm));
+	size += sizeof(dump_header_asm);
+	/* assuming header is dump buffer size always ? */
+	retval = dump_ll_write(buf, DUMP_BUFFER_SIZE);
+
+	if (retval < DUMP_BUFFER_SIZE)
+		return (retval >= 0) ? -ENOSPC : retval;
+
+	return 0;
+}
+
+int dump_generic_update_header(void)
+{
+	int err = 0;
+
+	if (dump_config.dumper->header_dirty) {
+		if ((err = dump_write_header())) {
+			printk("dump write header failed! err %d\n", err);
+		} else {
+			dump_config.dumper->header_dirty = 0;
+		}
+	}
+
+	return err;
+}
+
+static inline int is_curr_stack_page(struct page *page, unsigned long size)
+{
+	unsigned long thread_addr = (unsigned long)current_thread_info();
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return !PageHighMem(page) && (addr < thread_addr + THREAD_SIZE)
+		&& (addr + size > thread_addr);
+}
+
+static inline int is_dump_page(struct page *page, unsigned long size)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+	unsigned long dump_buf = (unsigned long)dump_config.dumper->dump_buf;
+
+	return !PageHighMem(page) && (addr < dump_buf + DUMP_BUFFER_SIZE)
+		&& (addr + size > dump_buf);
+}
+
+int dump_allow_compress(struct page *page, unsigned long size)
+{
+	/*
+	 * Don't compress the page if any part of it overlaps
+	 * with the current stack or dump buffer (since the contents
+	 * in these could be changing while compression is going on)
+	 */
+	return !is_curr_stack_page(page, size) && !is_dump_page(page, size);
+}
+
+void lcrash_init_pageheader(struct __dump_page *dp, struct page *page,
+	unsigned long sz)
+{
+	memset(dp, 0, sizeof(struct __dump_page));
+	dp->dp_flags = 0;
+	dp->dp_size = 0;
+	if (sz > 0)
+		dp->dp_address = page_to_pfn(page) << PAGE_SHIFT;
+
+#if DUMP_DEBUG > 6
+	dp->dp_page_index = dump_header.dh_num_dump_pages;
+	dp->dp_byte_offset = dump_header.dh_num_bytes + DUMP_BUFFER_SIZE
+		+ DUMP_HEADER_OFFSET;	/* ?? */
+#endif /* DUMP_DEBUG */
+}
+
+int dump_lcrash_add_data(unsigned long loc, unsigned long len)
+{
+	struct page *page = (struct page *)loc;
+	void *addr, *buf = dump_config.dumper->curr_buf;
+	struct __dump_page *dp = (struct __dump_page *)buf;
+	int bytes, size;
+
+	if (buf > dump_config.dumper->dump_buf + DUMP_BUFFER_SIZE)
+		return -ENOMEM;
+
+	lcrash_init_pageheader(dp, page, len);
+	buf += sizeof(struct __dump_page);
+
+	while (len) {
+		addr = kmap_atomic(page, KM_DUMP);
+		size = bytes = (len > PAGE_SIZE) ?
PAGE_SIZE : len; + /* check for compression */ + if (dump_allow_compress(page, bytes)) { + size = dump_compress_data((char *)addr, bytes, (char *)buf); + } + /* set the compressed flag if the page did compress */ + if (size && (size < bytes)) { + dp->dp_flags |= DUMP_DH_COMPRESSED; + } else { + /* compression failed -- default to raw mode */ + dp->dp_flags |= DUMP_DH_RAW; + memcpy(buf, addr, bytes); + size = bytes; + } + /* memset(buf, 'A', size); temporary: testing only !! */ + kunmap_atomic(addr, KM_DUMP); + dp->dp_size += size; + buf += size; + len -= bytes; + page++; + } + + /* now update the header */ +#if DUMP_DEBUG > 6 + dump_header.dh_num_bytes += dp->dp_size + sizeof(*dp); +#endif + dump_header.dh_num_dump_pages++; + dump_config.dumper->header_dirty++; + + dump_config.dumper->curr_buf = buf; + + return len; +} + +int dump_lcrash_update_end_marker(void) +{ + struct __dump_page *dp = + (struct __dump_page *)dump_config.dumper->curr_buf; + unsigned long left; + int ret = 0; + + lcrash_init_pageheader(dp, NULL, 0); + dp->dp_flags |= DUMP_DH_END; /* tbd: truncation test ? */ + + /* now update the header */ +#if DUMP_DEBUG > 6 + dump_header.dh_num_bytes += sizeof(*dp); +#endif + dump_config.dumper->curr_buf += sizeof(*dp); + left = dump_config.dumper->curr_buf - dump_config.dumper->dump_buf; + + printk("\n"); + + while (left) { + if ((ret = dump_dev_seek(dump_config.dumper->curr_offset))) { + printk("Seek failed at offset 0x%llx\n", + dump_config.dumper->curr_offset); + return ret; + } + + if (DUMP_BUFFER_SIZE > left) + memset(dump_config.dumper->curr_buf, 'm', + DUMP_BUFFER_SIZE - left); + + if ((ret = dump_ll_write(dump_config.dumper->dump_buf, + DUMP_BUFFER_SIZE)) < DUMP_BUFFER_SIZE) { + return (ret < 0) ? ret : -ENOSPC; + } + + dump_config.dumper->curr_offset += DUMP_BUFFER_SIZE; + + if (left > DUMP_BUFFER_SIZE) { + left -= DUMP_BUFFER_SIZE; + memcpy(dump_config.dumper->dump_buf, + dump_config.dumper->dump_buf + DUMP_BUFFER_SIZE, left); + dump_config.dumper->curr_buf -= DUMP_BUFFER_SIZE; + } else { + left = 0; + } + } + return 0; +} + + +/* Default Formatter (lcrash) */ +struct dump_fmt_ops dump_fmt_lcrash_ops = { + .configure_header = dump_lcrash_configure_header, + .update_header = dump_generic_update_header, + .save_context = dump_lcrash_save_context, + .add_data = dump_lcrash_add_data, + .update_end_marker = dump_lcrash_update_end_marker +}; + +struct dump_fmt dump_fmt_lcrash = { + .name = "lcrash", + .ops = &dump_fmt_lcrash_ops +}; + diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/dump/dump_gzip.c 900-mjb3/drivers/dump/dump_gzip.c --- 000-bk3/drivers/dump/dump_gzip.c Wed Dec 31 16:00:00 1969 +++ 900-mjb3/drivers/dump/dump_gzip.c Sat Mar 8 22:05:11 2003 @@ -0,0 +1,118 @@ +/* + * GZIP Compression functions for kernel crash dumps. + * + * Created by: Matt Robinson (yakker@sourceforge.net) + * Copyright 2001 Matt D. Robinson. All rights reserved. + * + * This code is released under version 2 of the GNU GPL. + */ + +/* header files */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void *deflate_workspace; + +/* + * Name: dump_compress_gzip() + * Func: Compress a DUMP_PAGE_SIZE page using gzip-style algorithms (the. + * deflate functions similar to what's used in PPP). 
+ */ +static u16 +dump_compress_gzip(const u8 *old, u16 oldsize, u8 *new, u16 newsize) +{ + /* error code and dump stream */ + int err; + z_stream dump_stream; + + dump_stream.workspace = deflate_workspace; + + if ((err = zlib_deflateInit(&dump_stream, Z_BEST_COMPRESSION)) != Z_OK) { + /* fall back to RLE compression */ + printk("dump_compress_gzip(): zlib_deflateInit() " + "failed (%d)!\n", err); + return 0; + } + + /* use old (page of memory) and size (DUMP_PAGE_SIZE) as in-streams */ + dump_stream.next_in = (u8 *) old; + dump_stream.avail_in = oldsize; + + /* out streams are new (dpcpage) and new size (DUMP_DPC_PAGE_SIZE) */ + dump_stream.next_out = new; + dump_stream.avail_out = newsize; + + /* deflate the page -- check for error */ + err = zlib_deflate(&dump_stream, Z_FINISH); + if (err != Z_STREAM_END) { + /* zero is return code here */ + (void)zlib_deflateEnd(&dump_stream); + printk("dump_compress_gzip(): zlib_deflate() failed (%d)!\n", + err); + return 0; + } + + /* let's end the deflated compression stream */ + if ((err = zlib_deflateEnd(&dump_stream)) != Z_OK) { + printk("dump_compress_gzip(): zlib_deflateEnd() " + "failed (%d)!\n", err); + } + + /* return the compressed byte total (if it's smaller) */ + if (dump_stream.total_out >= oldsize) { + return oldsize; + } + return dump_stream.total_out; +} + +/* setup the gzip compression functionality */ +static struct __dump_compress dump_gzip_compression = { + .compress_type = DUMP_COMPRESS_GZIP, + .compress_func = dump_compress_gzip, + .compress_name = "GZIP", +}; + +/* + * Name: dump_compress_gzip_init() + * Func: Initialize gzip as a compression mechanism. + */ +static int __init +dump_compress_gzip_init(void) +{ + deflate_workspace = vmalloc(zlib_deflate_workspacesize()); + if (!deflate_workspace) { + printk("dump_compress_gzip_init(): Failed to " + "alloc %d bytes for deflate workspace\n", + zlib_deflate_workspacesize()); + return -ENOMEM; + } + dump_register_compression(&dump_gzip_compression); + return 0; +} + +/* + * Name: dump_compress_gzip_cleanup() + * Func: Remove gzip as a compression mechanism. + */ +static void __exit +dump_compress_gzip_cleanup(void) +{ + vfree(deflate_workspace); + dump_unregister_compression(DUMP_COMPRESS_GZIP); +} + +/* module initialization */ +module_init(dump_compress_gzip_init); +module_exit(dump_compress_gzip_cleanup); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("LKCD Development Team "); +MODULE_DESCRIPTION("Gzip compression module for crash dump driver"); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/dump/dump_i386.c 900-mjb3/drivers/dump/dump_i386.c --- 000-bk3/drivers/dump/dump_i386.c Wed Dec 31 16:00:00 1969 +++ 900-mjb3/drivers/dump/dump_i386.c Sat Mar 8 22:05:11 2003 @@ -0,0 +1,358 @@ +/* + * Architecture specific (i386) functions for Linux crash dumps. + * + * Created by: Matt Robinson (yakker@sgi.com) + * + * Copyright 1999 Silicon Graphics, Inc. All rights reserved. + * + * 2.3 kernel modifications by: Matt D. Robinson (yakker@turbolinux.com) + * Copyright 2000 TurboLinux, Inc. All rights reserved. + * + * This code is released under version 2 of the GNU GPL. + */ + +/* + * The hooks for dumping the kernel virtual memory to disk are in this + * file. Any time a modification is made to the virtual memory mechanism, + * these routines must be changed to use the new mechanisms. 
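+ *
+ * (An aside on the compression hook set up by modules like the GZIP
+ * one above: a compressor returns the compressed length, or 0 or a
+ * length >= the input to make the caller store the page raw.  An
+ * illustrative do-nothing compressor -- the constant name below is
+ * assumed, and the real raw fallback is dump_none_compression:
+ *
+ *	static u16 my_compress(const u8 *old, u16 oldsize,
+ *			       u8 *new, u16 newsize)
+ *	{
+ *		return 0;	always fall back to raw mode
+ *	}
+ *
+ *	static struct __dump_compress my_compression = {
+ *		.compress_type = DUMP_COMPRESS_NONE,
+ *		.compress_func = my_compress,
+ *		.compress_name = "NONE",
+ *	};
+ *
+ * registered via dump_register_compression(&my_compression).)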
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "dump_methods.h" +#include + +#include +#include +#include +#include +#include + +static __s32 saved_irq_count; /* saved preempt_count() flags */ + +static int +alloc_dha_stack(void) +{ + int i; + void *ptr; + + if (dump_header_asm.dha_stack[0]) + return 0; + + ptr = vmalloc(THREAD_SIZE * num_online_cpus()); + if (!ptr) { + printk("vmalloc for dha_stacks failed\n"); + return -ENOMEM; + } + + for (i = 0; i < num_online_cpus(); i++) { + dump_header_asm.dha_stack[i] = (u32)((unsigned long)ptr + + (i * THREAD_SIZE)); + } + return 0; +} + +static int +free_dha_stack(void) +{ + if (dump_header_asm.dha_stack[0]) { + vfree((void *)dump_header_asm.dha_stack[0]); + dump_header_asm.dha_stack[0] = 0; + } + return 0; +} + + +void +__dump_save_regs(struct pt_regs *dest_regs, const struct pt_regs *regs) +{ + *dest_regs = *regs; + + /* In case of panic dumps, we collects regs on entry to panic. + * so, we shouldn't 'fix' ssesp here again. But it is hard to + * tell just looking at regs whether ssesp need fixing. We make + * this decision by looking at xss in regs. If we have better + * means to determine that ssesp are valid (by some flag which + * tells that we are here due to panic dump), then we can use + * that instead of this kludge. + */ + if (!user_mode(regs)) { + if ((0xffff & regs->xss) == __KERNEL_DS) + /* already fixed up */ + return; + dest_regs->esp = (unsigned long)&(regs->esp); + __asm__ __volatile__ ("movw %%ss, %%ax;" + :"=a"(dest_regs->xss)); + } +} + + +#ifdef CONFIG_SMP +extern unsigned long irq_affinity[]; +extern irq_desc_t irq_desc[]; +extern void dump_send_ipi(void); + +static int dump_expect_ipi[NR_CPUS]; +static atomic_t waiting_for_dump_ipi; +static unsigned long saved_affinity[NR_IRQS]; + +extern void stop_this_cpu(void *); /* exported by i386 kernel */ + +static int +dump_nmi_callback(struct pt_regs *regs, int cpu) +{ + if (!dump_expect_ipi[cpu]) + return 0; + + dump_expect_ipi[cpu] = 0; + + dump_save_this_cpu(regs); + atomic_dec(&waiting_for_dump_ipi); + + level_changed: + switch (dump_silence_level) { + case DUMP_HARD_SPIN_CPUS: /* Spin until dump is complete */ + while (dump_oncpu) { + barrier(); /* paranoia */ + if (dump_silence_level != DUMP_HARD_SPIN_CPUS) + goto level_changed; + + cpu_relax(); /* kill time nicely */ + } + break; + + case DUMP_HALT_CPUS: /* Execute halt */ + stop_this_cpu(NULL); + break; + + case DUMP_SOFT_SPIN_CPUS: + /* Mark the task so it spins in schedule */ + set_tsk_thread_flag(current, TIF_NEED_RESCHED); + break; + } + + return 1; +} + +/* save registers on other processors */ +void +__dump_save_other_cpus(void) +{ + int i, cpu = smp_processor_id(); + int other_cpus = num_online_cpus()-1; + + if (other_cpus > 0) { + atomic_set(&waiting_for_dump_ipi, other_cpus); + + for (i = 0; i < NR_CPUS; i++) { + dump_expect_ipi[i] = (i != cpu && cpu_online(i)); + } + + /* short circuit normal NMI handling temporarily */ + set_nmi_callback(dump_nmi_callback); + wmb(); + + dump_send_ipi(); + /* may be we dont need to wait for NMI to be processed. + just write out the header at the end of dumping, if + this IPI is not processed until then, there probably + is a problem and we just fail to capture state of + other cpus. */ + while(atomic_read(&waiting_for_dump_ipi) > 0) { + cpu_relax(); + } + + unset_nmi_callback(); + } +} + +/* + * Routine to save the old irq affinities and change affinities of all irqs to + * the dumping cpu. 
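+ *
+ * (Interrupts are herded to the dumping cpu because __dump_irq_enable()
+ * below turns interrupts back on while the other cpus may still be
+ * held in the NMI callback above and cannot service them.)
+ *
+ * The one-shot NMI grab used by __dump_save_other_cpus() is a reusable
+ * pattern; reduced to a sketch with a hypothetical per-cpu flag array:
+ *
+ *	static int my_nmi(struct pt_regs *regs, int cpu)
+ *	{
+ *		if (!my_cpu_wanted[cpu])
+ *			return 0;	not ours: normal NMI handling
+ *		my_cpu_wanted[cpu] = 0;
+ *		... record this cpu's state, check in ...
+ *		return 1;		handled
+ *	}
+ *
+ *	set_nmi_callback(my_nmi);
+ *	dump_send_ipi();
+ *	... wait for the cpus to check in ...
+ *	unset_nmi_callback();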
+ */ +static void +set_irq_affinity(void) +{ + int i; + int cpu = smp_processor_id(); + + memcpy(saved_affinity, irq_affinity, NR_IRQS * sizeof(unsigned long)); + for (i = 0; i < NR_IRQS; i++) { + if (irq_desc[i].handler == NULL) + continue; + irq_affinity[i] = 1UL << cpu; + if (irq_desc[i].handler->set_affinity != NULL) + irq_desc[i].handler->set_affinity(i, irq_affinity[i]); + } +} + +/* + * Restore old irq affinities. + */ +static void +reset_irq_affinity(void) +{ + int i; + + memcpy(irq_affinity, saved_affinity, NR_IRQS * sizeof(unsigned long)); + for (i = 0; i < NR_IRQS; i++) { + if (irq_desc[i].handler == NULL) + continue; + if (irq_desc[i].handler->set_affinity != NULL) + irq_desc[i].handler->set_affinity(i, saved_affinity[i]); + } +} + +#else /* !CONFIG_SMP */ +#define set_irq_affinity() do { } while (0) +#define reset_irq_affinity() do { } while (0) +#define save_other_cpu_states() do { } while (0) +#endif /* !CONFIG_SMP */ + +/* + * Kludge - dump from interrupt context is unreliable (Fixme) + * + * We do this so that softirqs initiated for dump i/o + * get processed and we don't hang while waiting for i/o + * to complete or in any irq synchronization attempt. + * + * This is not quite legal of course, as it has the side + * effect of making all interrupts & softirqs triggered + * while dump is in progress complete before currently + * pending softirqs and the currently executing interrupt + * code. + */ +static inline void +irq_bh_save(void) +{ + saved_irq_count = irq_count(); + preempt_count() &= ~(HARDIRQ_MASK|SOFTIRQ_MASK); +} + +static inline void +irq_bh_restore(void) +{ + preempt_count() |= saved_irq_count; +} + +/* + * Name: __dump_irq_enable + * Func: Reset system so interrupts are enabled. + * This is used for dump methods that require interrupts + * Eventually, all methods will have interrupts disabled + * and this code can be removed. + * + * Change irq affinities + * Re-enable interrupts + */ +void +__dump_irq_enable(void) +{ + set_irq_affinity(); + irq_bh_save(); + local_irq_enable(); +} + +/* + * Name: __dump_irq_restore + * Func: Resume the system state in an architecture-specific way. + + */ +void +__dump_irq_restore(void) +{ + local_irq_disable(); + reset_irq_affinity(); + irq_bh_restore(); +} + +/* + * Name: __dump_configure_header() + * Func: Meant to fill in arch specific header fields except per-cpu state + * already captured via __dump_save_context for all CPUs. + */ +int +__dump_configure_header(const struct pt_regs *regs) +{ + return (0); +} + +/* + * Name: dump_nmi_handler + * Func: Called from notify_die + */ +static int dump_die_event(struct notifier_block *this, + unsigned long event, + void *arg) +{ + const struct die_args *args = (const struct die_args *) arg; + + switch (event) { + case DIE_PANIC: + case DIE_OOPS: + case DIE_WATCHDOG: + dump_execute(args->str, args->regs); + break; + } + return NOTIFY_DONE; + +} + +static struct notifier_block dump_die_block = { + .notifier_call = dump_die_event, +}; + +/* + * Name: __dump_init() + * Func: Initialize the dumping routine process. + */ +void +__dump_init(uint64_t local_memory_start) +{ + /* hook into NMI, Panic, and OOPS */ + register_die_notifier(&dump_die_block); +} + +/* + * Name: __dump_open() + * Func: Open the dump device (architecture specific). + */ +void +__dump_open(void) +{ + alloc_dha_stack(); +} + +/* + * Name: __dump_cleanup() + * Func: Free any architecture specific data structures. This is called + * when the dump module is being removed. 
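+ *
+ * (For reference, the producer side of the die notifier registered in
+ * __dump_init() above is a notify_die() call in the arch trap/panic
+ * paths, like the page fault hook earlier in this patch; a sketch:
+ *
+ *	notify_die(DIE_PANIC, "panic string", regs, 0);
+ *
+ * which reaches dump_die_event() and hence dump_execute().)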
+ */ +void +__dump_cleanup(void) +{ + free_dha_stack(); + + unregister_die_notifier(&dump_die_block); +} + +extern int pfn_is_ram(unsigned long); + +/* + * Name: __dump_page_valid() + * Func: Check if page is valid to dump. + */ +int +__dump_page_valid(unsigned long index) +{ + if (!pfn_valid(index)) + return 0; + + return pfn_is_ram(index); +} + diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/dump/dump_methods.h 900-mjb3/drivers/dump/dump_methods.h --- 000-bk3/drivers/dump/dump_methods.h Wed Dec 31 16:00:00 1969 +++ 900-mjb3/drivers/dump/dump_methods.h Sat Mar 8 22:05:11 2003 @@ -0,0 +1,314 @@ +/* + * Generic interfaces for flexible system dump + * + * Started: Oct 2002 - Suparna Bhattacharya (suparna@in.ibm.com) + * + * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. + */ + +#ifndef _LINUX_DUMP_METHODS_H +#define _LINUX_DUMP_METHODS_H + +/* + * Inspired by Matt Robinson's suggestion of introducing dump + * methods as a way to enable different crash dump facilities to + * coexist where each employs its own scheme or dumping policy. + * + * The code here creates a framework for flexible dump by defining + * a set of methods and providing associated helpers that differentiate + * between the underlying mechanism (how to dump), overall scheme + * (sequencing of stages and data dumped and associated quiescing), + * output format (what the dump output looks like), target type + * (where to save the dump; see dumpdev.h), and selection policy + * (state/data to dump). + * + * These sets of interfaces can be mixed and matched to build a + * dumper suitable for a given situation, allowing for + * flexibility as well appropriate degree of code reuse. + * For example all features and options of lkcd (including + * granular selective dumping in the near future) should be + * available even when say, the 2 stage soft-boot based mechanism + * is used for taking disruptive dumps. + * + * Todo: Additionally modules or drivers may supply their own + * custom dumpers which extend dump with module specific + * information or hardware state, and can even tweak the + * mechanism when it comes to saving state relevant to + * them. 
+ */ + +#include +#include +#include +#include + +#define MAX_PASSES 6 +#define MAX_DEVS 4 + + +/* To customise selection of pages to be dumped in a given pass/group */ +struct dump_data_filter{ + char name[32]; + int (*selector)(int, unsigned long, unsigned long); + ulong level_mask; /* dump level(s) for which this filter applies */ + loff_t start, end; /* location range applicable */ +}; + + +/* + * Determined by the kind of dump mechanism and appropriate + * overall scheme + */ +struct dump_scheme_ops { + /* sets aside memory, inits data structures etc */ + int (*configure)(unsigned long devid); + /* releases resources */ + int (*unconfigure)(void); + + /* ordering of passes, invoking iterator */ + int (*sequencer)(void); + /* iterates over system data, selects and acts on data to dump */ + int (*iterator)(int, int (*)(unsigned long, unsigned long), + struct dump_data_filter *); + /* action when data is selected for dump */ + int (*save_data)(unsigned long, unsigned long); + /* action when data is to be excluded from dump */ + int (*skip_data)(unsigned long, unsigned long); + /* policies for space, multiple dump devices etc */ + int (*write_buffer)(void *, unsigned long); +}; + +struct dump_scheme { + /* the name serves as an anchor to locate the scheme after reboot */ + char name[32]; + struct dump_scheme_ops *ops; + struct list_head list; +}; + +/* Quiescing/Silence levels (controls IPI callback behaviour) */ +extern enum dump_silence_levels { + DUMP_SOFT_SPIN_CPUS = 1, + DUMP_HARD_SPIN_CPUS = 2, + DUMP_HALT_CPUS = 3, +} dump_silence_level; + +/* determined by the dump (file) format */ +struct dump_fmt_ops { + /* build header */ + int (*configure_header)(const char *, const struct pt_regs *); + int (*update_header)(void); /* update header and write it out */ + /* save curr context */ + void (*save_context)(int, const struct pt_regs *, + struct task_struct *); + /* typically called by the save_data action */ + /* add formatted data to the dump buffer */ + int (*add_data)(unsigned long, unsigned long); + int (*update_end_marker)(void); +}; + +struct dump_fmt { + unsigned long magic; + char name[32]; /* lcrash, crash, elf-core etc */ + struct dump_fmt_ops *ops; + struct list_head list; +}; + +/* + * Modules will be able add their own data capture schemes by + * registering their own dumpers. Typically they would use the + * primary dumper as a template and tune it with their routines. + * Still Todo. 
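+ *
+ * Concretely, a dumper is a binding of one scheme, format, compressor,
+ * filter table and target device; an illustrative instance built from
+ * the defaults declared later in this header:
+ *
+ *	static struct dumper my_dumper = {
+ *		.name	  = "example",
+ *		.scheme	  = &dump_scheme_singlestage,
+ *		.fmt	  = &dump_fmt_lcrash,
+ *		.compress = &dump_none_compression,
+ *		.filter	  = dump_filter_table,
+ *		.dev	  = NULL,	bound at configure time
+ *	};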
+ */ + +/* The combined dumper profile (mechanism, scheme, dev, fmt) */ +struct dumper { + char name[32]; /* singlestage, overlay (stg1), passthru(stg2), pull */ + struct dump_scheme *scheme; + struct dump_fmt *fmt; + struct __dump_compress *compress; + struct dump_data_filter *filter; + struct dump_dev *dev; + /* state valid only for active dumper(s) - per instance */ + /* run time state/context */ + int curr_pass; + unsigned long count; + loff_t curr_offset; /* current logical offset into dump device */ + loff_t curr_loc; /* current memory location */ + void *curr_buf; /* current position in the dump buffer */ + void *dump_buf; /* starting addr of dump buffer */ + int header_dirty; /* whether the header needs to be written out */ + struct list_head dumper_list; /* links to other dumpers */ +}; + +/* Starting point to get to the current configured state */ +struct dump_config { + ulong level; + ulong flags; + struct dumper *dumper; + struct list_head dump_dev_list; +}; + +extern struct dump_config dump_config; + + +/* Wrappers that invoke the methods for the current (active) dumper */ + +/* Scheme operations */ + +static inline int dump_sequencer(void) +{ + return dump_config.dumper->scheme->ops->sequencer(); +} + +static inline int dump_iterator(int pass, int (*action)(unsigned long, + unsigned long), struct dump_data_filter *filter) +{ + return dump_config.dumper->scheme->ops->iterator(pass, action, filter); +} + +#define dump_save_data dump_config.dumper->scheme->ops->save_data +#define dump_skip_data dump_config.dumper->scheme->ops->skip_data + +static inline int dump_write_buffer(void *buf, unsigned long len) +{ + return dump_config.dumper->scheme->ops->write_buffer(buf, len); +} + +static inline int dump_configure(unsigned long devid) +{ + return dump_config.dumper->scheme->ops->configure(devid); +} + +static inline int dump_unconfigure(void) +{ + return dump_config.dumper->scheme->ops->unconfigure(); +} + +/* Format operations */ + +static inline int dump_configure_header(const char *panic_str, + const struct pt_regs *regs) +{ + return dump_config.dumper->fmt->ops->configure_header(panic_str, regs); +} + +static inline void dump_save_context(int cpu, const struct pt_regs *regs, + struct task_struct *tsk) +{ + dump_config.dumper->fmt->ops->save_context(cpu, regs, tsk); +} + +static inline int dump_save_this_cpu(const struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + + dump_save_context(cpu, regs, current); + return 1; +} + +static inline int dump_update_header(void) +{ + return dump_config.dumper->fmt->ops->update_header(); +} + +static inline int dump_update_end_marker(void) +{ + return dump_config.dumper->fmt->ops->update_end_marker(); +} + +static inline int dump_add_data(unsigned long loc, unsigned long sz) +{ + return dump_config.dumper->fmt->ops->add_data(loc, sz); +} + +/* Compression operation */ +static inline int dump_compress_data(char *src, int slen, char *dst) +{ + return dump_config.dumper->compress->compress_func(src, slen, + dst, DUMP_DPC_PAGE_SIZE); +} + + +/* Prototypes of some default implementations of dump methods */ + +extern struct __dump_compress dump_none_compression; + +/* Default scheme methods (dump_scheme.c) */ + +extern int dump_generic_sequencer(void); +extern int dump_page_iterator(int pass, int (*action)(unsigned long, unsigned + long), struct dump_data_filter *filter); +extern int dump_generic_save_data(unsigned long loc, unsigned long sz); +extern int dump_generic_skip_data(unsigned long loc, unsigned long sz); +extern int 
dump_generic_write_buffer(void *buf, unsigned long len);
+extern int dump_generic_configure(unsigned long);
+extern int dump_generic_unconfigure(void);
+
+/* Default scheme template */
+extern struct dump_scheme dump_scheme_singlestage;
+
+/* Default dump format methods */
+
+extern int dump_lcrash_configure_header(const char *panic_str,
+	const struct pt_regs *regs);
+extern void dump_lcrash_save_context(int cpu, const struct pt_regs *regs,
+	struct task_struct *tsk);
+extern int dump_generic_update_header(void);
+extern int dump_lcrash_add_data(unsigned long loc, unsigned long sz);
+extern int dump_lcrash_update_end_marker(void);
+
+/* Default format (lcrash) template */
+extern struct dump_fmt dump_fmt_lcrash;
+
+/* Default dump selection filter table */
+
+/*
+ * Entries are listed in order of importance and correspond to passes.
+ * The last entry (with a level_mask of zero) typically reflects data that
+ * won't be dumped -- this may, for example, be used to identify data
+ * that is known to be skipped, so the corresponding memory areas can be
+ * utilized as scratch space.
+ */
+extern struct dump_data_filter dump_filter_table[];
+
+/* Some pre-defined dumpers */
+extern struct dumper dumper_singlestage;
+
+/* These are temporary */
+#define DUMP_MASK_HEADER	DUMP_LEVEL_HEADER
+#define DUMP_MASK_KERN		DUMP_LEVEL_KERN
+#define DUMP_MASK_USED		DUMP_LEVEL_USED
+#define DUMP_MASK_UNUSED	DUMP_LEVEL_ALL_RAM
+#define DUMP_MASK_REST		0 /* dummy for now */
+
+/* Helpers - move these to dump.h later? */
+
+int dump_generic_execute(const char *panic_str, const struct pt_regs *regs);
+extern int dump_ll_write(void *buf, unsigned long len);
+
+static inline void dumper_reset(void)
+{
+	dump_config.dumper->curr_buf = dump_config.dumper->dump_buf;
+	dump_config.dumper->curr_loc = 0;
+	dump_config.dumper->curr_offset = 0;
+	dump_config.dumper->count = 0;
+	dump_config.dumper->curr_pass = 0;
+}
+
+/*
+ * May later be moulded to perform boot-time allocations so we can dump
+ * earlier during bootup
+ */
+static inline void *dump_alloc_mem(unsigned long size)
+{
+	return kmalloc(size, GFP_KERNEL);
+}
+
+static inline void dump_free_mem(void *buf)
+{
+	kfree(buf);
+}
+
+#endif /* _LINUX_DUMP_METHODS_H */
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/dump/dump_netdev.c 900-mjb3/drivers/dump/dump_netdev.c
--- 000-bk3/drivers/dump/dump_netdev.c Wed Dec 31 16:00:00 1969
+++ 900-mjb3/drivers/dump/dump_netdev.c Sat Mar 8 22:05:11 2003
@@ -0,0 +1,858 @@
+/*
+ * Implements the dump driver interface for saving a dump via a network
+ * interface.
+ *
+ * Some of this code has been taken/adapted from Ingo Molnar's netconsole
+ * code. The LKCD team expresses its thanks to Ingo.
+ *
+ * Started: June 2002 - Mohamed Abbas
+ *	Adapted netconsole code to implement LKCD dump over the network.
+ *
+ * Nov 2002 - Bharata B. Rao
+ *	Innumerable code cleanups, simplification and some fixes.
+ *	Netdump configuration done by ioctl instead of using module parameters.
+ *
+ * Copyright (C) 2001 Ingo Molnar
+ * Copyright (C) 2002 International Business Machines Corp.
+ *
+ * This code is released under version 2 of the GNU GPL.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static int startup_handshake; +static int page_counter; +static struct net_device *dump_ndev; +static struct in_device *dump_in_dev; +static u16 source_port, target_port; +static u32 source_ip, target_ip; +static unsigned char daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ; +static spinlock_t dump_skb_lock = SPIN_LOCK_UNLOCKED; +static int dump_nr_skbs; +static struct sk_buff *dump_skb; +static unsigned long flags_global; +static int netdump_in_progress; +static char device_name[IFNAMSIZ]; + +/* + * security depends on the trusted path between the netconsole + * server and netconsole client, since none of the packets are + * encrypted. The random magic number protects the protocol + * against spoofing. + */ +static u64 dump_magic; + +#define MAX_UDP_CHUNK 1460 +#define MAX_PRINT_CHUNK (MAX_UDP_CHUNK-HEADER_LEN) + +/* + * We maintain a small pool of fully-sized skbs, + * to make sure the message gets out even in + * extreme OOM situations. + */ +#define DUMP_MAX_SKBS 32 + +#define MAX_SKB_SIZE \ + (MAX_UDP_CHUNK + sizeof(struct udphdr) + \ + sizeof(struct iphdr) + sizeof(struct ethhdr)) + +static void +dump_refill_skbs(void) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&dump_skb_lock, flags); + while (dump_nr_skbs < DUMP_MAX_SKBS) { + skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); + if (!skb) + break; + if (dump_skb) + skb->next = dump_skb; + else + skb->next = NULL; + dump_skb = skb; + dump_nr_skbs++; + } + spin_unlock_irqrestore(&dump_skb_lock, flags); +} + +static struct +sk_buff * dump_get_skb(void) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&dump_skb_lock, flags); + skb = dump_skb; + if (skb) { + dump_skb = skb->next; + skb->next = NULL; + dump_nr_skbs--; + } + spin_unlock_irqrestore(&dump_skb_lock, flags); + + return skb; +} + +/* + * Zap completed output skbs. + */ +static void +zap_completion_queue(void) +{ + int count; + unsigned long flags; + int cpu = smp_processor_id(); + + count=0; + if (softnet_data[cpu].completion_queue) { + struct sk_buff *clist; + + local_irq_save(flags); + clist = softnet_data[cpu].completion_queue; + softnet_data[cpu].completion_queue = NULL; + local_irq_restore(flags); + + while (clist != NULL) { + struct sk_buff *skb = clist; + clist = clist->next; + __kfree_skb(skb); + count++; + if (count > 10000) + printk("Error in sk list\n"); + } + } +} + +static void +dump_send_skb(struct net_device *dev, const char *msg, unsigned int msg_len, + reply_t *reply) +{ + int once = 1; + int total_len, eth_len, ip_len, udp_len, count = 0; + struct sk_buff *skb; + struct udphdr *udph; + struct iphdr *iph; + struct ethhdr *eth; + + udp_len = msg_len + HEADER_LEN + sizeof(*udph); + ip_len = eth_len = udp_len + sizeof(*iph); + total_len = eth_len + ETH_HLEN; + +repeat_loop: + zap_completion_queue(); + if (dump_nr_skbs < DUMP_MAX_SKBS) + dump_refill_skbs(); + + skb = alloc_skb(total_len, GFP_ATOMIC); + if (!skb) { + skb = dump_get_skb(); + if (!skb) { + count++; + if (once && (count == 1000000)) { + printk("possibly FATAL: out of netconsole " + "skbs!!! 
will keep retrying.\n");
+				once = 0;
+			}
+			dev->poll_controller(dev);
+			goto repeat_loop;
+		}
+	}
+
+	atomic_set(&skb->users, 1);
+	skb_reserve(skb, total_len - msg_len - HEADER_LEN);
+	skb->data[0] = NETCONSOLE_VERSION;
+
+	put_unaligned(htonl(reply->nr), (u32 *) (skb->data + 1));
+	put_unaligned(htonl(reply->code), (u32 *) (skb->data + 5));
+	put_unaligned(htonl(reply->info), (u32 *) (skb->data + 9));
+
+	memcpy(skb->data + HEADER_LEN, msg, msg_len);
+	skb->len += msg_len + HEADER_LEN;
+
+	udph = (struct udphdr *) skb_push(skb, sizeof(*udph));
+	udph->source = source_port;
+	udph->dest = target_port;
+	udph->len = htons(udp_len);
+	udph->check = 0;
+
+	iph = (struct iphdr *)skb_push(skb, sizeof(*iph));
+
+	iph->version = 4;
+	iph->ihl = 5;
+	iph->tos = 0;
+	iph->tot_len = htons(ip_len);
+	iph->id = 0;
+	iph->frag_off = 0;
+	iph->ttl = 64;
+	iph->protocol = IPPROTO_UDP;
+	iph->check = 0;
+	iph->saddr = source_ip;
+	iph->daddr = target_ip;
+	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+
+	eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
+
+	eth->h_proto = htons(ETH_P_IP);
+	memcpy(eth->h_source, dev->dev_addr, dev->addr_len);
+	memcpy(eth->h_dest, daddr, dev->addr_len);
+
+	count = 0;
+repeat_poll:
+	spin_lock(&dev->xmit_lock);
+	dev->xmit_lock_owner = smp_processor_id();
+
+	count++;
+
+
+	if (netif_queue_stopped(dev)) {
+		dev->xmit_lock_owner = -1;
+		spin_unlock(&dev->xmit_lock);
+
+		dev->poll_controller(dev);
+		zap_completion_queue();
+
+
+		goto repeat_poll;
+	}
+
+	dev->hard_start_xmit(skb, dev);
+
+	dev->xmit_lock_owner = -1;
+	spin_unlock(&dev->xmit_lock);
+}
+
+static unsigned short
+udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr,
+	unsigned long base)
+{
+	return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base);
+}
+
+static int
+udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
+	unsigned short ulen, u32 saddr, u32 daddr)
+{
+	if (uh->check == 0) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	} else if (skb->ip_summed == CHECKSUM_HW) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
+			return 0;
+		skb->ip_summed = CHECKSUM_NONE;
+	}
+	if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+		skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen,
+			IPPROTO_UDP, 0);
+	/* Probably, we should checksum udp header (it should be in cache
+	 * in any case) and data in tiny packets (< rx copybreak).
+	 */
+	return 0;
+}
+
+static __inline__ int
+__udp_checksum_complete(struct sk_buff *skb)
+{
+	return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len,
+		skb->csum));
+}
+
+static __inline__
+int udp_checksum_complete(struct sk_buff *skb)
+{
+	return skb->ip_summed != CHECKSUM_UNNECESSARY &&
+		__udp_checksum_complete(skb);
+}
+
+int new_req = 0;
+static req_t req;
+
+static int
+dump_rx_hook(struct sk_buff *skb)
+{
+	int proto;
+	struct iphdr *iph;
+	struct udphdr *uh;
+	__u32 len, saddr, daddr, ulen;
+	req_t *__req;
+
+	/*
+	 * First check whether we are dumping or doing the startup
+	 * handshake; if not, return quickly.
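+	 *
+	 * (While a dump is in progress, every packet is consumed here
+	 * and NET_RX_DROP is returned to the stack; the checks below
+	 * only decide whether the packet updates the pending request.)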
+ */ + if (!netdump_in_progress) + return NET_RX_SUCCESS; + + if (skb->dev->type != ARPHRD_ETHER) + goto out; + + proto = ntohs(skb->mac.ethernet->h_proto); + if (proto != ETH_P_IP) + goto out; + + if (skb->pkt_type == PACKET_OTHERHOST) + goto out; + + if (skb_shared(skb)) + goto out; + + /* IP header correctness testing: */ + iph = (struct iphdr *)skb->data; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + + if (iph->ihl < 5 || iph->version != 4) + goto out; + + if (!pskb_may_pull(skb, iph->ihl*4)) + goto out; + + if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) + goto out; + + len = ntohs(iph->tot_len); + if (skb->len < len || len < iph->ihl*4) + goto out; + + saddr = iph->saddr; + daddr = iph->daddr; + if (iph->protocol != IPPROTO_UDP) + goto out; + + if (source_ip != daddr) + goto out; + + if (target_ip != saddr) + goto out; + + len -= iph->ihl*4; + uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); + ulen = ntohs(uh->len); + + if (ulen != len || ulen < (sizeof(*uh) + sizeof(*__req))) + goto out; + + if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) + goto out; + + if (udp_checksum_complete(skb)) + goto out; + + if (source_port != uh->dest) + goto out; + + if (target_port != uh->source) + goto out; + + __req = (req_t *)(uh + 1); + if ((ntohl(__req->command) != COMM_GET_MAGIC) && + (ntohl(__req->command) != COMM_HELLO) && + (ntohl(__req->command) != COMM_START_WRITE_NETDUMP_ACK) && + (ntohl(__req->command) != COMM_START_NETDUMP_ACK) && + (memcmp(&__req->magic, &dump_magic, sizeof(dump_magic)) != 0)) + goto out; + + req.magic = ntohl(__req->magic); + req.command = ntohl(__req->command); + req.from = ntohl(__req->from); + req.to = ntohl(__req->to); + req.nr = ntohl(__req->nr); + new_req = 1; +out: + return NET_RX_DROP; +} + +static void +dump_send_mem(struct net_device *dev, req_t *req, const char* buff, size_t len) +{ + int i; + + int nr_chunks = len/1024; + reply_t reply; + + reply.nr = req->nr; + reply.info = 0; + + if ( nr_chunks <= 0) + nr_chunks = 1; + for (i = 0; i < nr_chunks; i++) { + unsigned int offset = i*1024; + reply.code = REPLY_MEM; + reply.info = offset; + dump_send_skb(dev, buff + offset, 1024, &reply); + } +} + +/* + * This function waits for the client to acknowledge the receipt + * of the netdump startup reply, with the possibility of packets + * getting lost. We resend the startup packet if no ACK is received, + * after a 1 second delay. + * + * (The client can test the success of the handshake via the HELLO + * command, and send ACKs until we enter netdump mode.) 
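+ *
+ * In outline, matching the loop below: up to 300 start packets are
+ * sent, each followed by roughly one second of polling for a reply.
+ * During the startup handshake, COMM_HELLO or COMM_START_NETDUMP_ACK
+ * completes the exchange; afterwards COMM_SEND_MEM does. Any other
+ * reply triggers a resend.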
+ */
+static int
+dump_handshake(struct dump_dev *net_dev)
+{
+	char tmp[200];
+	reply_t reply;
+	int i, j;
+
+	if (startup_handshake) {
+		sprintf(tmp, "NETDUMP start, waiting for start-ACK.\n");
+		reply.code = REPLY_START_NETDUMP;
+		reply.nr = 0;
+		reply.info = 0;
+	} else {
+		sprintf(tmp, "NETDUMP start, waiting for start-ACK.\n");
+		reply.code = REPLY_START_WRITE_NETDUMP;
+		reply.nr = net_dev->curr_offset;
+		reply.info = net_dev->curr_offset;
+	}
+
+	/* send 300 handshake packets before declaring failure */
+	for (i = 0; i < 300; i++) {
+		dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply);
+
+		/* wait 1 sec */
+		for (j = 0; j < 10000; j++) {
+			udelay(100);
+			dump_ndev->poll_controller(dump_ndev);
+			zap_completion_queue();
+			if (new_req)
+				break;
+		}
+
+		/*
+		 * if there is no new request, try sending the handshaking
+		 * packet again
+		 */
+		if (!new_req)
+			continue;
+
+		/*
+		 * check if the new request is of the expected type;
+		 * if so, return, else try sending the handshaking
+		 * packet again
+		 */
+		if (startup_handshake) {
+			if (req.command == COMM_HELLO || req.command ==
+				COMM_START_NETDUMP_ACK) {
+				return 0;
+			} else {
+				new_req = 0;
+				continue;
+			}
+		} else {
+			if (req.command == COMM_SEND_MEM) {
+				return 0;
+			} else {
+				new_req = 0;
+				continue;
+			}
+		}
+	}
+	return -1;
+}
+
+static ssize_t
+do_netdump(struct dump_dev *net_dev, const char* buff, size_t len)
+{
+	reply_t reply;
+	char tmp[200];
+	ssize_t ret = 0;
+	int repeatCounter, counter, total_loop;
+
+	netdump_in_progress = 1;
+
+	if (dump_handshake(net_dev) < 0) {
+		printk("network dump failed due to handshake failure\n");
+		goto out;
+	}
+
+	/*
+	 * Ideally the startup handshake should be done during dump
+	 * configuration, i.e., in dump_net_open(). This will be done when
+	 * I figure out the dependency between the startup handshake,
+	 * subsequent writes, and the various commands with respect to the
+	 * net-server.
+ */ + if (startup_handshake) + startup_handshake = 0; + + counter = 0; + repeatCounter = 0; + total_loop = 0; + while (1) { + if (!new_req) { + dump_ndev->poll_controller(dump_ndev); + zap_completion_queue(); + } + if (!new_req) { + repeatCounter++; + + if (repeatCounter > 5) { + counter++; + if (counter > 10000) { + if (total_loop >= 100000) { + printk("Time OUT LEAVE NOW\n"); + goto out; + } else { + total_loop++; + printk("Try number %d out of " + "10 before Time Out\n", + total_loop); + } + } + mdelay(1); + repeatCounter = 0; + } + continue; + } + repeatCounter = 0; + counter = 0; + total_loop = 0; + new_req = 0; + switch (req.command) { + case COMM_NONE: + break; + + case COMM_SEND_MEM: + dump_send_mem(dump_ndev, &req, buff, len); + break; + + case COMM_EXIT: + case COMM_START_WRITE_NETDUMP_ACK: + ret = len; + goto out; + + case COMM_HELLO: + sprintf(tmp, "Hello, this is netdump version " + "0.%02d\n", NETCONSOLE_VERSION); + reply.code = REPLY_HELLO; + reply.nr = req.nr; + reply.info = net_dev->curr_offset; + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + break; + + case COMM_GET_PAGE_SIZE: + sprintf(tmp, "PAGE_SIZE: %ld\n", PAGE_SIZE); + reply.code = REPLY_PAGE_SIZE; + reply.nr = req.nr; + reply.info = PAGE_SIZE; + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + break; + + case COMM_GET_NR_PAGES: + reply.code = REPLY_NR_PAGES; + reply.nr = req.nr; + reply.info = max_mapnr; + reply.info = page_counter; + sprintf(tmp, "Number of pages: %ld\n", max_mapnr); + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + break; + + case COMM_GET_MAGIC: + reply.code = REPLY_MAGIC; + reply.nr = req.nr; + reply.info = NETCONSOLE_VERSION; + dump_send_skb(dump_ndev, (char *)&dump_magic, + sizeof(dump_magic), &reply); + break; + + default: + reply.code = REPLY_ERROR; + reply.nr = req.nr; + reply.info = req.command; + sprintf(tmp, "Got unknown command code %d!\n", + req.command); + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + break; + } + } +out: + netdump_in_progress = 0; + return ret; +} + +static int +dump_validate_config(void) +{ + source_ip = dump_in_dev->ifa_list->ifa_local; + if (!source_ip) { + printk("network device %s has no local address, " + "aborting.\n", device_name); + return -1; + } + +#define IP(x) ((unsigned char *)&source_ip)[x] + printk("Source %d.%d.%d.%d", IP(0), IP(1), IP(2), IP(3)); +#undef IP + + if (!source_port) { + printk("source_port parameter not specified, aborting.\n"); + return -1; + } + printk(":%i\n", source_port); + source_port = htons(source_port); + + if (!target_ip) { + printk("target_ip parameter not specified, aborting.\n"); + return -1; + } + +#define IP(x) ((unsigned char *)&target_ip)[x] + printk("Target %d.%d.%d.%d", IP(0), IP(1), IP(2), IP(3)); +#undef IP + + if (!target_port) { + printk("target_port parameter not specified, aborting.\n"); + return -1; + } + printk(":%i\n", target_port); + target_port = htons(target_port); + + printk("Target Ethernet Address %02x:%02x:%02x:%02x:%02x:%02x", + daddr[0], daddr[1], daddr[2], daddr[3], daddr[4], daddr[5]); + + if ((daddr[0] & daddr[1] & daddr[2] & daddr[3] & daddr[4] & + daddr[5]) == 255) + printk("(Broadcast)"); + printk("\n"); + return 0; +} + +/* + * Prepares the dump device so we can take a dump later. + * Validates the netdump configuration parameters. + * + * TODO: Network connectivity check should be done here. 
+ */ +static int +dump_net_open(struct dump_dev *net_dev, unsigned long arg) +{ + int retval = 0; + + /* get the interface name */ + if (copy_from_user(device_name, (void *)arg, IFNAMSIZ)) + return -EFAULT; + + if (!(dump_ndev = dev_get_by_name(device_name))) { + printk("network device %s does not exist, aborting.\n", + device_name); + return -ENODEV; + } + + if (!dump_ndev->poll_controller) { + printk("network device %s does not implement polling yet, " + "aborting.\n", device_name); + retval = -1; /* return proper error */ + goto err1; + } + + if (!(dump_in_dev = in_dev_get(dump_ndev))) { + printk("network device %s is not an IP protocol device, " + "aborting.\n", device_name); + retval = -EINVAL; + goto err1; + } + + if ((retval = dump_validate_config()) < 0) + goto err2; + + net_dev->curr_offset = 0; + printk("Network device %s successfully configured for dumping\n", + device_name); + return retval; +err2: + in_dev_put(dump_in_dev); +err1: + dev_put(dump_ndev); + return retval; +} + +/* + * Close the dump device and release associated resources + * Invoked when unconfiguring the dump device. + */ +static int +dump_net_release(struct dump_dev *net_dev) +{ + if (dump_in_dev) + in_dev_put(dump_in_dev); + if (dump_ndev) + dev_put(dump_ndev); + return 0; +} + +/* + * Prepare the dump device for use (silence any ongoing activity + * and quiesce state) when the system crashes. + */ +static int +dump_net_silence(struct dump_dev *net_dev) +{ + local_irq_save(flags_global); + dump_ndev->rx_hook = dump_rx_hook; + startup_handshake = 1; + net_dev->curr_offset = 0; + printk("Dumping to network device %s on CPU %d ...\n", device_name, + smp_processor_id()); + return 0; +} + +/* + * Invoked when dumping is done. This is the time to put things back + * (i.e. undo the effects of dump_block_silence) so the device is + * available for normal use. + */ +static int +dump_net_resume(struct dump_dev *net_dev) +{ + int indx; + reply_t reply; + char tmp[200]; + + if (!dump_ndev) + return (0); + + sprintf(tmp, "NETDUMP end.\n"); + for( indx = 0; indx < 6; indx++) { + reply.code = REPLY_END_NETDUMP; + reply.nr = 0; + reply.info = 0; + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + } + printk("NETDUMP END!\n"); + local_irq_restore(flags_global); + dump_ndev->rx_hook = NULL; + startup_handshake = 0; + return 0; +} + +/* + * Seek to the specified offset in the dump device. + * Makes sure this is a valid offset, otherwise returns an error. 
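+ *
+ * (As implemented below, no validation is actually performed yet;
+ * the offset is simply recorded in net_dev->curr_offset.)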
+ */ +static int +dump_net_seek(struct dump_dev *net_dev, loff_t off) +{ + net_dev->curr_offset = off; + return 0; +} + +/* + * + */ +static int +dump_net_write(struct dump_dev *net_dev, void *buf, unsigned long len) +{ + int cnt, i, off; + ssize_t ret; + + cnt = len/ PAGE_SIZE; + + for (i = 0; i < cnt; i++) { + off = i* PAGE_SIZE; + ret = do_netdump(net_dev, buf+off, PAGE_SIZE); + if (ret <= 0) + return -1; + net_dev->curr_offset = net_dev->curr_offset + PAGE_SIZE; + } + return len; +} + +/* + * check if the last dump i/o is over and ready for next request + */ +static int +dump_net_ready(struct dump_dev *net_dev, void *buf) +{ + return 0; +} + +/* + * ioctl function used for configuring network dump + */ +static int +dump_net_ioctl(struct dump_dev *net_dev, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case DIOSTARGETIP: + target_ip = arg; + break; + case DIOSTARGETPORT: + target_port = (u16)arg; + break; + case DIOSSOURCEPORT: + source_port = (u16)arg; + break; + case DIOSETHADDR: + return copy_from_user(daddr, (void *)arg, 6); + break; + case DIOGTARGETIP: + case DIOGTARGETPORT: + case DIOGSOURCEPORT: + case DIOGETHADDR: + break; + default: + return -EINVAL; + } + return 0; +} + +struct dump_dev_ops dump_netdev_ops = { + .open = dump_net_open, + .release = dump_net_release, + .silence = dump_net_silence, + .resume = dump_net_resume, + .seek = dump_net_seek, + .write = dump_net_write, + /* .read not implemented */ + .ready = dump_net_ready, + .ioctl = dump_net_ioctl +}; + +static struct dump_dev default_dump_netdev = { + .type_name = "networkdev", + .ops = &dump_netdev_ops, + .curr_offset = 0 +}; + +static int __init +dump_netdev_init(void) +{ + default_dump_netdev.curr_offset = 0; + + if (dump_register_device(&default_dump_netdev) < 0) { + printk("network dump device driver registration failed\n"); + return -1; + } + printk("network device driver for LKCD registered\n"); + + get_random_bytes(&dump_magic, sizeof(dump_magic)); + return 0; +} + +static void __exit +dump_netdev_cleanup(void) +{ + dump_unregister_device(&default_dump_netdev); +} + +MODULE_AUTHOR("LKCD Development Team "); +MODULE_DESCRIPTION("Network Dump Driver for Linux Kernel Crash Dump (LKCD)"); +MODULE_LICENSE("GPL"); + +module_init(dump_netdev_init); +module_exit(dump_netdev_cleanup); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/dump/dump_rle.c 900-mjb3/drivers/dump/dump_rle.c --- 000-bk3/drivers/dump/dump_rle.c Wed Dec 31 16:00:00 1969 +++ 900-mjb3/drivers/dump/dump_rle.c Sat Mar 8 22:05:11 2003 @@ -0,0 +1,175 @@ +/* + * RLE Compression functions for kernel crash dumps. + * + * Created by: Matt Robinson (yakker@sourceforge.net) + * Copyright 2001 Matt D. Robinson. All rights reserved. + * + * This code is released under version 2 of the GNU GPL. + */ + +/* header files */ +#include +#include +#include +#include +#include +#include +#include + +/* + * Name: dump_compress_rle() + * Func: Compress a DUMP_PAGE_SIZE (hardware) page down to something more + * reasonable, if possible. This is the same routine we use in IRIX. + */ +static u16 +dump_compress_rle(const u8 *old, u16 oldsize, u8 *new, u16 newsize) +{ + u16 ri, wi, count = 0; + u_char value = 0, cur_byte; + + /* + * If the block should happen to "compress" to larger than the + * buffer size, allocate a larger one and change cur_buf_size. 
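+	 *
+	 * Output encoding, as a reading aid (derived from the code
+	 * below): a run of count+1 identical bytes is emitted as the
+	 * triple {0, count, value}, so 0 acts as the escape byte; other
+	 * bytes pass through literally, with isolated zero bytes and
+	 * two-byte repeats handled as short special cases. If the
+	 * "compressed" form would grow past the original size, the
+	 * routine bails out and returns oldsize unchanged.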
+ */ + + wi = ri = 0; + + while (ri < oldsize) { + if (!ri) { + cur_byte = value = old[ri]; + count = 0; + } else { + if (count == 255) { + if (wi + 3 > oldsize) { + return oldsize; + } + new[wi++] = 0; + new[wi++] = count; + new[wi++] = value; + value = cur_byte = old[ri]; + count = 0; + } else { + if ((cur_byte = old[ri]) == value) { + count++; + } else { + if (count > 1) { + if (wi + 3 > oldsize) { + return oldsize; + } + new[wi++] = 0; + new[wi++] = count; + new[wi++] = value; + } else if (count == 1) { + if (value == 0) { + if (wi + 3 > oldsize) { + return oldsize; + } + new[wi++] = 0; + new[wi++] = 1; + new[wi++] = 0; + } else { + if (wi + 2 > oldsize) { + return oldsize; + } + new[wi++] = value; + new[wi++] = value; + } + } else { /* count == 0 */ + if (value == 0) { + if (wi + 2 > oldsize) { + return oldsize; + } + new[wi++] = value; + new[wi++] = value; + } else { + if (wi + 1 > oldsize) { + return oldsize; + } + new[wi++] = value; + } + } /* if count > 1 */ + + value = cur_byte; + count = 0; + + } /* if byte == value */ + + } /* if count == 255 */ + + } /* if ri == 0 */ + ri++; + + } + if (count > 1) { + if (wi + 3 > oldsize) { + return oldsize; + } + new[wi++] = 0; + new[wi++] = count; + new[wi++] = value; + } else if (count == 1) { + if (value == 0) { + if (wi + 3 > oldsize) + return oldsize; + new[wi++] = 0; + new[wi++] = 1; + new[wi++] = 0; + } else { + if (wi + 2 > oldsize) + return oldsize; + new[wi++] = value; + new[wi++] = value; + } + } else { /* count == 0 */ + if (value == 0) { + if (wi + 2 > oldsize) + return oldsize; + new[wi++] = value; + new[wi++] = value; + } else { + if (wi + 1 > oldsize) + return oldsize; + new[wi++] = value; + } + } /* if count > 1 */ + + value = cur_byte; + count = 0; + return wi; +} + +/* setup the rle compression functionality */ +static struct __dump_compress dump_rle_compression = { + .compress_type = DUMP_COMPRESS_RLE, + .compress_func = dump_compress_rle, + .compress_name = "RLE", +}; + +/* + * Name: dump_compress_rle_init() + * Func: Initialize rle compression for dumping. + */ +static int __init +dump_compress_rle_init(void) +{ + dump_register_compression(&dump_rle_compression); + return 0; +} + +/* + * Name: dump_compress_rle_cleanup() + * Func: Remove rle compression for dumping. + */ +static void __exit +dump_compress_rle_cleanup(void) +{ + dump_unregister_compression(DUMP_COMPRESS_RLE); +} + +/* module initialization */ +module_init(dump_compress_rle_init); +module_exit(dump_compress_rle_cleanup); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("LKCD Development Team "); +MODULE_DESCRIPTION("RLE compression module for crash dump driver"); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/dump/dump_scheme.c 900-mjb3/drivers/dump/dump_scheme.c --- 000-bk3/drivers/dump/dump_scheme.c Wed Dec 31 16:00:00 1969 +++ 900-mjb3/drivers/dump/dump_scheme.c Sat Mar 8 22:05:11 2003 @@ -0,0 +1,346 @@ +/* + * Default single stage dump scheme methods + * + * Previously a part of dump_base.c + * + * Started: Oct 2002 - Suparna Bhattacharya + * Split and rewrote LKCD dump scheme to generic dump method + * interfaces + * Derived from original code created by + * Matt Robinson ) + * + * Contributions from SGI, IBM, HP, MCL, and others. + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. 
+ */ + +/* + * Implements the default dump scheme, i.e. single-stage gathering and + * saving of dump data directly to the target device, which operates in + * a push mode, where the dumping system decides what data it saves + * taking into account pre-specified dump config options. + * + * Aside: The 2-stage dump scheme, where there is a soft-reset between + * the gathering and saving phases, also reuses some of these + * default routines (see dump_overlay.c) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "dump_methods.h" + +extern int panic_timeout; /* time before reboot */ + +extern void dump_speedo(int); + +/* Default sequencer used during single stage dumping */ +/* Also invoked during stage 2 of soft-boot based dumping */ +int dump_generic_sequencer(void) +{ + struct dump_data_filter *filter = dump_config.dumper->filter; + int pass = 0, err = 0, save = 0; + int (*action)(unsigned long, unsigned long); + + /* + * We want to save the more critical data areas first in + * case we run out of space, encounter i/o failures, or get + * interrupted otherwise and have to give up midway + * So, run through the passes in increasing order + */ + for (;filter->selector; filter++, pass++) + { + /* Assumes passes are exclusive (even across dumpers) */ + /* Requires care when coding the selection functions */ + if ((save = filter->level_mask & dump_config.level)) + action = dump_save_data; + else + action = dump_skip_data; + + if ((err = dump_iterator(pass, action, filter)) < 0) + break; + + printk("\n %d dump pages %s of %d each in pass %d\n", + err, save ? "saved" : "skipped", DUMP_PAGE_SIZE, pass); + + } + + return (err < 0) ? err : 0; +} + +static inline struct page *dump_get_page(loff_t loc) +{ + unsigned long page_index = loc >> PAGE_SHIFT; + + /* todo: complete this to account for ia64/discontig mem */ + /* todo: and to check for validity, ram page, no i/o mem etc */ + /* need to use pfn/physaddr equiv of kern_addr_valid */ + if (__dump_page_valid(page_index)) + return pfn_to_page(page_index); + else + return NULL; + +} + +/* Default iterator: for singlestage and stage 1 of soft-boot dumping */ +/* Iterates over range of physical memory pages in DUMP_PAGE_SIZE increments */ +int dump_page_iterator(int pass, int (*action)(unsigned long, unsigned long), + struct dump_data_filter *filter) +{ + /* Todo : fix unit, type */ + loff_t loc; + int count = 0, err = 0; + struct page *page; + + /* Todo: Add membanks code */ + /* TBD: Check if we need to address DUMP_PAGE_SIZE < PAGE_SIZE */ + + for (loc = filter->start; loc < filter->end; loc += DUMP_PAGE_SIZE) { + dump_config.dumper->curr_loc = loc; + page = dump_get_page(loc); + if (page && filter->selector(pass, (unsigned long) page, + DUMP_PAGE_SIZE)) { + if ((err = action((unsigned long)page, DUMP_PAGE_SIZE))) + { + printk("dump_page_iterator: err %d for loc " + "0x%llx, in pass %d\n", err, loc, pass); + break; + } else + count++; + } + } + + return err ? 
err : count;
+}
+
+/*
+ * Base function that saves the selected block of data in the dump
+ * Action taken when iterator decides that data needs to be saved
+ */
+int dump_generic_save_data(unsigned long loc, unsigned long sz)
+{
+	void *buf;
+	void *dump_buf = dump_config.dumper->dump_buf;
+	int left, bytes, ret;
+
+	if ((ret = dump_add_data(loc, sz))) {
+		return ret;
+	}
+	buf = dump_config.dumper->curr_buf;
+
+	/* If we've filled up the buffer write it out */
+	if ((left = buf - dump_buf) >= DUMP_BUFFER_SIZE) {
+		bytes = dump_write_buffer(dump_buf, DUMP_BUFFER_SIZE);
+		if (bytes < DUMP_BUFFER_SIZE) {
+			printk("dump_write_buffer failed %d\n", bytes);
+			return bytes ? -ENOSPC : bytes;
+		}
+
+		left -= bytes;
+
+		/* -- A few chores to do from time to time -- */
+		dump_config.dumper->count++;
+
+		if (!(dump_config.dumper->count & 0x3f)) {
+			/* Update the header every once in a while */
+			/* memset((void *)dump_buf, 'b', DUMP_BUFFER_SIZE);*/
+			if ((ret = dump_update_header()) < 0) {
+				/* issue warning */
+				return ret;
+			}
+			printk(".");
+
+			touch_nmi_watchdog();
+		} else if (!(dump_config.dumper->count & 0x7)) {
+			/* Show progress so the user knows we aren't hung */
+			dump_speedo(dump_config.dumper->count >> 3);
+		}
+		/* Todo: Touch/Refresh watchdog */
+
+		/* --- Done with periodic chores -- */
+
+
+		/* now adjust the leftover bits back to the top of the page */
+		/* this case would not arise during stage 2 (passthru) */
+		memset(dump_buf, 'z', DUMP_BUFFER_SIZE);
+		if (left) {
+			memcpy(dump_buf, dump_buf + DUMP_BUFFER_SIZE, left);
+		}
+		buf -= DUMP_BUFFER_SIZE;
+		dump_config.dumper->curr_buf = buf;
+	}
+
+	return 0;
+}
+
+int dump_generic_skip_data(unsigned long loc, unsigned long sz)
+{
+	/* dummy by default */
+	return 0;
+}
+
+/*
+ * Common low level routine to write a buffer to the current dump device
+ * Expects checks for space etc. to have been taken care of by the caller
+ * Operates serially at the moment for simplicity.
+ * TBD/Todo: Consider batching for improved throughput
+ */
+int dump_ll_write(void *buf, unsigned long len)
+{
+	long transferred = 0, last_transfer = 0;
+	int ret = 0;
+
+	/* make sure device is ready */
+	while ((ret = dump_dev_ready(NULL)) == -EAGAIN);
+	if (ret < 0) {
+		printk("dump_dev_ready failed !err %d\n", ret);
+		return ret;
+	}
+
+	while (len) {
+		if ((last_transfer = dump_dev_write(buf, len)) <= 0) {
+			ret = last_transfer;
+			printk("dump_dev_write failed !err %d\n",
+				ret);
+			break;
+		}
+		/* wait till complete */
+		while ((ret = dump_dev_ready(buf)) == -EAGAIN)
+			cpu_relax();
+
+		if (ret < 0) {
+			printk("i/o failed !err %d\n", ret);
+			break;
+		}
+
+		len -= last_transfer;
+		buf += last_transfer;
+		transferred += last_transfer;
+	}
+	return (ret < 0) ? ret : transferred;
+}
+
+/* default writeout routine for single dump device */
+/* writes out the dump data ensuring enough space is left for the end marker */
+int dump_generic_write_buffer(void *buf, unsigned long len)
+{
+	long written = 0;
+	int err = 0;
+
+	/* check for space */
+	if ((err = dump_dev_seek(dump_config.dumper->curr_offset + len
+		+ 2*DUMP_BUFFER_SIZE)) < 0) {
+		printk("dump_write_buffer: insuff space after offset 0x%llx\n",
+			dump_config.dumper->curr_offset);
+		return err;
+	}
+	/* alignment check would happen as a side effect of this */
+	if ((err = dump_dev_seek(dump_config.dumper->curr_offset)) < 0)
+		return err;
+
+	written = dump_ll_write(buf, len);
+
+	/* all or none */
+
+	if (written < len)
+		written = written ?
-ENOSPC : written; + else + dump_config.dumper->curr_offset += len; + + return written; +} + +int dump_generic_configure(unsigned long devid) +{ + struct dump_dev *dev = dump_config.dumper->dev; + struct dump_data_filter *filter; + void *buf; + int ret = 0; + + /* Allocate the dump buffer and initialize dumper state */ + /* Assume that we get aligned addresses */ + if (!(buf = dump_alloc_mem(DUMP_BUFFER_SIZE + 2 * DUMP_PAGE_SIZE))) + return -ENOMEM; + + if ((unsigned long)buf & (PAGE_SIZE - 1)) { + /* sanity check for page aligned address */ + dump_free_mem(buf); + return -ENOMEM; /* fixme: better error code */ + } + + /* Initialize the rest of the fields */ + dump_config.dumper->dump_buf = buf; + dumper_reset(); + + /* Open the dump device */ + if (!dev) + return -ENODEV; + + if ((ret = dev->ops->open(dev, devid))) { + return ret; + } + + /* Initialise the memory ranges in the dump filter */ + for (filter = dump_config.dumper->filter ;filter->selector; filter++) { + if (!filter->start && !filter->end) { + filter->start = 0; + filter->end = max_mapnr << PAGE_SHIFT; + } + } + + return 0; +} + +int dump_generic_unconfigure(void) +{ + struct dump_dev *dev = dump_config.dumper->dev; + void *buf = dump_config.dumper->dump_buf; + int ret = 0; + + /* Close the dump device */ + if (dev && (ret = dev->ops->release(dev))) + return ret; + + if (buf) + dump_free_mem(buf); + + dump_config.dumper->curr_buf = dump_config.dumper->dump_buf = NULL; + return 0; +} + + +/* Set up the default dump scheme */ + +struct dump_scheme_ops dump_scheme_singlestage_ops = { + .configure = dump_generic_configure, + .unconfigure = dump_generic_unconfigure, + .sequencer = dump_generic_sequencer, + .iterator = dump_page_iterator, + .save_data = dump_generic_save_data, + .skip_data = dump_generic_skip_data, + .write_buffer = dump_generic_write_buffer, +}; + +struct dump_scheme dump_scheme_singlestage = { + .name = "single-stage", + .ops = &dump_scheme_singlestage_ops +}; + +/* The single stage dumper comprising all these */ +struct dumper dumper_singlestage = { + .name = "single-stage", + .scheme = &dump_scheme_singlestage, + .fmt = &dump_fmt_lcrash, + .compress = &dump_none_compression, + .filter = dump_filter_table, + .dev = NULL, +}; + diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/dump/dump_setup.c 900-mjb3/drivers/dump/dump_setup.c --- 000-bk3/drivers/dump/dump_setup.c Wed Dec 31 16:00:00 1969 +++ 900-mjb3/drivers/dump/dump_setup.c Sat Mar 8 22:05:11 2003 @@ -0,0 +1,729 @@ +/* + * Standard kernel function entry points for Linux crash dumps. + * + * Created by: Matt Robinson (yakker@sourceforge.net) + * Contributions from SGI, IBM, HP, MCL, and others. + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2000 - 2002 TurboLinux, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 Free Software Foundation, Inc. All rights reserved. + * + * This code is released under version 2 of the GNU GPL. + */ + +/* + * ----------------------------------------------------------------------- + * + * DUMP HISTORY + * + * This dump code goes back to SGI's first attempts at dumping system + * memory on SGI systems running IRIX. A few developers at SGI needed + * a way to take this system dump and analyze it, and created 'icrash', + * or IRIX Crash. The mechanism (the dumps and 'icrash') were used + * by support people to generate crash reports when a system failure + * occurred. 
This was vital for large system configurations that
+ * couldn't apply patch after patch after fix just to hope that the
+ * problems would go away. So the system memory, along with the crash
+ * dump analyzer, allowed support people to quickly figure out what the
+ * problem was on the system with the crash dump.
+ *
+ * In comes Linux. SGI started moving towards the open source community,
+ * and upon doing so, SGI wanted to take its support utilities into Linux
+ * with the hope that they would end up in the kernel and user space to
+ * be used by SGI's customers buying SGI Linux systems. One of the first
+ * few products to be open sourced by SGI was LKCD, or Linux Kernel Crash
+ * Dumps. LKCD comprises a patch to the kernel to enable system
+ * dumping, along with 'lcrash', or Linux Crash, to analyze the system
+ * memory dump. A few additional system scripts and kernel modifications
+ * are also included to make the dump mechanism and dump data easier to
+ * process and use.
+ *
+ * As soon as LKCD was released into the open source community, a number
+ * of larger companies started to take advantage of it. Today, there are
+ * many community members who contribute to LKCD, and it continues to
+ * flourish and grow as an open source project.
+ */
+
+/*
+ * DUMP TUNABLES
+ *
+ * This is the list of system tunables (via /proc) that are available
+ * for Linux systems. All the read, write, etc., functions are listed
+ * here. Currently, there are a few different tunables for dumps:
+ *
+ * dump_device (used to be dumpdev):
+ *	The device for dumping the memory pages out to. This
+ *	may be set to the primary swap partition for disruptive dumps,
+ *	and must be an unused partition for non-disruptive dumps.
+ *	Todo: In the case of network dumps, this may be interpreted
+ *	as the IP address of the netdump server to connect to.
+ *
+ * dump_compress (used to be dump_compress_pages):
+ *	This is the flag which indicates which compression mechanism
+ *	to use. This is a BITMASK, not an index (0,1,2,4,8,16,etc.).
+ *	This is the current set of values:
+ *
+ *	0: DUMP_COMPRESS_NONE -- Don't compress any pages.
+ *	1: DUMP_COMPRESS_RLE  -- This uses RLE compression.
+ *	2: DUMP_COMPRESS_GZIP -- This uses GZIP compression.
+ *
+ * dump_level:
+ *	The amount of effort the dump module should make to save
+ *	information for post crash analysis. This value is now
+ *	a BITMASK value, not an index:
+ *
+ *	0: Do nothing, no dumping. (DUMP_LEVEL_NONE)
+ *
+ *	1: Print out the dump information to the dump header, and
+ *	   write it out to the dump_device. (DUMP_LEVEL_HEADER)
+ *
+ *	2: Write out the dump header and all kernel memory pages.
+ *	   (DUMP_LEVEL_KERN)
+ *
+ *	4: Write out the dump header and all kernel and user
+ *	   memory pages. (DUMP_LEVEL_USED)
+ *
+ *	8: Write out the dump header and all conventional/cached
+ *	   memory (RAM) pages in the system (kernel, user, free).
+ *	   (DUMP_LEVEL_ALL_RAM)
+ *
+ *	16: Write out everything, including non-conventional memory
+ *	    like firmware, proms, I/O registers, uncached memory.
+ *	    (DUMP_LEVEL_ALL)
+ *
+ *	The dump_level will default to 1.
+ *
+ * dump_flags:
+ *	These are the flags to use when talking about dumps. There
+ *	are lots of possibilities. This is a BITMASK value, not an index.
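+ *
+ *	For example, the target bits (DUMP_FLAGS_TARGETMASK) select the
+ *	dump driver: DUMP_FLAGS_DISKDUMP picks the "blockdev" target and
+ *	DUMP_FLAGS_NETDUMP the "networkdev" target, as dump_target_init()
+ *	below shows.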
+ * + * ----------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include +#include "dump_methods.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * ----------------------------------------------------------------------- + * V A R I A B L E S + * ----------------------------------------------------------------------- + */ + +/* Dump tunables */ +struct dump_config dump_config = { + .level = 0, + .flags = 0, + .dumper = NULL +}; + + +/* Global variables used in dump.h */ +/* degree of system freeze when dumping */ +enum dump_silence_levels dump_silence_level = DUMP_HARD_SPIN_CPUS; + +/* Other global fields */ +extern struct __dump_header dump_header; +struct dump_dev *dump_dev = NULL; /* Active dump device */ +int dump_device = 0; +static int dump_compress = 0; + +static u16 dump_compress_none(const u8 *old, u16 oldsize, u8 *new, u16 newsize); +struct __dump_compress dump_none_compression = { + .compress_type = DUMP_COMPRESS_NONE, + .compress_func = dump_compress_none, + .compress_name = "none", +}; + +/* our device operations and functions */ +static int dump_ioctl(struct inode *i, struct file *f, + unsigned int cmd, unsigned long arg); + +static struct file_operations dump_fops = { + .ioctl = dump_ioctl, +}; + +/* static variables */ +static int dump_okay = 0; /* can we dump out to disk? */ +static spinlock_t dump_lock = SPIN_LOCK_UNLOCKED; + +/* used for dump compressors */ +static struct list_head dump_compress_list = LIST_HEAD_INIT(dump_compress_list); + +/* list of registered dump targets */ +static struct list_head dump_target_list = LIST_HEAD_INIT(dump_target_list); + +/* lkcd info structure -- this is used by lcrash for basic system data */ +struct __lkcdinfo lkcdinfo = { + .ptrsz = (sizeof(void *) * 8), +#if defined(__LITTLE_ENDIAN) + .byte_order = __LITTLE_ENDIAN, +#else + .byte_order = __BIG_ENDIAN, +#endif + .page_shift = PAGE_SHIFT, + .page_size = PAGE_SIZE, + .page_mask = PAGE_MASK, + .page_offset = PAGE_OFFSET, +}; + +/* + * ----------------------------------------------------------------------- + * / P R O C T U N A B L E F U N C T I O N S + * ----------------------------------------------------------------------- + */ + +static int proc_dump_device(ctl_table *ctl, int write, struct file *f, + void *buffer, size_t *lenp); + +/* + * sysctl-tuning infrastructure. 
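+ *
+ * The tables below nest as kernel_root -> dump_root -> dump_table, so
+ * the tunables appear under /proc/sys/kernel/dump/ with the names
+ * given by the DUMP_*_NAME macros.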
+ */
+static ctl_table dump_table[] = {
+	{ .ctl_name	= CTL_DUMP_LEVEL,
+	  .procname	= DUMP_LEVEL_NAME,
+	  .data		= &dump_config.level,
+	  .maxlen	= sizeof(int),
+	  .mode		= 0644,
+	  .proc_handler	= proc_dointvec, },
+
+	{ .ctl_name	= CTL_DUMP_FLAGS,
+	  .procname	= DUMP_FLAGS_NAME,
+	  .data		= &dump_config.flags,
+	  .maxlen	= sizeof(int),
+	  .mode		= 0644,
+	  .proc_handler	= proc_dointvec, },
+
+	{ .ctl_name	= CTL_DUMP_COMPRESS,
+	  .procname	= DUMP_COMPRESS_NAME,
+	  .data		= &dump_compress, /* FIXME */
+	  .maxlen	= sizeof(int),
+	  .mode		= 0644,
+	  .proc_handler	= proc_dointvec, },
+
+	{ .ctl_name	= CTL_DUMP_DEVICE,
+	  .procname	= DUMP_DEVICE_NAME,
+	  .mode		= 0644,
+	  .data		= &dump_device, /* FIXME */
+	  .maxlen	= sizeof(int),
+	  .proc_handler	= proc_dump_device },
+
+	{ 0, }
+};
+
+static ctl_table dump_root[] = {
+	{ .ctl_name	= KERN_DUMP,
+	  .procname	= "dump",
+	  .mode		= 0555,
+	  .child	= dump_table },
+	{ 0, }
+};
+
+static ctl_table kernel_root[] = {
+	{ .ctl_name	= CTL_KERN,
+	  .procname	= "kernel",
+	  .mode		= 0555,
+	  .child	= dump_root, },
+	{ 0, }
+};
+
+static struct ctl_table_header *sysctl_header;
+
+/*
+ * -----------------------------------------------------------------------
+ *            C O M P R E S S I O N   F U N C T I O N S
+ * -----------------------------------------------------------------------
+ */
+
+/*
+ * Name: dump_compress_none()
+ * Func: Don't do any compression, period.
+ */
+static u16
+dump_compress_none(const u8 *old, u16 oldsize, u8 *new, u16 newsize)
+{
+	/* just return the old size */
+	return oldsize;
+}
+
+
+/*
+ * Name: dump_execute()
+ * Func: Execute the dumping process. This makes sure all the appropriate
+ *       fields are updated correctly, and calls dump_execute_memdump(),
+ *       which does the real work.
+ */
+void
+dump_execute(const char *panic_str, const struct pt_regs *regs)
+{
+	int state = -1;
+	unsigned long flags;
+
+	/* make sure we can dump */
+	if (!dump_okay) {
+		pr_info("LKCD not yet configured, can't take dump now\n");
+		return;
+	}
+
+	/* Exclude multiple dumps at the same time and disable
+	 * interrupts; some drivers may re-enable interrupts within
+	 * silence().
+	 *
+	 * Try and acquire the spin lock. If successful, leave preempt
+	 * and interrupts disabled. See spin_lock_irqsave in spinlock.h
+	 */
+	local_irq_save(flags);
+	if (!spin_trylock(&dump_lock)) {
+		local_irq_restore(flags);
+		pr_info("LKCD dump already in progress\n");
+		return;
+	}
+
+	/* Bring the system into the strictest level of quiescing for
+	 * minimum drift; dump drivers can soften this as required in
+	 * dev->ops->silence()
+	 */
+	dump_oncpu = smp_processor_id() + 1;
+	dump_silence_level = DUMP_HARD_SPIN_CPUS;
+
+	state = dump_generic_execute(panic_str, regs);
+
+	dump_oncpu = 0;
+	spin_unlock_irqrestore(&dump_lock, flags);
+
+	if (state < 0) {
+		printk("Dump Incomplete or failed!\n");
+	} else {
+		printk("Dump Complete; %d dump pages saved.\n",
+			dump_header.dh_num_dump_pages);
+	}
+}
+
+/*
+ * Name: dump_register_compression()
+ * Func: Register a dump compression mechanism.
+ */
+void
+dump_register_compression(struct __dump_compress *item)
+{
+	if (item)
+		list_add(&(item->list), &dump_compress_list);
+}
+
+/*
+ * Name: dump_unregister_compression()
+ * Func: Remove a dump compression mechanism, and re-assign the dump
+ *       compression pointer if necessary.
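+ *
+ * (Note: as currently written, the function only unlinks the matching
+ * entry from dump_compress_list; the active dumper's compress pointer
+ * is not re-assigned here.)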
+ */
+void
+dump_unregister_compression(int compression_type)
+{
+	struct list_head *tmp;
+	struct __dump_compress *dc;
+
+	/* let's make sure our list is valid */
+	if (compression_type != DUMP_COMPRESS_NONE) {
+		list_for_each(tmp, &dump_compress_list) {
+			dc = list_entry(tmp, struct __dump_compress, list);
+			if (dc->compress_type == compression_type) {
+				list_del(&(dc->list));
+				break;
+			}
+		}
+	}
+}
+
+/*
+ * Name: dump_compress_init()
+ * Func: Initialize (or re-initialize) compression scheme.
+ */
+static int
+dump_compress_init(int compression_type)
+{
+	struct list_head *tmp;
+	struct __dump_compress *dc;
+
+	/* try to remove the compression item */
+	list_for_each(tmp, &dump_compress_list) {
+		dc = list_entry(tmp, struct __dump_compress, list);
+		if (dc->compress_type == compression_type) {
+			dump_config.dumper->compress = dc;
+			dump_compress = compression_type;
+			pr_debug("Dump Compress %s\n", dc->compress_name);
+			return 0;
+		}
+	}
+
+	/*
+	 * nothing on the list -- return ENODATA to indicate an error
+	 *
+	 * NB:
+	 *	EAGAIN: reports "Resource temporarily unavailable" which
+	 *		isn't very enlightening.
+	 */
+	printk("compression_type:%d not found\n", compression_type);
+
+	return -ENODATA;
+}
+
+static int
+dumper_setup(unsigned long flags, unsigned long devid)
+{
+	int ret = 0;
+
+	/* unconfigure old dumper if it exists */
+	dump_okay = 0;
+	if (dump_config.dumper) {
+		pr_debug("Unconfiguring current dumper\n");
+		dump_unconfigure();
+	}
+	/* set up new dumper */
+	dump_config.dumper = &dumper_singlestage;
+	dump_config.dumper->dev = dump_dev;
+
+	ret = dump_configure(devid);
+	if (!ret) {
+		dump_okay = 1;
+		pr_debug("%s dumper set up for dev 0x%lx\n",
+			dump_config.dumper->name, devid);
+		dump_device = devid;
+	} else {
+		printk("%s dumper set up failed for dev 0x%lx\n",
+			dump_config.dumper->name, devid);
+	}
+	return ret;
+}
+
+static int
+dump_target_init(int target)
+{
+	char type[20];
+	struct list_head *tmp;
+	struct dump_dev *dev;
+
+	switch (target) {
+	case DUMP_FLAGS_DISKDUMP:
+		strcpy(type, "blockdev"); break;
+	case DUMP_FLAGS_NETDUMP:
+		strcpy(type, "networkdev"); break;
+	default:
+		return -1;
+	}
+
+	/*
+	 * This is a bit stupid, generating strings from the flag
+	 * and doing strcmp. This is done because 'struct dump_dev'
+	 * has a string 'type_name' and not an integer 'type'.
+	 */
+	list_for_each(tmp, &dump_target_list) {
+		dev = list_entry(tmp, struct dump_dev, list);
+		if (strcmp(type, dev->type_name) == 0) {
+			dump_dev = dev;
+			return 0;
+		}
+	}
+	return -1;
+}
+
+/*
+ * Name: dump_ioctl()
+ * Func: Allow all dump tunables through a standard ioctl() mechanism.
+ *	 This is far better than before, where we'd go through /proc,
+ *	 because now this will work for multiple OS and architectures.
+ */
+static int
+dump_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
+{
+	/* check capabilities */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!dump_config.dumper && cmd == DIOSDUMPCOMPRESS)
+		/* dump device must be configured first */
+		return -ENODEV;
+
+	/*
+	 * This is the main mechanism for controlling get/set data
+	 * for various dump device parameters. The real trick here
+	 * is setting the dump device (DIOSDUMPDEV). That's what
+	 * triggers everything else.
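+	 *
+	 * A minimal user-space sequence might look like this (sketch
+	 * only; "/dev/dump" is an assumed node name for whatever device
+	 * is bound to CRASH_DUMP_MAJOR):
+	 *
+	 *	fd = open("/dev/dump", O_RDWR);
+	 *	ioctl(fd, DIOSDUMPFLAGS, DUMP_FLAGS_DISKDUMP);
+	 *	ioctl(fd, DIOSDUMPDEV, devid);
+	 *	ioctl(fd, DIOSDUMPLEVEL, DUMP_LEVEL_KERN);
+	 *	ioctl(fd, DIOSDUMPCOMPRESS, DUMP_COMPRESS_RLE);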
+ */ + switch (cmd) { + case DIOSDUMPDEV: /* set dump_device */ + pr_debug("Configuring dump device\n"); + if (!(f->f_flags & O_RDWR)) + return -EPERM; + + __dump_open(); + return dumper_setup(dump_config.flags, arg); + + + case DIOGDUMPDEV: /* get dump_device */ + return put_user((long)dump_device, (long *)arg); + + case DIOSDUMPLEVEL: /* set dump_level */ + if (!(f->f_flags & O_RDWR)) + return -EPERM; + + /* make sure we have a positive value */ + if (arg < 0) + return -EINVAL; + + /* Fixme: clean this up */ + dump_config.level = 0; + switch ((int)arg) { + case DUMP_LEVEL_ALL: + case DUMP_LEVEL_ALL_RAM: + dump_config.level |= DUMP_MASK_UNUSED; + case DUMP_LEVEL_USED: + dump_config.level |= DUMP_MASK_USED; + case DUMP_LEVEL_KERN: + dump_config.level |= DUMP_MASK_KERN; + case DUMP_LEVEL_HEADER: + dump_config.level |= DUMP_MASK_HEADER; + case DUMP_LEVEL_NONE: + break; + default: + return (-EINVAL); + } + pr_debug("Dump Level 0x%lx\n", dump_config.level); + break; + + case DIOGDUMPLEVEL: /* get dump_level */ + /* fixme: handle conversion */ + return put_user((long)dump_config.level, (long *)arg); + + + case DIOSDUMPFLAGS: /* set dump_flags */ + /* check flags */ + if (!(f->f_flags & O_RDWR)) + return -EPERM; + + /* make sure we have a positive value */ + if (arg < 0) + return -EINVAL; + + if (dump_target_init(arg & DUMP_FLAGS_TARGETMASK) < 0) + return -EINVAL; /* return proper error */ + + dump_config.flags = arg; + + pr_debug("Dump Flags 0x%lx\n", dump_config.flags); + break; + + case DIOGDUMPFLAGS: /* get dump_flags */ + return put_user((long)dump_config.flags, (long *)arg); + + case DIOSDUMPCOMPRESS: /* set the dump_compress status */ + if (!(f->f_flags & O_RDWR)) + return -EPERM; + + return dump_compress_init((int)arg); + + case DIOGDUMPCOMPRESS: /* get the dump_compress status */ + return put_user((long)(dump_config.dumper ? + dump_config.dumper->compress->compress_type : 0), + (long *)arg); + + default: + /* + * these are network dump specific ioctls, let the + * module handle them. 
+ */
+		return dump_dev_ioctl(cmd, arg);
+	}
+	return 0;
+}
+
+/*
+ * Handle special cases for dump_device.
+ * Changing the dump device requires properly re-opening the device.
+ */
+static int
+proc_dump_device(ctl_table *ctl, int write, struct file *f,
+		 void *buffer, size_t *lenp)
+{
+	int *valp = ctl->data;
+	int oval = *valp;
+	int ret = -EPERM;
+
+	/* same permission checks as ioctl */
+	if (capable(CAP_SYS_ADMIN)) {
+		ret = proc_dointvec(ctl, write, f, buffer, lenp);
+		if (ret == 0 && write && *valp != oval) {
+			/* need to restore old value to close properly */
+			dump_device = (dev_t) oval;
+			__dump_open();
+			ret = dumper_setup(dump_config.flags, (dev_t) *valp);
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ *                     I N I T   F U N C T I O N S
+ * -----------------------------------------------------------------------
+ */
+
+/*
+ * These register and unregister routines are exported for modules
+ * to register their dump drivers (like block, net etc)
+ */
+int
+dump_register_device(struct dump_dev *ddev)
+{
+	struct list_head *tmp;
+	struct dump_dev *dev;
+
+	list_for_each(tmp, &dump_target_list) {
+		dev = list_entry(tmp, struct dump_dev, list);
+		if (strcmp(ddev->type_name, dev->type_name) == 0) {
+			printk("Target type %s already registered\n",
+				dev->type_name);
+			return -1; /* return proper error */
+		}
+	}
+	list_add(&(ddev->list), &dump_target_list);
+
+	return 0;
+}
+
+void
+dump_unregister_device(struct dump_dev *ddev)
+{
+	list_del(&(ddev->list));
+	if (ddev != dump_dev)
+		return;
+
+	dump_okay = 0;
+
+	if (dump_config.dumper)
+		dump_unconfigure();
+
+	dump_config.flags &= ~DUMP_FLAGS_TARGETMASK;
+	dump_okay = 0;
+	dump_dev = NULL;
+	dump_config.dumper = NULL;
+}
+
+
+#ifdef CONFIG_MAGIC_SYSRQ
+/* Sysrq handler */
+static void sysrq_handle_crashdump(int key, struct pt_regs *pt_regs,
+				   struct tty_struct *tty) {
+	dump_execute("sysrq", pt_regs);
+}
+
+static struct sysrq_key_op sysrq_crashdump_op = {
+	.handler	= sysrq_handle_crashdump,
+	.help_msg	= "Dump",
+	.action_msg	= "Starting crash dump",
+};
+#endif
+
+static inline void
+dump_sysrq_register(void)
+{
+#ifdef CONFIG_MAGIC_SYSRQ
+	__sysrq_lock_table();
+	__sysrq_put_key_op(DUMP_SYSRQ_KEY, &sysrq_crashdump_op);
+	__sysrq_unlock_table();
+#endif
+}
+
+static inline void
+dump_sysrq_unregister(void)
+{
+#ifdef CONFIG_MAGIC_SYSRQ
+	__sysrq_lock_table();
+	if (__sysrq_get_key_op(DUMP_SYSRQ_KEY) == &sysrq_crashdump_op)
+		__sysrq_put_key_op(DUMP_SYSRQ_KEY, NULL);
+	__sysrq_unlock_table();
+#endif
+}
+
+/*
+ * Name: dump_init()
+ * Func: Initialize the dump process. This will set up any architecture
+ *	 dependent code. The big key is that we need the memory offsets
+ *	 before the page table is initialized, because the base memory
+ *	 offset is changed after paging_init() is called.
+ */ +static int __init +dump_init(void) +{ + struct sysinfo info; + + /* try to create our dump device */ + if (register_chrdev(CRASH_DUMP_MAJOR, "dump", &dump_fops)) { + printk("cannot register dump character device!\n"); + return -EBUSY; + } + + __dump_init((u64)PAGE_OFFSET); + + /* set the dump_compression_list structure up */ + dump_register_compression(&dump_none_compression); + + /* grab the total memory size now (not if/when we crash) */ + si_meminfo(&info); + + /* set the memory size */ + dump_header.dh_memory_size = (u64)info.totalram; + + sysctl_header = register_sysctl_table(kernel_root, 0); + dump_sysrq_register(); + + pr_info("Crash dump driver initialized.\n"); + return 0; +} + +static void __exit +dump_cleanup(void) +{ + dump_okay = 0; + + if (dump_config.dumper) + dump_unconfigure(); + + /* arch-specific cleanup routine */ + __dump_cleanup(); + + /* ignore errors while unregistering -- since can't do anything */ + unregister_sysctl_table(sysctl_header); + unregister_chrdev(CRASH_DUMP_MAJOR, "dump"); + dump_sysrq_unregister(); +} + +EXPORT_SYMBOL(dump_register_compression); +EXPORT_SYMBOL(dump_unregister_compression); +EXPORT_SYMBOL(dump_register_device); +EXPORT_SYMBOL(dump_unregister_device); +EXPORT_SYMBOL(dump_config); +EXPORT_SYMBOL(dump_silence_level); + +EXPORT_SYMBOL(__dump_irq_enable); +EXPORT_SYMBOL(__dump_irq_restore); + +MODULE_AUTHOR("Matt D. Robinson "); +MODULE_DESCRIPTION("Linux Kernel Crash Dump (LKCD) driver"); +MODULE_LICENSE("GPL"); + +module_init(dump_init); +module_exit(dump_cleanup); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/net/3c59x.c 900-mjb3/drivers/net/3c59x.c --- 000-bk3/drivers/net/3c59x.c Fri Feb 21 23:40:47 2003 +++ 900-mjb3/drivers/net/3c59x.c Sat Mar 8 22:05:11 2003 @@ -885,6 +885,7 @@ static void set_rx_mode(struct net_devic static int vortex_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); static void vortex_tx_timeout(struct net_device *dev); static void acpi_set_WOL(struct net_device *dev); +static void vorboom_poll(struct net_device *dev); /* This driver uses 'options' to pass the media type, full-duplex flag, etc. */ /* Option count limit only -- unlimited interfaces are supported. */ @@ -1444,6 +1445,9 @@ static int __devinit vortex_probe1(struc dev->set_multicast_list = set_rx_mode; dev->tx_timeout = vortex_tx_timeout; dev->watchdog_timeo = (watchdog * HZ) / 1000; +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &vorboom_poll; +#endif if (pdev && vp->enable_wol) { vp->pm_state_valid = 1; pci_save_state(VORTEX_PCI(vp), vp->power_state); @@ -2427,6 +2431,29 @@ static void boomerang_interrupt(int irq, handler_exit: spin_unlock(&vp->lock); } + +#ifdef HAVE_POLL_CONTROLLER + +/* + * Polling 'interrupt' - used by things like netconsole to send skbs + * without having to re-enable interrupts. It's not called while + * the interrupt routine is executing. 
+ */ + +static void vorboom_poll (struct net_device *dev) +{ + struct vortex_private *vp = (struct vortex_private *)dev->priv; + + disable_irq(dev->irq); + if (vp->full_bus_master_tx) + boomerang_interrupt(dev->irq, dev, 0); + else + vortex_interrupt(dev->irq, dev, 0); + enable_irq(dev->irq); +} + +#endif + static int vortex_rx(struct net_device *dev) { diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/net/e100/e100_main.c 900-mjb3/drivers/net/e100/e100_main.c --- 000-bk3/drivers/net/e100/e100_main.c Wed Mar 5 07:37:02 2003 +++ 900-mjb3/drivers/net/e100/e100_main.c Sat Mar 8 22:05:11 2003 @@ -551,6 +551,22 @@ e100_trigger_SWI(struct e100_private *bd readw(&(bdp->scb->scb_status)); /* flushes last write, read-safe */ } +#ifdef HAVE_POLL_CONTROLLER + +/* + * Polling 'interrupt' - used by things like netconsole to send skbs + * without having to re-enable interrupts. It's not called while + * the interrupt routine is executing. + */ +static void +e100_poll(struct net_device *dev) +{ + disable_irq(dev->irq); + e100intr(dev->irq, dev, NULL); + enable_irq(dev->irq); +} +#endif + static int __devinit e100_found1(struct pci_dev *pcid, const struct pci_device_id *ent) { @@ -569,6 +585,9 @@ e100_found1(struct pci_dev *pcid, const SET_MODULE_OWNER(dev); +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &e100_poll; +#endif if (first_time) { first_time = false; printk(KERN_NOTICE "%s - version %s\n", diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/net/e1000/e1000_main.c 900-mjb3/drivers/net/e1000/e1000_main.c --- 000-bk3/drivers/net/e1000/e1000_main.c Tue Feb 25 23:03:47 2003 +++ 900-mjb3/drivers/net/e1000/e1000_main.c Sat Mar 8 22:05:11 2003 @@ -162,6 +162,7 @@ static void e1000_leave_82542_rst(struct static inline void e1000_rx_checksum(struct e1000_adapter *adapter, struct e1000_rx_desc *rx_desc, struct sk_buff *skb); +static void e1000_Poll(struct net_device *dev); static void e1000_tx_timeout(struct net_device *dev); static void e1000_tx_timeout_task(struct net_device *dev); @@ -406,6 +407,9 @@ e1000_probe(struct pci_dev *pdev, adapter->bd_number = cards_found; adapter->id_string = e1000_strings[ent->driver_data]; +#ifdef HAVE_POLL_CONTROLLER + netdev->poll_controller = &e1000_Poll; +#endif /* setup the private structure */ if(e1000_sw_init(adapter)) @@ -1834,6 +1838,15 @@ e1000_intr(int irq, void *data, struct p } #endif } + +#ifdef HAVE_POLL_CONTROLLER +static void e1000_Poll(struct net_device *dev) +{ + disable_irq(dev->irq); + e1000_intr(dev->irq, dev, NULL); + enable_irq(dev->irq); +} +#endif #ifdef CONFIG_E1000_NAPI static int diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/net/eepro100.c 900-mjb3/drivers/net/eepro100.c --- 000-bk3/drivers/net/eepro100.c Wed Mar 5 07:37:02 2003 +++ 900-mjb3/drivers/net/eepro100.c Sat Mar 8 22:05:11 2003 @@ -544,6 +544,7 @@ static void speedo_refill_rx_buffers(str static int speedo_rx(struct net_device *dev); static void speedo_tx_buffer_gc(struct net_device *dev); static void speedo_interrupt(int irq, void *dev_instance, struct pt_regs *regs); +static void poll_speedo (struct net_device *dev); static int speedo_close(struct net_device *dev); static struct net_device_stats *speedo_get_stats(struct net_device *dev); static int speedo_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); @@ -879,6 +880,9 @@ static int __devinit speedo_found1(struc dev->get_stats = &speedo_get_stats; dev->set_multicast_list = &set_rx_mode; dev->do_ioctl = &speedo_ioctl; +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &poll_speedo; +#endif 
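The vorboom_poll()/e100_poll()/e1000_Poll() wrappers in these hunks all follow one discipline: mask the device's interrupt, run the normal ISR by hand, unmask. A userspace analogy using POSIX signal blocking, with SIGALRM standing in for the device IRQ and a plain C function for the ISR:

#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t events;

static void isr(int sig)		/* stands in for vortex_interrupt() */
{
	(void)sig;
	events++;
}

static void poll_controller(void)
{
	sigset_t set, old;

	sigemptyset(&set);
	sigaddset(&set, SIGALRM);
	sigprocmask(SIG_BLOCK, &set, &old);	/* disable_irq(dev->irq)  */
	isr(SIGALRM);				/* call the handler by hand */
	sigprocmask(SIG_SETMASK, &old, NULL);	/* enable_irq(dev->irq)   */
}

int main(void)
{
	signal(SIGALRM, isr);
	poll_controller();
	printf("events handled: %d\n", (int)events);
	return 0;
}

The point of the disable_irq()/enable_irq() bracket is that the real interrupt cannot fire while the ISR is being called synchronously, so the handler never races against itself, which is exactly the guarantee the comment above promises to netconsole.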
return 0; } @@ -1658,6 +1662,23 @@ static void speedo_interrupt(int irq, vo clear_bit(0, (void*)&sp->in_interrupt); return; } + +#ifdef HAVE_POLL_CONTROLLER + +/* + * Polling 'interrupt' - used by things like netconsole to send skbs + * without having to re-enable interrupts. It's not called while + * the interrupt routine is executing. + */ + +static void poll_speedo (struct net_device *dev) +{ + disable_irq(dev->irq); + speedo_interrupt (dev->irq, dev, NULL); + enable_irq(dev->irq); +} + +#endif static inline struct RxFD *speedo_rx_alloc(struct net_device *dev, int entry) { diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/net/smc-ultra.c 900-mjb3/drivers/net/smc-ultra.c --- 000-bk3/drivers/net/smc-ultra.c Wed Mar 5 07:37:02 2003 +++ 900-mjb3/drivers/net/smc-ultra.c Sat Mar 8 22:05:11 2003 @@ -122,6 +122,14 @@ MODULE_DEVICE_TABLE(isapnp, ultra_device #define ULTRA_IO_EXTENT 32 #define EN0_ERWCNT 0x08 /* Early receive warning count. */ + +static void ultra_poll(struct net_device *dev) +{ + disable_irq(dev->irq); + ei_interrupt(dev->irq, dev, NULL); + enable_irq(dev->irq); +} + /* Probe for the Ultra. This looks like a 8013 with the station address PROM at I/O ports +8 to +13, with a checksum following. @@ -134,6 +142,9 @@ int __init ultra_probe(struct net_device SET_MODULE_OWNER(dev); +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &ultra_poll; +#endif if (base_addr > 0x1ff) /* Check a single specified location. */ return ultra_probe1(dev, base_addr); else if (base_addr != 0) /* Don't probe at all. */ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/net/tlan.c 900-mjb3/drivers/net/tlan.c --- 000-bk3/drivers/net/tlan.c Wed Mar 5 07:37:02 2003 +++ 900-mjb3/drivers/net/tlan.c Sat Mar 8 22:05:11 2003 @@ -345,6 +345,8 @@ static int TLan_EeSendByte( u16, u8, int static void TLan_EeReceiveByte( u16, u8 *, int ); static int TLan_EeReadByte( struct net_device *, u8, u8 * ); +static void TLan_Poll(struct net_device *); + static TLanIntVectorFunc *TLanIntVector[TLAN_INT_NUMBER_OF_INTS] = { TLan_HandleInvalid, @@ -854,6 +856,9 @@ static int TLan_Init( struct net_device dev->get_stats = &TLan_GetStats; dev->set_multicast_list = &TLan_SetMulticastList; dev->do_ioctl = &TLan_ioctl; +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &TLan_Poll; +#endif dev->tx_timeout = &TLan_tx_timeout; dev->watchdog_timeo = TX_TIMEOUT; @@ -1136,7 +1141,14 @@ static void TLan_HandleInterrupt(int irq } /* TLan_HandleInterrupts */ - +#ifdef HAVE_POLL_CONTROLLER +static void TLan_Poll(struct net_device *dev) +{ + disable_irq(dev->irq); + TLan_HandleInterrupt(dev->irq, dev, NULL); + enable_irq(dev->irq); +} +#endif /*************************************************************** diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/net/tulip/tulip_core.c 900-mjb3/drivers/net/tulip/tulip_core.c --- 000-bk3/drivers/net/tulip/tulip_core.c Wed Mar 5 07:37:02 2003 +++ 900-mjb3/drivers/net/tulip/tulip_core.c Sat Mar 8 22:05:11 2003 @@ -242,6 +242,7 @@ static void tulip_down(struct net_device static struct net_device_stats *tulip_get_stats(struct net_device *dev); static int private_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); static void set_rx_mode(struct net_device *dev); +static void poll_tulip(struct net_device *dev); @@ -1618,6 +1619,9 @@ static int __devinit tulip_init_one (str dev->get_stats = tulip_get_stats; dev->do_ioctl = private_ioctl; dev->set_multicast_list = set_rx_mode; +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &poll_tulip; +#endif if (register_netdev(dev)) goto 
err_out_free_ring; @@ -1773,6 +1777,24 @@ static void __devexit tulip_remove_one ( /* pci_power_off (pdev, -1); */ } + + +#ifdef HAVE_POLL_CONTROLLER + +/* + * Polling 'interrupt' - used by things like netconsole to send skbs + * without having to re-enable interrupts. It's not called while + * the interrupt routine is executing. + */ + +static void poll_tulip (struct net_device *dev) +{ + disable_irq(dev->irq); + tulip_interrupt (dev->irq, dev, NULL); + enable_irq(dev->irq); +} + +#endif static struct pci_driver tulip_driver = { diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/serial/8250.c 900-mjb3/drivers/serial/8250.c --- 000-bk3/drivers/serial/8250.c Sat Mar 8 10:16:05 2003 +++ 900-mjb3/drivers/serial/8250.c Sat Mar 8 22:05:30 2003 @@ -1982,10 +1982,11 @@ static struct console serial8250_console .index = -1, }; -static void __init serial8250_console_init(void) +static int __init serial8250_console_init(void) { serial8250_isa_init_ports(); register_console(&serial8250_console); + return 0; } console_initcall(serial8250_console_init); @@ -2094,9 +2095,116 @@ void serial8250_get_irq_map(unsigned int } } -static int __init serial8250_init(void) +#ifdef CONFIG_X86_REMOTE_DEBUG +/* + * Takes: + * ttyS - integer specifying which serial port to use for debugging + * baud - baud rate of specified serial port + * Returns: + * port for use by the gdb serial driver + */ +int gdb_serial_setup(int ttyS, int baud, int *port, int *irq) +{ + struct uart_8250_port *up; + unsigned cval; + int bits = 8; + int parity = 'n'; + int cflag = CREAD | HUPCL | CLOCAL; + int quot = 0; + + /* + * Now construct a cflag setting. + */ + switch(baud) { + case 1200: + cflag |= B1200; + break; + case 2400: + cflag |= B2400; + break; + case 4800: + cflag |= B4800; + break; + case 19200: + cflag |= B19200; + break; + case 38400: + cflag |= B38400; + break; + case 57600: + cflag |= B57600; + break; + case 115200: + cflag |= B115200; + break; + case 9600: + default: + cflag |= B9600; + break; + } + switch(bits) { + case 7: + cflag |= CS7; + break; + default: + case 8: + cflag |= CS8; + break; + } + switch(parity) { + case 'o': case 'O': + cflag |= PARODD; + break; + case 'e': case 'E': + cflag |= PARENB; + break; + } + + /* + * Divisor, bytesize and parity + */ + + up = &serial8250_ports[ttyS]; +// ser->flags &= ~ASYNC_BOOT_AUTOCONF; + quot = ( 1843200 / 16 ) / baud; + cval = cflag & (CSIZE | CSTOPB); + cval >>= 4; + if (cflag & PARENB) + cval |= UART_LCR_PARITY; + if (!(cflag & PARODD)) + cval |= UART_LCR_EPAR; + + /* + * Disable UART interrupts, set DTR and RTS high + * and set speed. + */ + cval = 0x3; + serial_outp(up, UART_LCR, cval | UART_LCR_DLAB); /* set DLAB */ + serial_outp(up, UART_DLL, quot & 0xff); /* LS of divisor */ + serial_outp(up, UART_DLM, quot >> 8); /* MS of divisor */ + serial_outp(up, UART_LCR, cval); /* reset DLAB */ + serial_outp(up, UART_IER, UART_IER_RDI); /* turn on interrupts*/ + serial_outp(up, UART_MCR, UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS); + + /* + * If we read 0xff from the LSR, there is no UART here. + */ + if (serial_inp(up, UART_LSR) == 0xff) + return 1; + *port = up->port.iobase; + *irq = up->port.irq; +// serial8250_shutdown(&up->port); + return 0; +} +#endif + +int serial8250_init(void) { int ret, i; + static int didit = 0; + + if (didit++) + return 0; printk(KERN_INFO "Serial: 8250/16550 driver $Revision: 1.90 $ " "IRQ sharing %sabled\n", share_irqs ? 
"en" : "dis"); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/drivers/serial/core.c 900-mjb3/drivers/serial/core.c --- 000-bk3/drivers/serial/core.c Sat Mar 8 10:16:05 2003 +++ 900-mjb3/drivers/serial/core.c Sat Mar 8 22:04:47 2003 @@ -36,6 +36,10 @@ #include #include /* for serial_state and serial_icounter_struct */ +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + #include #include @@ -1043,6 +1047,17 @@ uart_ioctl(struct tty_struct *tty, struc (unsigned int *)arg); break; +#ifdef CONFIG_X86_REMOTE_DEBUG + case TIOCGDB: + ret = -ENOTTY; + if (capable(CAP_SYS_ADMIN)) { + gdb_ttyS = minor(tty->device) & 0x03F; + gdb_baud = tty_get_baud_rate(tty); + ret = gdb_hook(); + } + break; +#endif + case TIOCMBIS: case TIOCMBIC: case TIOCMSET: @@ -1118,6 +1133,30 @@ uart_ioctl(struct tty_struct *tty, struc } return ret; } + + /* + * ------------------------------------------------------------ + * Serial GDB driver (most in gdbserial.c) + * ------------------------------------------------------------ + */ + +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_GDB_CONSOLE +static struct console gdbcons = { + name: "gdb", + write: gdb_console_write, + flags: CON_PRINTBUFFER | CON_ENABLED, + index: -1, +}; +#endif + +#ifdef CONFIG_GDB_CONSOLE +void __init gdb_console_init(void) +{ + register_console(&gdbcons); +} +#endif +#endif /* CONFIG_X86_REMOTE_DEBUG */ static void uart_set_termios(struct tty_struct *tty, struct termios *old_termios) { diff -urpN -X /home/fletch/.diff.exclude 000-bk3/fs/dcache.c 900-mjb3/fs/dcache.c --- 000-bk3/fs/dcache.c Wed Mar 5 07:37:05 2003 +++ 900-mjb3/fs/dcache.c Wed Mar 12 19:45:37 2003 @@ -24,6 +24,7 @@ #include #include #include +#include #include #define DCACHE_PARANOIA 1 @@ -1571,7 +1572,7 @@ void __init vfs_caches_init(unsigned lon filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, filp_ctor, filp_dtor); if(!filp_cachep) panic("Cannot create filp SLAB cache"); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/fs/exec.c 900-mjb3/fs/exec.c --- 000-bk3/fs/exec.c Tue Feb 25 23:03:49 2003 +++ 900-mjb3/fs/exec.c Sat Mar 8 22:04:41 2003 @@ -316,6 +316,7 @@ void put_dirty_page(struct task_struct * lru_cache_add_active(page); flush_dcache_page(page); flush_page_to_ram(page); + SetPageAnon(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/fs/file_table.c 900-mjb3/fs/file_table.c --- 000-bk3/fs/file_table.c Tue Feb 25 23:03:49 2003 +++ 900-mjb3/fs/file_table.c Wed Mar 12 19:47:00 2003 @@ -22,72 +22,81 @@ struct files_stat_struct files_stat = { .max_files = NR_FILE }; -/* Here the new files go */ -static LIST_HEAD(anon_list); -/* And here the free ones sit */ -static LIST_HEAD(free_list); /* public *and* exported. Not pretty! 
*/ -spinlock_t files_lock = SPIN_LOCK_UNLOCKED; +spinlock_t files_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; + +static spinlock_t filp_count_lock = SPIN_LOCK_UNLOCKED; + +/* slab constructors and destructors are called from arbitrary + * context and must be fully threaded - use a local spinlock + * to protect files_stat.nr_files + */ +void filp_ctor(void * objp, struct kmem_cache_s *cachep, unsigned long cflags) +{ + if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + unsigned long flags; + spin_lock_irqsave(&filp_count_lock, flags); + files_stat.nr_files++; + spin_unlock_irqrestore(&filp_count_lock, flags); + } +} + +void filp_dtor(void * objp, struct kmem_cache_s *cachep, unsigned long dflags) +{ + unsigned long flags; + spin_lock_irqsave(&filp_count_lock, flags); + files_stat.nr_files--; + spin_unlock_irqrestore(&filp_count_lock, flags); +} + +static inline void file_free(struct file *f) +{ + kmem_cache_free(filp_cachep, f); +} /* Find an unused file structure and return a pointer to it. * Returns NULL, if there are no more free file structures or * we run out of memory. - * - * SMP-safe. */ -struct file * get_empty_filp(void) +struct file *get_empty_filp(void) { - static int old_max = 0; +static int old_max = 0; struct file * f; - file_list_lock(); - if (files_stat.nr_free_files > NR_RESERVED_FILES) { - used_one: - f = list_entry(free_list.next, struct file, f_list); - list_del(&f->f_list); - files_stat.nr_free_files--; - new_one: - memset(f, 0, sizeof(*f)); - if (security_file_alloc(f)) { - list_add(&f->f_list, &free_list); - files_stat.nr_free_files++; - file_list_unlock(); - return NULL; - } - eventpoll_init_file(f); - atomic_set(&f->f_count,1); - f->f_version = 0; - f->f_uid = current->fsuid; - f->f_gid = current->fsgid; - f->f_owner.lock = RW_LOCK_UNLOCKED; - list_add(&f->f_list, &anon_list); - file_list_unlock(); - return f; - } - /* - * Use a reserved one if we're the superuser - */ - if (files_stat.nr_free_files && !current->euid) - goto used_one; /* - * Allocate a new one if we're below the limit. + * Privileged users can go above max_files */ - if (files_stat.nr_files < files_stat.max_files) { - file_list_unlock(); - f = kmem_cache_alloc(filp_cachep, SLAB_KERNEL); - file_list_lock(); + if (files_stat.nr_files < files_stat.max_files || + capable(CAP_SYS_ADMIN)) { + f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (f) { - files_stat.nr_files++; - goto new_one; + memset(f, 0, sizeof(*f)); + if (security_file_alloc(f)) { + file_free(f); + goto fail; + } + eventpoll_init_file(f); + atomic_set(&f->f_count, 1); + f->f_uid = current->fsuid; + f->f_gid = current->fsgid; + f->f_owner.lock = RW_LOCK_UNLOCKED; + /* f->f_version: 0 */ + INIT_LIST_HEAD(&f->f_list); + return f; } - /* Big problems... */ - printk(KERN_WARNING "VFS: filp allocation failed\n"); + } - } else if (files_stat.max_files > old_max) { - printk(KERN_INFO "VFS: file-max limit %d reached\n", files_stat.max_files); + /* Ran out of filps - report that */ + if (files_stat.max_files >= old_max) { + printk(KERN_INFO "VFS: file-max limit %d reached\n", + files_stat.max_files); old_max = files_stat.max_files; + } else { + /* Big problems... 
*/ + printk(KERN_WARNING "VFS: filp allocation failed\n"); } - file_list_unlock(); +fail: return NULL; } @@ -106,6 +115,7 @@ int init_private_file(struct file *filp, filp->f_uid = current->fsuid; filp->f_gid = current->fsgid; filp->f_op = dentry->d_inode->i_fop; + INIT_LIST_HEAD(&filp->f_list); if (filp->f_op->open) return filp->f_op->open(dentry->d_inode, filp); else @@ -121,11 +131,11 @@ void fput(struct file * file) /* __fput is called from task context when aio completion releases the last * last use of a struct file *. Do not use otherwise. */ -void __fput(struct file * file) +void __fput(struct file *file) { - struct dentry * dentry = file->f_dentry; - struct vfsmount * mnt = file->f_vfsmnt; - struct inode * inode = dentry->d_inode; + struct dentry *dentry = file->f_dentry; + struct vfsmount *mnt = file->f_vfsmnt; + struct inode *inode = dentry->d_inode; /* * The function eventpoll_release() should be the first called @@ -144,16 +154,15 @@ void __fput(struct file * file) file->f_dentry = NULL; file->f_vfsmnt = NULL; list_del(&file->f_list); - list_add(&file->f_list, &free_list); - files_stat.nr_free_files++; file_list_unlock(); + file_free(file); dput(dentry); mntput(mnt); } -struct file * fget(unsigned int fd) +struct file *fget(unsigned int fd) { - struct file * file; + struct file *file; struct files_struct *files = current->files; read_lock(&files->file_lock); @@ -164,17 +173,14 @@ struct file * fget(unsigned int fd) return file; } -/* Here. put_filp() is SMP-safe now. */ - void put_filp(struct file *file) { - if(atomic_dec_and_test(&file->f_count)) { + if (atomic_dec_and_test(&file->f_count)) { security_file_free(file); file_list_lock(); list_del(&file->f_list); - list_add(&file->f_list, &free_list); - files_stat.nr_free_files++; file_list_unlock(); + file_free(file); } } @@ -183,8 +189,7 @@ void file_move(struct file *file, struct if (!list) return; file_list_lock(); - list_del(&file->f_list); - list_add(&file->f_list, list); + list_move(&file->f_list, list); file_list_unlock(); } @@ -209,7 +214,7 @@ int fs_may_remount_ro(struct super_block if (inode->i_nlink == 0) goto too_bad; - /* Writable file? */ + /* Writeable file? 
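To see why the filp_ctor()/filp_dtor() pair above can retire the old free_list bookkeeping: the constructor and destructor fire once per object entering and leaving the slab cache, so files_stat.nr_files stays correct under its own lock, and get_empty_filp() only has to compare it against max_files. A compressed userspace model, with a pthread mutex in place of filp_count_lock and the privileged flag mirroring the capable(CAP_SYS_ADMIN) bypass (nr_files here counts live allocations, where the kernel's count covers cached slab objects too):

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

static pthread_mutex_t filp_count_lock = PTHREAD_MUTEX_INITIALIZER;
static struct { int nr_files, max_files; } files_stat = { 0, 3 };

struct file { int dummy; };

static void filp_ctor(void)		/* object constructed in the cache */
{
	pthread_mutex_lock(&filp_count_lock);
	files_stat.nr_files++;
	pthread_mutex_unlock(&filp_count_lock);
}

static struct file *get_empty_filp(int privileged)
{
	if (files_stat.nr_files >= files_stat.max_files && !privileged) {
		printf("VFS: file-max limit %d reached\n",
		       files_stat.max_files);
		return NULL;
	}
	filp_ctor();
	return calloc(1, sizeof(struct file));
}

int main(void)
{
	int i;

	for (i = 0; i < 5; i++)
		if (!get_empty_filp(0))
			break;
	printf("allocated %d of 5\n", files_stat.nr_files);
	return 0;
}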
 */ if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) goto too_bad; } diff -urpN -X /home/fletch/.diff.exclude 000-bk3/fs/inode.c 900-mjb3/fs/inode.c --- 000-bk3/fs/inode.c Sat Mar 8 10:16:05 2003 +++ 900-mjb3/fs/inode.c Sat Mar 8 22:04:26 2003 @@ -1088,16 +1088,19 @@ sector_t bmap(struct inode * inode, sect void update_atime(struct inode *inode) { - struct timespec now = CURRENT_TIME; + struct timespec now; - /* Can later do this more lazily with a per superblock interval */ - if (timespec_equal(&inode->i_atime, &now)) - return; if (IS_NOATIME(inode)) return; if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode)) return; if (IS_RDONLY(inode)) + return; + + now = CURRENT_TIME; + + /* Can later do this more lazily with a per superblock interval */ + if (timespec_equal(&inode->i_atime, &now)) return; inode->i_atime = now; mark_inode_dirty_sync(inode); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/fs/proc/proc_misc.c 900-mjb3/fs/proc/proc_misc.c --- 000-bk3/fs/proc/proc_misc.c Wed Mar 5 07:37:06 2003 +++ 900-mjb3/fs/proc/proc_misc.c Sat Mar 8 22:05:12 2003 @@ -97,6 +97,37 @@ static int loadavg_read_proc(char *page, return proc_calc_metrics(page, start, off, count, eof, len); } +static int real_loadavg_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int a, b, c, cpu; + int len; + + a = tasks_running[0] + (FIXED_1/200); + b = tasks_running[1] + (FIXED_1/200); + c = tasks_running[2] + (FIXED_1/200); + len = sprintf(page,"Domain load1 load2 load3 nr_run/nr_thrd\n"); + len += sprintf(page+len,"SYSTEM %5d.%02d %5d.%02d %5d.%02d %7ld/%7d\n", + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + nr_running(), nr_threads); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!cpu_online(cpu)) + continue; + a = cpu_tasks_running[0][cpu] + (FIXED_1/200); + b = cpu_tasks_running[1][cpu] + (FIXED_1/200); + c = cpu_tasks_running[2][cpu] + (FIXED_1/200); + len += sprintf(page+len, "%5d %5d.%02d %5d.%02d %5d.%02d %7ld/%7d\n", + cpu, + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + nr_running_cpu(cpu), nr_threads); + } + return proc_calc_metrics(page, start, off, count, eof, len); +} + static int uptime_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -133,6 +164,40 @@ static int uptime_read_proc(char *page, return proc_calc_metrics(page, start, off, count, eof, len); } +struct vmalloc_info { + unsigned long used; + unsigned long largest_chunk; +}; + +static struct vmalloc_info get_vmalloc_info(void) +{ + unsigned long prev_end = VMALLOC_START; + struct vm_struct* vma; + struct vmalloc_info vmi; + vmi.used = 0; + + read_lock(&vmlist_lock); + + if(!vmlist) + vmi.largest_chunk = (VMALLOC_END-VMALLOC_START); + else + vmi.largest_chunk = 0; + + for (vma = vmlist; vma; vma = vma->next) { + unsigned long free_area_size = + (unsigned long)vma->addr - prev_end; + vmi.used += vma->size; + if (vmi.largest_chunk < free_area_size) + vmi.largest_chunk = free_area_size; + prev_end = vma->size + (unsigned long)vma->addr; + } + if(VMALLOC_END-prev_end > vmi.largest_chunk) + vmi.largest_chunk = VMALLOC_END-prev_end; + + read_unlock(&vmlist_lock); + return vmi; +} + extern atomic_t vm_committed_space; static int meminfo_read_proc(char *page, char **start, off_t off, @@ -144,6 +209,8 @@ static int meminfo_read_proc(char *page, unsigned long inactive; unsigned long active; unsigned long free; + unsigned long vmtot; + struct vmalloc_info vmi; get_page_state(&ps);
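get_vmalloc_info() above is a single pass over the address-sorted vmlist: sum the allocation sizes, and track the largest gap between consecutive allocations, including the tail gap up to VMALLOC_END. A standalone model with two fake allocations; the VMALLOC_START/VMALLOC_END values are made up for the example:

#include <stdio.h>

#define VMALLOC_START	0xe0000000UL	/* illustrative bounds */
#define VMALLOC_END	0xff000000UL

struct vm_struct {
	unsigned long addr, size;
	struct vm_struct *next;
};

int main(void)
{
	struct vm_struct c = { 0xf0000000UL, 0x100000UL, NULL };
	struct vm_struct b = { 0xe1000000UL, 0x200000UL, &c };
	struct vm_struct *vmlist = &b, *vma;

	unsigned long prev_end = VMALLOC_START;
	unsigned long used = 0, largest = 0;

	for (vma = vmlist; vma; vma = vma->next) {
		unsigned long hole = vma->addr - prev_end;

		used += vma->size;
		if (hole > largest)
			largest = hole;
		prev_end = vma->addr + vma->size;
	}
	if (VMALLOC_END - prev_end > largest)
		largest = VMALLOC_END - prev_end;	/* tail hole */

	printf("used %lu KB, largest chunk %lu KB\n",
	       used >> 10, largest >> 10);
	return 0;
}

The same three numbers, shifted to kilobytes, are what meminfo_read_proc() exports as VmallocTotal/VmallocUsed/VmallocChunk below.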
get_zone_counts(&active, &inactive, &free); @@ -156,6 +223,11 @@ static int meminfo_read_proc(char *page, si_swapinfo(&i); committed = atomic_read(&vm_committed_space); + vmtot = (VMALLOC_END-VMALLOC_START)>>10; + vmi = get_vmalloc_info(); + vmi.used >>= 10; + vmi.largest_chunk >>= 10; + /* * Tagged format, for easy grepping and expansion. */ @@ -179,7 +251,10 @@ static int meminfo_read_proc(char *page, "Slab: %8lu kB\n" "Committed_AS: %8u kB\n" "PageTables: %8lu kB\n" - "ReverseMaps: %8lu\n", + "ReverseMaps: %8lu\n" + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n", K(i.totalram), K(i.freeram), K(i.bufferram), @@ -199,7 +274,10 @@ static int meminfo_read_proc(char *page, K(ps.nr_slab), K(committed), K(ps.nr_page_table_pages), - ps.nr_reverse_maps + ps.nr_reverse_maps, + vmtot, + vmi.used, + vmi.largest_chunk ); len += hugetlb_report_meminfo(page + len); @@ -257,6 +335,9 @@ static struct file_operations proc_vmsta .release = seq_release, }; +extern int schedstats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data); + #ifdef CONFIG_PROC_HARDWARE static int hardware_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -301,6 +382,71 @@ static struct file_operations proc_modul }; #endif +#ifdef CONFIG_NUMA +#define K(x) ((x) << (PAGE_SHIFT - 10)) +static int show_meminfo_numa (struct seq_file *m, void *v) +{ + int *d = v; + int nid = *d; + struct sysinfo i; + si_meminfo_node(&i, nid); + seq_printf(m, "\n" + "Node %d MemTotal: %8lu kB\n" + "Node %d MemFree: %8lu kB\n" + "Node %d MemUsed: %8lu kB\n" + "Node %d HighTotal: %8lu kB\n" + "Node %d HighFree: %8lu kB\n" + "Node %d LowTotal: %8lu kB\n" + "Node %d LowFree: %8lu kB\n", + nid, K(i.totalram), + nid, K(i.freeram), + nid, K(i.totalram-i.freeram), + nid, K(i.totalhigh), + nid, K(i.freehigh), + nid, K(i.totalram-i.totalhigh), + nid, K(i.freeram-i.freehigh)); + + return 0; +} +#undef K + +extern struct seq_operations meminfo_numa_op; +static int meminfo_numa_open(struct inode *inode, struct file *file) +{ + return seq_open(file,&meminfo_numa_op); +} + +static struct file_operations proc_meminfo_numa_operations = { + open: meminfo_numa_open, + read: seq_read, + llseek: seq_lseek, + release: seq_release, +}; + +static void *meminfo_numa_start(struct seq_file *m, loff_t *pos) +{ + return *pos < numnodes ? 
pos : NULL; +} + +static void *meminfo_numa_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return meminfo_numa_start(m, pos); +} + +static void meminfo_numa_stop(struct seq_file *m, void *v) +{ +} + +struct seq_operations meminfo_numa_op = { + .start = meminfo_numa_start, + .next = meminfo_numa_next, + .stop = meminfo_numa_stop, + .show = show_meminfo_numa, +}; + +#endif + extern struct seq_operations slabinfo_op; extern ssize_t slabinfo_write(struct file *, const char *, size_t, loff_t *); static int slabinfo_open(struct inode *inode, struct file *file) @@ -543,6 +689,7 @@ void __init proc_misc_init(void) int (*read_proc)(char*,char**,off_t,int,int*,void*); } *p, simple_ones[] = { {"loadavg", loadavg_read_proc}, + {"real_loadavg",real_loadavg_read_proc}, {"uptime", uptime_read_proc}, {"meminfo", meminfo_read_proc}, {"version", version_read_proc}, @@ -561,6 +708,7 @@ void __init proc_misc_init(void) #endif {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, + {"schedstat", schedstats_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) @@ -582,6 +730,9 @@ void __init proc_misc_init(void) create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations); #ifdef CONFIG_MODULES create_seq_entry("modules", 0, &proc_modules_operations); +#endif +#ifdef CONFIG_NUMA + create_seq_entry("meminfo.numa",0,&proc_meminfo_numa_operations); #endif proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { diff -urpN -X /home/fletch/.diff.exclude 000-bk3/fs/sysfs/dir.c 900-mjb3/fs/sysfs/dir.c --- 000-bk3/fs/sysfs/dir.c Sat Mar 8 10:16:05 2003 +++ 900-mjb3/fs/sysfs/dir.c Wed Mar 12 19:47:37 2003 @@ -98,7 +98,6 @@ void sysfs_remove_dir(struct kobject * k * Unlink and unhash. */ spin_unlock(&dcache_lock); - d_delete(d); simple_unlink(dentry->d_inode,d); dput(d); spin_lock(&dcache_lock); @@ -108,16 +107,11 @@ void sysfs_remove_dir(struct kobject * k } up(&dentry->d_inode->i_sem); - d_invalidate(dentry); - simple_rmdir(parent->d_inode,dentry); d_delete(dentry); + simple_rmdir(parent->d_inode,dentry); pr_debug(" o %s removing done (%d)\n",dentry->d_name.name, atomic_read(&dentry->d_count)); - /** - * Drop reference from initial sysfs_get_dentry(). - */ - dput(dentry); /** * Drop reference from dget() on entrance. diff -urpN -X /home/fletch/.diff.exclude 000-bk3/fs/sysfs/inode.c 900-mjb3/fs/sysfs/inode.c --- 000-bk3/fs/sysfs/inode.c Wed Mar 5 07:37:06 2003 +++ 900-mjb3/fs/sysfs/inode.c Wed Mar 12 19:47:37 2003 @@ -93,19 +93,14 @@ void sysfs_hash_and_remove(struct dentry /* make sure dentry is really there */ if (victim->d_inode && (victim->d_parent->d_inode == dir->d_inode)) { - simple_unlink(dir->d_inode,victim); - d_delete(victim); - pr_debug("sysfs: Removing %s (%d)\n", victim->d_name.name, atomic_read(&victim->d_count)); - /* - * Drop reference from initial sysfs_get_dentry(). - */ - dput(victim); + + simple_unlink(dir->d_inode,victim); + } - - /** - * Drop the reference acquired from sysfs_get_dentry() above. + /* + * Drop reference from sysfs_get_dentry() above. */ dput(victim); } diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/bug.h 900-mjb3/include/asm-i386/bug.h --- 000-bk3/include/asm-i386/bug.h Sat Mar 8 10:16:06 2003 +++ 900-mjb3/include/asm-i386/bug.h Sat Mar 8 22:04:47 2003 @@ -9,6 +9,11 @@ * undefined" opcode for parsing in the trap handler. 
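The bug.h change that follows swaps the ud2 invalid opcode for int $0x3 when CONFIG_X86_REMOTE_DEBUG is set: a breakpoint trap is something the remote gdb stub can catch and resume from, where ud2 simply faults. A small x86-only demonstration of what int3 raises, catching the resulting SIGTRAP in userspace:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void on_trap(int sig)
{
	(void)sig;
	puts("caught SIGTRAP from int3");
	exit(0);
}

int main(void)
{
	signal(SIGTRAP, on_trap);
	__asm__ __volatile__("int $0x3");	/* the opcode the new BUG() emits */
	return 1;				/* not reached */
}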
*/ +#ifdef CONFIG_X86_REMOTE_DEBUG +#define BUG() do { \ + asm ("int $0x3"); \ +} while (0) +#else #if 1 /* Set to zero for a slightly smaller kernel */ #define BUG() \ __asm__ __volatile__( "ud2\n" \ @@ -17,6 +22,7 @@ : : "i" (__LINE__), "i" (__FILE__)) #else #define BUG() __asm__ __volatile__("ud2\n") +#endif #endif #define PAGE_BUG(page) do { \ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/dump.h 900-mjb3/include/asm-i386/dump.h --- 000-bk3/include/asm-i386/dump.h Wed Dec 31 16:00:00 1969 +++ 900-mjb3/include/asm-i386/dump.h Sat Mar 8 22:05:11 2003 @@ -0,0 +1,75 @@ +/* + * Kernel header file for Linux crash dumps. + * + * Created by: Matt Robinson (yakker@sgi.com) + * + * Copyright 1999 Silicon Graphics, Inc. All rights reserved. + * + * This code is released under version 2 of the GNU GPL. + */ + +/* This header file holds the architecture specific crash dump header */ +#ifndef _ASM_DUMP_H +#define _ASM_DUMP_H + +/* necessary header files */ +#include +#include +#include +#include + +/* definitions */ +#define DUMP_ASM_MAGIC_NUMBER 0xdeaddeadULL /* magic number */ +#define DUMP_ASM_VERSION_NUMBER 0x3 /* version number */ + +/* max number of cpus */ +#define DUMP_MAX_NUM_CPUS 32 + +/* + * Structure: __dump_header_asm + * Function: This is the header for architecture-specific stuff. It + * follows right after the dump header. + */ +struct __dump_header_asm { + /* the dump magic number -- unique to verify dump is valid */ + u64 dha_magic_number; + + /* the version number of this dump */ + u32 dha_version; + + /* the size of this header (in case we can't read it) */ + u32 dha_header_size; + + /* the esp for i386 systems */ + u32 dha_esp; + + /* the eip for i386 systems */ + u32 dha_eip; + + /* the dump registers */ + struct pt_regs dha_regs; + + /* smp specific */ + u32 dha_smp_num_cpus; + u32 dha_dumping_cpu; + struct pt_regs dha_smp_regs[DUMP_MAX_NUM_CPUS]; + u32 dha_smp_current_task[DUMP_MAX_NUM_CPUS]; + u32 dha_stack[DUMP_MAX_NUM_CPUS]; + u32 dha_stack_ptr[DUMP_MAX_NUM_CPUS]; +} __attribute__((packed)); + +#ifdef __KERNEL__ + +extern struct __dump_header_asm dump_header_asm; + +#ifdef CONFIG_SMP +extern unsigned long irq_affinity[]; +extern int (*dump_ipi_function_ptr)(struct pt_regs *); +extern void dump_send_ipi(void); +#else +#define dump_send_ipi() do { } while(0) +#endif + +#endif /* __KERNEL__ */ + +#endif /* _ASM_DUMP_H */ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/e820.h 900-mjb3/include/asm-i386/e820.h --- 000-bk3/include/asm-i386/e820.h Sun Nov 17 20:29:50 2002 +++ 900-mjb3/include/asm-i386/e820.h Sat Mar 8 22:05:11 2003 @@ -35,6 +35,7 @@ struct e820map { }; extern struct e820map e820; + #endif/*!__ASSEMBLY__*/ #endif/*__E820_HEADER*/ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/early_printk.h 900-mjb3/include/asm-i386/early_printk.h --- 000-bk3/include/asm-i386/early_printk.h Wed Dec 31 16:00:00 1969 +++ 900-mjb3/include/asm-i386/early_printk.h Sat Mar 8 22:03:43 2003 @@ -0,0 +1,8 @@ +#ifndef __EARLY_PRINTK_H_I386_ +#define __EARLY_PRINTK_H_i386_ + +#define VGABASE 0xB8000 +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/ioctls.h 900-mjb3/include/asm-i386/ioctls.h --- 000-bk3/include/asm-i386/ioctls.h Sun Nov 17 20:29:22 2002 +++ 900-mjb3/include/asm-i386/ioctls.h Sat Mar 8 22:04:47 2003 @@ -68,6 +68,7 @@ #define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ #define TIOCSHAYESESP 0x545F /* Set Hayes ESP 
configuration */ #define FIOQSIZE 0x5460 +#define TIOCGDB 0x547F /* enable GDB stub mode on this tty */ /* Used for packet mode */ #define TIOCPKT_DATA 0 diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/kdebug.h 900-mjb3/include/asm-i386/kdebug.h --- 000-bk3/include/asm-i386/kdebug.h Wed Dec 31 16:00:00 1969 +++ 900-mjb3/include/asm-i386/kdebug.h Sat Mar 8 22:05:11 2003 @@ -0,0 +1,53 @@ +#ifndef _I386_KDEBUG_H +#define _I386_KDEBUG_H 1 + +#include + +struct pt_regs; + +struct die_args { + struct pt_regs *regs; + const char *str; + long err; +}; + + +/* Grossly misnamed. */ +enum die_val { + DIE_OOPS = 1, + DIE_INT3, + DIE_DEBUG, + DIE_PANIC, + DIE_NMI, + DIE_DIE, + DIE_CPUINIT, /* not really a die, but .. */ + DIE_TRAPINIT, + DIE_STOP, + DIE_PROTFAULT, + DIE_PAGEFAULT, + DIE_FPUTRAP, + DIE_WATCHDOG, +}; + +extern struct notifier_block *die_chain; + +static inline int register_die_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&die_chain, nb); + +} + +static inline int unregister_die_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&die_chain, nb); +} + +static inline int notify_die(enum die_val val, const char *str, + struct pt_regs *regs,long err) +{ + struct die_args args = { regs: regs, str: str, err: err }; + + return notifier_call_chain(&die_chain, val, &args); +} + +#endif diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/kmap_types.h 900-mjb3/include/asm-i386/kmap_types.h --- 000-bk3/include/asm-i386/kmap_types.h Thu Feb 13 11:08:13 2003 +++ 900-mjb3/include/asm-i386/kmap_types.h Sat Mar 8 22:05:11 2003 @@ -24,7 +24,8 @@ D(10) KM_IRQ0, D(11) KM_IRQ1, D(12) KM_SOFTIRQ0, D(13) KM_SOFTIRQ1, -D(14) KM_TYPE_NR +D(14) KM_TYPE_NR, +D(15) KM_DUMP }; #undef D diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/kprobes.h 900-mjb3/include/asm-i386/kprobes.h --- 000-bk3/include/asm-i386/kprobes.h Wed Dec 31 16:00:00 1969 +++ 900-mjb3/include/asm-i386/kprobes.h Sat Mar 8 22:04:56 2003 @@ -0,0 +1,34 @@ +#ifndef _ASM_KPROBES_H +#define _ASM_KPROBES_H +/* + * Dynamic Probes (kprobes) support + * Vamsi Krishna S , July, 2002 + * Mailing list: dprobes@www-124.ibm.com + */ +#include +#include + +struct pt_regs; + +typedef u8 kprobe_opcode_t; +#define BREAKPOINT_INSTRUCTION 0xcc + +/* trap3/1 are intr gates for kprobes. So, restore the status of IF, + * if necessary, before executing the original int3/1 (trap) handler. 
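The die_chain in the kdebug.h hunk above is the standard notifier pattern: interested code registers a callback, and notify_die() walks the chain handing each callback the event code plus a die_args block. A freestanding model of that flow, with plain next pointers instead of notifier_chain_register():

#include <stdio.h>

struct die_args { const char *str; long err; };

struct notifier_block {
	int (*notifier_call)(struct notifier_block *, unsigned long, void *);
	struct notifier_block *next;
};

static struct notifier_block *die_chain;

static int register_die_notifier(struct notifier_block *nb)
{
	nb->next = die_chain;
	die_chain = nb;
	return 0;
}

static int notify_die(unsigned long val, const char *str, long err)
{
	struct die_args args = { str, err };
	struct notifier_block *nb;
	int ret = 0;

	for (nb = die_chain; nb; nb = nb->next)
		ret = nb->notifier_call(nb, val, &args);
	return ret;
}

static int my_handler(struct notifier_block *nb, unsigned long val, void *data)
{
	struct die_args *args = data;

	(void)nb;
	printf("die event %lu: %s (err %ld)\n", val, args->str, args->err);
	return 0;
}

static struct notifier_block my_nb = { my_handler, NULL };

int main(void)
{
	register_die_notifier(&my_nb);
	return notify_die(1 /* DIE_OOPS */, "oops", 0);
}

This is how a debugger stub or crash-dump module hooks traps without the trap handlers knowing anything about it.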
+ */ +static inline void restore_interrupts(struct pt_regs *regs) +{ + if (regs->eflags & IF_MASK) + __asm__ __volatile__ ("sti"); +} + +#ifdef CONFIG_KPROBES +extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); +extern int post_kprobe_handler(struct pt_regs *regs); +extern int kprobe_handler(struct pt_regs *regs); +#else /* !CONFIG_KPROBES */ +static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { return 0; } +static inline int post_kprobe_handler(struct pt_regs *regs) { return 0; } +static inline int kprobe_handler(struct pt_regs *regs) { return 0; } +#endif +#endif /* _ASM_KPROBES_H */ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/linkage.h 900-mjb3/include/asm-i386/linkage.h --- 000-bk3/include/asm-i386/linkage.h Sun Nov 17 20:29:46 2002 +++ 900-mjb3/include/asm-i386/linkage.h Sat Mar 8 22:05:03 2003 @@ -3,6 +3,7 @@ #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #define FASTCALL(x) x __attribute__((regparm(3))) +#define IRQHANDLER(x) x __attribute__((regparm(1))) #ifdef CONFIG_X86_ALIGNMENT_16 #define __ALIGN .align 16,0x90 diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/mach-bigsmp/mach_apic.h 900-mjb3/include/asm-i386/mach-bigsmp/mach_apic.h --- 000-bk3/include/asm-i386/mach-bigsmp/mach_apic.h Sat Mar 8 10:16:06 2003 +++ 900-mjb3/include/asm-i386/mach-bigsmp/mach_apic.h Wed Mar 12 19:47:39 2003 @@ -19,7 +19,7 @@ static inline int apic_id_registered(voi } #define APIC_DFR_VALUE (APIC_DFR_CLUSTER) -#define TARGET_CPUS ((cpu_callout_map < 0xf)?cpu_callout_map:0xf) +#define TARGET_CPUS ((cpu_online_map < 0xf)?cpu_online_map:0xf) #define INT_DELIVERY_MODE dest_LowestPrio #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/mach-default/irq_vectors.h 900-mjb3/include/asm-i386/mach-default/irq_vectors.h --- 000-bk3/include/asm-i386/mach-default/irq_vectors.h Mon Dec 23 23:01:56 2002 +++ 900-mjb3/include/asm-i386/mach-default/irq_vectors.h Sat Mar 8 22:05:11 2003 @@ -48,6 +48,7 @@ #define INVALIDATE_TLB_VECTOR 0xfd #define RESCHEDULE_VECTOR 0xfc #define CALL_FUNCTION_VECTOR 0xfb +#define DUMP_VECTOR 0xfa #define THERMAL_APIC_VECTOR 0xf0 /* diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/mach-default/mach_apic.h 900-mjb3/include/asm-i386/mach-default/mach_apic.h --- 000-bk3/include/asm-i386/mach-default/mach_apic.h Sat Mar 8 10:16:06 2003 +++ 900-mjb3/include/asm-i386/mach-default/mach_apic.h Wed Mar 12 19:47:39 2003 @@ -4,7 +4,7 @@ #define APIC_DFR_VALUE (APIC_DFR_FLAT) #ifdef CONFIG_SMP - #define TARGET_CPUS (cpu_callout_map) + #define TARGET_CPUS (cpu_online_map) #else #define TARGET_CPUS 0x01 #endif diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/mach-summit/mach_apic.h 900-mjb3/include/asm-i386/mach-summit/mach_apic.h --- 000-bk3/include/asm-i386/mach-summit/mach_apic.h Sat Mar 8 10:16:06 2003 +++ 900-mjb3/include/asm-i386/mach-summit/mach_apic.h Wed Mar 12 19:47:39 2003 @@ -13,7 +13,7 @@ extern int x86_summit; ((phys_apic) & XAPIC_DEST_CLUSTER_MASK) ) #define APIC_DFR_VALUE (x86_summit ? APIC_DFR_CLUSTER : APIC_DFR_FLAT) -#define TARGET_CPUS (x86_summit ? XAPIC_DEST_CPUS_MASK : cpu_callout_map) +#define TARGET_CPUS (x86_summit ? XAPIC_DEST_CPUS_MASK : cpu_online_map) #define INT_DELIVERY_MODE (x86_summit ? 
dest_Fixed : dest_LowestPrio) #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/mmzone.h 900-mjb3/include/asm-i386/mmzone.h --- 000-bk3/include/asm-i386/mmzone.h Wed Mar 5 07:37:06 2003 +++ 900-mjb3/include/asm-i386/mmzone.h Sat Mar 8 22:03:39 2003 @@ -10,14 +10,6 @@ #ifdef CONFIG_DISCONTIGMEM -#ifdef CONFIG_X86_NUMAQ -#include -#elif CONFIG_X86_SUMMIT -#include -#else -#define pfn_to_nid(pfn) (0) -#endif /* CONFIG_X86_NUMAQ */ - extern struct pglist_data *node_data[]; /* @@ -101,5 +93,41 @@ extern struct pglist_data *node_data[]; * ( pfn_to_pgdat(pfn) && ((pfn) < node_end_pfn(pfn_to_nid(pfn))) ) */ #define pfn_valid(pfn) ((pfn) < num_physpages) + +/* + * generic node memory support, the following assumptions apply: + * + * 1) memory comes in 256Mb contigious chunks which are either present or not + * 2) we will not have more than 64Gb in total + * + * for now assume that 64Gb is max amount of RAM for whole system + * 64Gb / 4096bytes/page = 16777216 pages + */ +#define MAX_NR_PAGES 16777216 +#define MAX_ELEMENTS 256 +#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS) + +extern u8 physnode_map[]; + +static inline int pfn_to_nid(unsigned long pfn) +{ + return(physnode_map[(pfn) / PAGES_PER_ELEMENT]); +} +static inline struct pglist_data *pfn_to_pgdat(unsigned long pfn) +{ + return(NODE_DATA(pfn_to_nid(pfn))); +} + +#ifdef CONFIG_X86_NUMAQ +#include +#elif CONFIG_X86_SUMMIT +#include +#elif CONFIG_X86_PC +#define get_memcfg_numa get_memcfg_numa_flat +#define get_zholes_size(n) (0) +#else +#define pfn_to_nid(pfn) (0) +#endif /* CONFIG_X86_NUMAQ */ + #endif /* CONFIG_DISCONTIGMEM */ #endif /* _ASM_MMZONE_H_ */ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/numaq.h 900-mjb3/include/asm-i386/numaq.h --- 000-bk3/include/asm-i386/numaq.h Wed Mar 5 07:37:06 2003 +++ 900-mjb3/include/asm-i386/numaq.h Sat Mar 8 22:03:40 2003 @@ -28,18 +28,6 @@ #ifdef CONFIG_X86_NUMAQ -/* - * for now assume that 64Gb is max amount of RAM for whole system - * 64Gb / 4096bytes/page = 16777216 pages - */ -#define MAX_NR_PAGES 16777216 -#define MAX_ELEMENTS 256 -#define PAGES_PER_ELEMENT (16777216/256) - -extern int physnode_map[]; -#define pfn_to_nid(pfn) ({ physnode_map[(pfn) / PAGES_PER_ELEMENT]; }) -#define pfn_to_pgdat(pfn) NODE_DATA(pfn_to_nid(pfn)) -#define PHYSADDR_TO_NID(pa) pfn_to_nid(pa >> PAGE_SHIFT) #define MAX_NUMNODES 8 extern void get_memcfg_numaq(void); #define get_memcfg_numa() get_memcfg_numaq() @@ -169,7 +157,7 @@ struct sys_cfg_data { struct eachquadmem eq[MAX_NUMNODES]; /* indexed by quad id */ }; -static inline unsigned long get_zholes_size(int nid) +static inline unsigned long *get_zholes_size(int nid) { return 0; } diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/page.h 900-mjb3/include/asm-i386/page.h --- 000-bk3/include/asm-i386/page.h Wed Mar 5 07:37:06 2003 +++ 900-mjb3/include/asm-i386/page.h Sat Mar 8 22:05:02 2003 @@ -3,7 +3,11 @@ /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 -#define PAGE_SIZE (1UL << PAGE_SHIFT) +#ifndef __ASSEMBLY__ +#define PAGE_SIZE (1UL << PAGE_SHIFT) +#else +#define PAGE_SIZE (1 << PAGE_SHIFT) +#endif #define PAGE_MASK (~(PAGE_SIZE-1)) #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) @@ -90,7 +94,16 @@ typedef struct { unsigned long pgprot; } * and CONFIG_HIGHMEM64G options in the kernel configuration. 
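The generic node lookup added to mmzone.h above leans on the two assumptions spelled out in its comment: memory comes in 256 MB contiguous chunks and tops out at 64 GB, so 256 one-byte table entries cover every possible pfn. A runnable model of the lookup; the node layout in main() is invented:

#include <stdio.h>

#define MAX_NR_PAGES		16777216	/* 64 GB of 4 KB pages */
#define MAX_ELEMENTS		256
#define PAGES_PER_ELEMENT	(MAX_NR_PAGES / MAX_ELEMENTS)	/* 65536 */

static unsigned char physnode_map[MAX_ELEMENTS];	/* element -> node */

static int pfn_to_nid(unsigned long pfn)
{
	return physnode_map[pfn / PAGES_PER_ELEMENT];
}

int main(void)
{
	int i;

	/* invented layout: the second gigabyte (elements 4-7) is node 1 */
	for (i = 4; i < 8; i++)
		physnode_map[i] = 1;

	printf("pfn 0x10000 -> node %d\n", pfn_to_nid(0x10000UL));
	printf("pfn 0x50000 -> node %d\n", pfn_to_nid(0x50000UL));
	return 0;
}

A flat array indexed by a shifted pfn keeps the hot pfn_to_nid() path to one load, which is why the NUMA-Q and Summit variants above could be replaced by a single generic version.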
*/ -#define __PAGE_OFFSET (0xC0000000) +#include +#ifdef CONFIG_05GB +#define __PAGE_OFFSET (0xE0000000) +#elif defined(CONFIG_1GB) +#define __PAGE_OFFSET (0xC0000000) +#elif defined(CONFIG_2GB) +#define __PAGE_OFFSET (0x80000000) +#elif defined(CONFIG_3GB) +#define __PAGE_OFFSET (0x40000000) +#endif /* * This much address space is reserved for vmalloc() and iomap() diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/param.h 900-mjb3/include/asm-i386/param.h --- 000-bk3/include/asm-i386/param.h Sun Nov 17 20:29:26 2002 +++ 900-mjb3/include/asm-i386/param.h Sat Mar 8 22:03:44 2003 @@ -2,10 +2,18 @@ #define _ASMi386_PARAM_H #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ -# define USER_HZ 100 /* .. some user interfaces are in "ticks" */ -# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ +#include + +#ifdef CONFIG_1000HZ +# define HZ 1000 /* Internal kernel timer frequency */ +#else +# define HZ 100 #endif + +#define USER_HZ 100 /* .. some user interfaces are in "ticks" */ +#define CLOCKS_PER_SEC (USER_HZ) /* like times() */ + +#endif /* __KERNEL__ */ #ifndef HZ #define HZ 100 diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/processor.h 900-mjb3/include/asm-i386/processor.h --- 000-bk3/include/asm-i386/processor.h Tue Feb 25 23:03:50 2003 +++ 900-mjb3/include/asm-i386/processor.h Sat Mar 8 22:04:47 2003 @@ -280,7 +280,11 @@ extern unsigned int mca_pentium_flag; /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ +#ifdef CONFIG_05GB +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 16)) +#else #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) +#endif /* * Size of io_bitmap in longwords: 32 is ports 0-0x3ff. @@ -394,6 +398,9 @@ struct thread_struct { unsigned int saved_fs, saved_gs; /* IO permissions */ unsigned long *ts_io_bitmap; +#ifdef CONFIG_X86_REMOTE_DEBUG + struct pt_regs *kgdbregs; +#endif }; #define INIT_THREAD { \ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/rwlock.h 900-mjb3/include/asm-i386/rwlock.h --- 000-bk3/include/asm-i386/rwlock.h Sun Nov 17 20:29:57 2002 +++ 900-mjb3/include/asm-i386/rwlock.h Wed Mar 12 19:47:37 2003 @@ -20,28 +20,52 @@ #define RW_LOCK_BIAS 0x01000000 #define RW_LOCK_BIAS_STR "0x01000000" -#define __build_read_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $1,(%0)\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_read_lock_const(rw, helper) \ - asm volatile(LOCK "subl $1,%0\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "jns 1f\n\t" \ + "call " helper "\n\t" \ + "1:\t" \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "jns 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\t" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + 
#define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ + #define __build_read_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ @@ -50,28 +74,51 @@ __build_read_lock_ptr(rw, helper); \ } while (0) -#define __build_write_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_write_lock_const(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jz 1f\n\t" \ + "call " helper "\n\t" \ + "1:\n" \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jz 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\n" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ #define __build_write_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/smp.h 900-mjb3/include/asm-i386/smp.h --- 000-bk3/include/asm-i386/smp.h Fri Jan 17 09:18:31 2003 +++ 900-mjb3/include/asm-i386/smp.h Sat Mar 8 22:05:11 2003 @@ -39,6 +39,7 @@ extern int smp_num_siblings; extern int cpu_sibling_map[]; extern void smp_flush_tlb(void); +extern void dump_send_ipi(void); extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); extern void smp_send_reschedule(int cpu); extern void smp_send_reschedule_all(void); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/spinlock.h 900-mjb3/include/asm-i386/spinlock.h --- 000-bk3/include/asm-i386/spinlock.h Mon Dec 23 23:01:56 2002 +++ 900-mjb3/include/asm-i386/spinlock.h Wed Mar 12 19:47:37 2003 @@ -42,18 +42,35 @@ typedef struct { #define spin_is_locked(x) (*(volatile signed char *)(&(x)->lock) <= 0) #define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x)) -#define spin_lock_string \ - "\n1:\t" \ - "lock ; decb %0\n\t" \ - "js 2f\n" \ - LOCK_SECTION_START("") \ - "2:\t" \ - "rep;nop\n\t" \ - "cmpb $0,%0\n\t" \ - "jle 2b\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END +#ifdef CONFIG_SPINLINE + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + 
"js 2f\n" \ + "jmp 3f\n" \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + "3:\t" + +#else /* !CONFIG_SPINLINE */ + + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + LOCK_SECTION_START("") \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END + +#endif /* CONFIG_SPINLINE */ /* * This works. Despite all the confusion. * (except on PPro SMP or if we are using OOSTORE) diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/srat.h 900-mjb3/include/asm-i386/srat.h --- 000-bk3/include/asm-i386/srat.h Wed Mar 5 07:37:06 2003 +++ 900-mjb3/include/asm-i386/srat.h Sat Mar 8 22:03:39 2003 @@ -27,17 +27,6 @@ #ifndef _ASM_SRAT_H_ #define _ASM_SRAT_H_ -/* - * each element in pfnnode_map represents 256 MB (2^28) of pages. - * so, to represent 64GB we need 256 elements. - */ -#define MAX_ELEMENTS 256 -#define PFN_TO_ELEMENT(pfn) ((pfn)>>(28 - PAGE_SHIFT)) - -extern int pfnnode_map[]; -#define pfn_to_nid(pfn) ({ pfnnode_map[PFN_TO_ELEMENT(pfn)]; }) -#define pfn_to_pgdat(pfn) NODE_DATA(pfn_to_nid(pfn)) -#define PHYSADDR_TO_NID(pa) pfn_to_nid(pa >> PAGE_SHIFT) #define MAX_NUMNODES 8 extern void get_memcfg_from_srat(void); extern unsigned long *get_zholes_size(int); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-i386/thread_info.h 900-mjb3/include/asm-i386/thread_info.h --- 000-bk3/include/asm-i386/thread_info.h Thu Jan 9 19:16:11 2003 +++ 900-mjb3/include/asm-i386/thread_info.h Sat Mar 8 22:05:09 2003 @@ -9,6 +9,7 @@ #ifdef __KERNEL__ +#include #ifndef __ASSEMBLY__ #include #endif @@ -29,9 +30,11 @@ struct thread_info { __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: + 0 for interrupts: illegal 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ + struct thread_info *irq_stack; /* pointer to cpu irq stack */ struct restart_block restart_block; __u8 supervisor_stack[0]; @@ -46,7 +49,8 @@ struct thread_info { #define TI_CPU 0x0000000C #define TI_PRE_COUNT 0x00000010 #define TI_ADDR_LIMIT 0x00000014 -#define TI_RESTART_BLOCK 0x0000018 +#define TI_IRQ_STACK 0x00000018 +#define TI_RESTART_BLOCK 0x0000022 #endif @@ -57,48 +61,66 @@ struct thread_info { * * preempt_count needs to be 1 initially, until the scheduler is functional. 
*/ +#ifdef CONFIG_4K_STACK +#define THREAD_ORDER 0 +#define STACK_WARN 0x200 +#define STACK_PANIC 0x100 +#else +#define THREAD_ORDER 1 +#define STACK_WARN ((THREAD_SIZE)>>1) +#define STACK_PANIC 0x100 +#endif +#define INIT_THREAD_SIZE THREAD_SIZE + #ifndef __ASSEMBLY__ -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .irq_stack = &init_irq_union.thread_info, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + } \ } #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) +/* thread information allocation */ +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define alloc_thread_info() kmalloc(THREAD_SIZE, GFP_KERNEL) +#define free_thread_info(ti) kfree(ti) +#define get_thread_info(ti) get_task_struct((ti)->task) +#define put_thread_info(ti) put_task_struct((ti)->task) + /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~8191UL)); + __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1))); return ti; } -/* thread information allocation */ -#define THREAD_SIZE (2*PAGE_SIZE) -#define alloc_thread_info() ((struct thread_info *) __get_free_pages(GFP_KERNEL,1)) -#define free_thread_info(ti) free_pages((unsigned long) (ti), 1) -#define get_thread_info(ti) get_task_struct((ti)->task) -#define put_thread_info(ti) put_task_struct((ti)->task) - #else /* !__ASSEMBLY__ */ +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) + /* how to get the thread information struct from ASM */ #define GET_THREAD_INFO(reg) \ - movl $-8192, reg; \ + movl $-THREAD_SIZE, reg; \ andl %esp, reg -#endif +/* use this one if reg already contains %esp */ +#define GET_THREAD_INFO_WITH_ESP(reg) \ + andl $-THREAD_SIZE, reg +#endif + /* * thread information flags * - these are process state flags that various assembly files may need to access diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/asm-x86_64/early_printk.h 900-mjb3/include/asm-x86_64/early_printk.h --- 000-bk3/include/asm-x86_64/early_printk.h Wed Dec 31 16:00:00 1969 +++ 900-mjb3/include/asm-x86_64/early_printk.h Sat Mar 8 22:03:43 2003 @@ -0,0 +1,8 @@ +#ifdef __EARLY_PRINTK_H_X86_64_ +#define __EARLY_PRINTK_H_X86_64_ + +#define VGABASE 0xffffffff800b8000UL +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/linux/dump.h 900-mjb3/include/linux/dump.h --- 000-bk3/include/linux/dump.h Wed Dec 31 16:00:00 1969 +++ 900-mjb3/include/linux/dump.h Sat Mar 8 22:05:11 2003 @@ -0,0 +1,362 @@ +/* + * Kernel header file for Linux crash dumps. + * + * Created by: Matt Robinson (yakker@sgi.com) + * Copyright 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * + * vmdump.h to dump.h by: Matt D. Robinson (yakker@sourceforge.net) + * Copyright 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 Free Software Foundation, Inc. All rights reserved. 
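The thread_info.h rework above replaces the hard-wired ~8191UL mask with ~(THREAD_SIZE - 1), so current_thread_info() keeps working when CONFIG_4K_STACK drops THREAD_ORDER to 0. The trick only needs the stack to be THREAD_SIZE-aligned; masking any address inside it then lands on its base. A userspace illustration, with aligned_alloc standing in for the kernel's stack allocation:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define PAGE_SIZE	4096UL
#define THREAD_ORDER	1			/* 0 under CONFIG_4K_STACK */
#define THREAD_SIZE	(PAGE_SIZE << THREAD_ORDER)

struct thread_info { int cpu; };		/* lives at the stack's base */

int main(void)
{
	/* kernel stacks are THREAD_SIZE-aligned; model that here */
	void *stack = aligned_alloc(THREAD_SIZE, THREAD_SIZE);
	uintptr_t sp;
	struct thread_info *ti;

	if (!stack)
		return 1;

	/* a "stack pointer" somewhere in the middle of the stack */
	sp = (uintptr_t)stack + THREAD_SIZE - 100;

	/* the current_thread_info() masking step */
	ti = (void *)(sp & ~(THREAD_SIZE - 1));

	printf("stack base %p, recovered thread_info %p\n", stack, (void *)ti);
	free(stack);
	return (void *)ti != stack;
}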
+ * + * Most of this is the same old stuff from vmdump.h, except now we're + * actually a stand-alone driver plugged into the block layer interface, + * with the exception that we now allow for compression modes externally + * loaded (e.g., someone can come up with their own). + * + * This code is released under version 2 of the GNU GPL. + */ + +/* This header file includes all structure definitions for crash dumps. */ +#ifndef _DUMP_H +#define _DUMP_H + +#if defined(CONFIG_CRASH_DUMP) || defined (CONFIG_CRASH_DUMP_MODULE) + +#include +#include +#include + +/* + * Predefine default DUMP_PAGE constants, asm header may override. + * + * On ia64 discontinuous memory systems it's possible for the memory + * banks to stop at 2**12 page alignments, the smallest possible page + * size. But the system page size, PAGE_SIZE, is in fact larger. + */ +#define DUMP_PAGE_SHIFT PAGE_SHIFT +#define DUMP_PAGE_MASK PAGE_MASK +#define DUMP_PAGE_ALIGN(addr) PAGE_ALIGN(addr) +#define DUMP_HEADER_OFFSET PAGE_SIZE + +/* keep DUMP_PAGE_SIZE constant to 4K = 1<<12 + * it may be different from PAGE_SIZE then. + */ +#define DUMP_PAGE_SIZE 4096 + +/* + * Predefined default memcpy() to use when copying memory to the dump buffer. + * + * On ia64 there is a heads up function that can be called to let the prom + * machine check monitor know that the current activity is risky and it should + * ignore the fault (nofault). In this case the ia64 header will redefine this + * macro to __dump_memcpy() and use it's arch specific version. + */ +#define DUMP_memcpy memcpy + +/* necessary header files */ +#include /* for architecture-specific header */ + +/* + * Size of the buffer that's used to hold: + * + * 1. the dump header (padded to fill the complete buffer) + * 2. the possibly compressed page headers and data + */ +#define DUMP_BUFFER_SIZE (64 * 1024) /* size of dump buffer */ +#define DUMP_HEADER_SIZE DUMP_BUFFER_SIZE + +/* standard header definitions */ +#define DUMP_MAGIC_NUMBER 0xa8190173618f23edULL /* dump magic number */ +#define DUMP_MAGIC_LIVE 0xa8190173618f23cdULL /* live magic number */ +#define DUMP_VERSION_NUMBER 0x8 /* dump version number */ +#define DUMP_PANIC_LEN 0x100 /* dump panic string length */ + +/* dump levels - type specific stuff added later -- add as necessary */ +#define DUMP_LEVEL_NONE 0x0 /* no dumping at all -- just bail */ +#define DUMP_LEVEL_HEADER 0x1 /* kernel dump header only */ +#define DUMP_LEVEL_KERN 0x2 /* dump header and kernel pages */ +#define DUMP_LEVEL_USED 0x4 /* dump header, kernel/user pages */ +#define DUMP_LEVEL_ALL_RAM 0x8 /* dump header, all RAM pages */ +#define DUMP_LEVEL_ALL 0x10 /* dump all memory RAM and firmware */ + + +/* dump compression options -- add as necessary */ +#define DUMP_COMPRESS_NONE 0x0 /* don't compress this dump */ +#define DUMP_COMPRESS_RLE 0x1 /* use RLE compression */ +#define DUMP_COMPRESS_GZIP 0x2 /* use GZIP compression */ + +/* dump flags - any dump-type specific flags -- add as necessary */ +#define DUMP_FLAGS_NONE 0x0 /* no flags are set for this dump */ + +#define DUMP_FLAGS_TARGETMASK 0xf0000000 /* handle special case targets */ +#define DUMP_FLAGS_DISKDUMP 0x80000000 /* dump to local disk */ +#define DUMP_FLAGS_NETDUMP 0x40000000 /* dump over the network */ + +/* dump header flags -- add as necessary */ +#define DUMP_DH_FLAGS_NONE 0x0 /* no flags set (error condition!) 
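For the DUMP_COMPRESS_RLE hook above, a compressor module supplies a function shaped like the compress_func member of struct __dump_compress declared further down in this header: source page in, bounded destination out, compressed length back, with a length no smaller than the input signalling that the page should be stored raw. The run-length scheme below is invented for illustration and is not LKCD's actual RLE format:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

typedef uint8_t u8;
typedef uint16_t u16;

/* same shape as: u16 (*compress_func)(const u8 *, u16, u8 *, u16) */
static u16 rle_compress(const u8 *in, u16 inlen, u8 *out, u16 outlen)
{
	u16 i = 0, o = 0;

	while (i < inlen) {
		u8 run = 1;

		while (i + run < inlen && run < 255 && in[i + run] == in[i])
			run++;
		if ((u16)(o + 2) > outlen)
			return inlen;	/* would not fit: caller stores raw */
		out[o++] = run;		/* (count, byte) pairs */
		out[o++] = in[i];
		i += run;
	}
	return o;
}

int main(void)
{
	u8 page[64], buf[128];

	memset(page, 0, sizeof(page));	/* a zero page compresses well */
	printf("64 -> %u bytes\n",
	       (unsigned)rle_compress(page, sizeof(page), buf, sizeof(buf)));
	return 0;
}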
*/ +#define DUMP_DH_RAW 0x1 /* raw page (no compression) */ +#define DUMP_DH_COMPRESSED 0x2 /* page is compressed */ +#define DUMP_DH_END 0x4 /* end marker on a full dump */ +#define DUMP_DH_TRUNCATED 0x8 /* dump is incomplete */ +#define DUMP_DH_TEST_PATTERN 0x10 /* dump page is a test pattern */ +#define DUMP_DH_NOT_USED 0x20 /* first unused bit in flags */ + +/* names for various dump parameters in /proc/kernel */ +#define DUMP_ROOT_NAME "sys/dump" +#define DUMP_DEVICE_NAME "device" +#define DUMP_COMPRESS_NAME "compress" +#define DUMP_LEVEL_NAME "level" +#define DUMP_FLAGS_NAME "flags" + +#define DUMP_SYSRQ_KEY 'd' /* key to use for MAGIC_SYSRQ key */ + +/* CTL_DUMP names: */ +enum +{ + CTL_DUMP_DEVICE=1, + CTL_DUMP_COMPRESS=2, + CTL_DUMP_LEVEL=3, + CTL_DUMP_FLAGS=4, + CTL_DUMP_TEST=5, +}; + + +/* page size for gzip compression -- buffered slightly beyond hardware PAGE_SIZE used by DUMP */ +#define DUMP_DPC_PAGE_SIZE (DUMP_PAGE_SIZE + 512) + +/* dump ioctl() control options */ +#define DIOSDUMPDEV 1 /* set the dump device */ +#define DIOGDUMPDEV 2 /* get the dump device */ +#define DIOSDUMPLEVEL 3 /* set the dump level */ +#define DIOGDUMPLEVEL 4 /* get the dump level */ +#define DIOSDUMPFLAGS 5 /* set the dump flag parameters */ +#define DIOGDUMPFLAGS 6 /* get the dump flag parameters */ +#define DIOSDUMPCOMPRESS 7 /* set the dump compress level */ +#define DIOGDUMPCOMPRESS 8 /* get the dump compress level */ + +/* these ioctls are used only by netdump module */ +#define DIOSTARGETIP 9 /* set the target m/c's ip */ +#define DIOGTARGETIP 10 /* get the target m/c's ip */ +#define DIOSTARGETPORT 11 /* set the target m/c's port */ +#define DIOGTARGETPORT 12 /* get the target m/c's port */ +#define DIOSSOURCEPORT 13 /* set the source m/c's port */ +#define DIOGSOURCEPORT 14 /* get the source m/c's port */ +#define DIOSETHADDR 15 /* set ethernet address */ +#define DIOGETHADDR 16 /* get ethernet address */ + +/* + * Structure: __dump_header + * Function: This is the header dumped at the top of every valid crash + * dump. + */ +struct __dump_header { + /* the dump magic number -- unique to verify dump is valid */ + u64 dh_magic_number; + + /* the version number of this dump */ + u32 dh_version; + + /* the size of this header (in case we can't read it) */ + u32 dh_header_size; + + /* the level of this dump (just a header?) */ + u32 dh_dump_level; + + /* + * We assume dump_page_size to be 4K in every case. + * Store here the configurable system page size (4K, 8K, 16K, etc.) + */ + u32 dh_page_size; + + /* the size of all physical memory */ + u64 dh_memory_size; + + /* the start of physical memory */ + u64 dh_memory_start; + + /* the end of physical memory */ + u64 dh_memory_end; + + /* the number of hardware/physical pages in this dump specifically */ + u32 dh_num_dump_pages; + + /* the panic string, if available */ + char dh_panic_string[DUMP_PANIC_LEN]; + + /* timeval depends on architecture, two long values */ + struct { + u64 tv_sec; + u64 tv_usec; + } dh_time; /* the time of the system crash */ + + /* the NEW utsname (uname) information -- in character form */ + /* we do this so we don't have to include utsname.h */ + /* plus it helps us be more architecture independent */ + /* now maybe one day soon they'll make the [65] a #define!
*/ + char dh_utsname_sysname[65]; + char dh_utsname_nodename[65]; + char dh_utsname_release[65]; + char dh_utsname_version[65]; + char dh_utsname_machine[65]; + char dh_utsname_domainname[65]; + + /* the address of current task (OLD = void *, NEW = u64) */ + u64 dh_current_task; + + /* what type of compression we're using in this dump (if any) */ + u32 dh_dump_compress; + + /* any additional flags */ + u32 dh_dump_flags; + + /* the dump device */ + u32 dh_dump_device; +} __attribute__((packed)); + +/* + * Structure: __dump_page + * Function: To act as the header associated to each physical page of + * memory saved in the system crash dump. This allows for + * easy reassembly of each crash dump page. The address bits + * are split to make things easier for 64-bit/32-bit system + * conversions. + */ +struct __dump_page { + /* the address of this dump page */ + u64 dp_address; + + /* the size of this dump page */ + u32 dp_size; + + /* flags (currently DUMP_COMPRESSED, DUMP_RAW or DUMP_END) */ + u32 dp_flags; +} __attribute__((packed)); + +/* + * Structure: __lkcdinfo + * Function: This structure contains information needed for the lkcdutils + * package (particularly lcrash) to determine what information is + * associated to this kernel, specifically. + */ +struct __lkcdinfo { + int arch; + int ptrsz; + int byte_order; + int linux_release; + int page_shift; + int page_size; + u64 page_mask; + u64 page_offset; + int stack_offset; +}; + +#ifdef __KERNEL__ + +/* + * Structure: __dump_compress + * Function: This is what an individual compression mechanism can use + * to plug in their own compression techniques. It's always + * best to build these as individual modules so that people + * can put in whatever they want. + */ +struct __dump_compress { + /* the list_head structure for list storage */ + struct list_head list; + + /* the type of compression to use (DUMP_COMPRESS_XXX) */ + int compress_type; + const char *compress_name; + + /* the compression function to call */ + u16 (*compress_func)(const u8 *, u16, u8 *, u16); +}; + +/* functions for dump compression registration */ +extern void dump_register_compression(struct __dump_compress *); +extern void dump_unregister_compression(int); + +/* + * Structure dump_mbank[]: + * + * For CONFIG_DISCONTIGMEM systems this array specifies the + * memory banks/chunks that need to be dumped after a panic. + * + * For classic systems it specifies a single set of pages from + * 0 to max_mapnr. + */ +struct __dump_mbank { + u64 start; + u64 end; + int type; + int pad1; + long pad2; +}; + +#define DUMP_MBANK_TYPE_CONVENTIONAL_MEMORY 1 +#define DUMP_MBANK_TYPE_OTHER 2 + +#define MAXCHUNKS 256 +extern int dump_mbanks; +extern struct __dump_mbank dump_mbank[MAXCHUNKS]; + +/* notification event codes */ +#define DUMP_BEGIN 0x0001 /* dump beginning */ +#define DUMP_END 0x0002 /* dump ending */ + +/* Scheduler soft spin control. + * + * 0 - no dump in progress + * 1 - cpu0 is dumping, ... + */ +extern int dump_oncpu; +extern void dump_execute(const char *, const struct pt_regs *); + +/* + * Notifier list for kernel code which wants to be called + * at kernel dump.
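 * 
 * A client might hook these events like this (an illustrative sketch by
 * the editor, not part of the original interface; my_dump_event, my_nb
 * and quiesce_my_hardware() are hypothetical names):
 * 
 *	static int my_dump_event(struct notifier_block *nb,
 *				 unsigned long event, void *buf)
 *	{
 *		if (event == DUMP_BEGIN)
 *			quiesce_my_hardware();
 *		return 0;
 *	}
 *	static struct notifier_block my_nb = { .notifier_call = my_dump_event };
 * 
 * followed by register_dump_notifier(&my_nb).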
+ */ +extern struct notifier_block *dump_notifier_list; +static inline int register_dump_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&dump_notifier_list, nb); +} +static inline int unregister_dump_notifier(struct notifier_block * nb) +{ + return notifier_chain_unregister(&dump_notifier_list, nb); +} + +/* + * Common Arch Specific Functions should be declared here. + * This allows the C compiler to detect discrepancies. + */ +extern void __dump_open(void); +extern void __dump_cleanup(void); +extern void __dump_init(u64); +extern void __dump_save_regs(struct pt_regs *, const struct pt_regs *); +extern int __dump_configure_header(const struct pt_regs *); +extern void __dump_irq_enable(void); +extern void __dump_irq_restore(void); +extern int __dump_page_valid(unsigned long index); +#ifdef CONFIG_SMP +extern void __dump_save_other_cpus(void); +#else +#define __dump_save_other_cpus() do { } while (0) +#endif + +#endif /* __KERNEL__ */ + +#else /* !CONFIG_CRASH_DUMP */ + +/* If not configured then make code disappear! */ +#define register_dump_watchdog(x) do { } while(0) +#define unregister_dump_watchdog(x) do { } while(0) +#define register_dump_notifier(x) (0) +#define unregister_dump_notifier(x) (0) +#define dump_in_progress() 0 + +#endif /* !CONFIG_CRASH_DUMP */ + +#endif /* _DUMP_H */ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/linux/dump_netdev.h 900-mjb3/include/linux/dump_netdev.h --- 000-bk3/include/linux/dump_netdev.h Wed Dec 31 16:00:00 1969 +++ 900-mjb3/include/linux/dump_netdev.h Sat Mar 8 22:05:11 2003 @@ -0,0 +1,80 @@ +/* + * linux/drivers/net/netconsole.h + * + * Copyright (C) 2001 Ingo Molnar + * + * This file contains the implementation of an IRQ-safe, crash-safe + * kernel console implementation that outputs kernel messages to the + * network. + * + * Modification history: + * + * 2001-09-17 started by Ingo Molnar. + */ + +/**************************************************************** + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * + ****************************************************************/ + +#define NETCONSOLE_VERSION 0x03 + +enum netdump_commands { + COMM_NONE = 0, + COMM_SEND_MEM = 1, + COMM_EXIT = 2, + COMM_REBOOT = 3, + COMM_HELLO = 4, + COMM_GET_NR_PAGES = 5, + COMM_GET_PAGE_SIZE = 6, + COMM_START_NETDUMP_ACK = 7, + COMM_GET_REGS = 8, + COMM_GET_MAGIC = 9, + COMM_START_WRITE_NETDUMP_ACK = 10, +}; + +typedef struct netdump_req_s { + u64 magic; + u32 nr; + u32 command; + u32 from; + u32 to; +} req_t; + +enum netdump_replies { + REPLY_NONE = 0, + REPLY_ERROR = 1, + REPLY_LOG = 2, + REPLY_MEM = 3, + REPLY_RESERVED = 4, + REPLY_HELLO = 5, + REPLY_NR_PAGES = 6, + REPLY_PAGE_SIZE = 7, + REPLY_START_NETDUMP = 8, + REPLY_END_NETDUMP = 9, + REPLY_REGS = 10, + REPLY_MAGIC = 11, + REPLY_START_WRITE_NETDUMP = 12, +}; + +typedef struct netdump_reply_s { + u32 nr; + u32 code; + u32 info; +} reply_t; + +#define HEADER_LEN (1 + sizeof(reply_t)) + + diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/linux/dumpdev.h 900-mjb3/include/linux/dumpdev.h --- 000-bk3/include/linux/dumpdev.h Wed Dec 31 16:00:00 1969 +++ 900-mjb3/include/linux/dumpdev.h Sat Mar 8 22:05:11 2003 @@ -0,0 +1,123 @@ +/* + * Generic dump device interfaces for flexible system dump + * (Enables variation of dump target types e.g disk, network, memory) + * + * These interfaces have evolved based on discussions on lkcd-devel. + * Eventually the intent is to support primary and secondary or + * alternate targets registered at the same time, with scope for + * situation based failover or multiple dump devices used for parallel + * dump i/o. + * + * Started: Oct 2002 - Suparna Bhattacharya (suparna@in.ibm.com) + * + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. 
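+ *
+ * Shape of a target registration (an illustrative sketch by the editor,
+ * not from the original patch; the my_* names are hypothetical):
+ *
+ *	static struct dump_dev_ops my_ops = {
+ *		.open    = my_open,
+ *		.release = my_release,
+ *		.silence = my_silence,
+ *		.resume  = my_resume,
+ *		.seek    = my_seek,
+ *		.write   = my_write,
+ *		.ready   = my_ready,
+ *	};
+ *
+ *	static struct dump_dev my_dev = {
+ *		.type_name = "mytarget",
+ *		.ops       = &my_ops,
+ *	};
+ *
+ * followed by dump_register_device(&my_dev).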
+ */ + +#ifndef _LINUX_DUMPDEV_H +#define _LINUX_DUMPDEV_H + +#include <linux/kdev_t.h> +#include <linux/list.h> +#include <linux/bio.h> + +/* Determined by the dump target (device) type */ + +struct dump_dev; + +struct dump_dev_ops { + int (*open)(struct dump_dev *, unsigned long); /* configure */ + int (*release)(struct dump_dev *); /* unconfigure */ + int (*silence)(struct dump_dev *); /* when dump starts */ + int (*resume)(struct dump_dev *); /* when dump is over */ + int (*seek)(struct dump_dev *, loff_t); + /* trigger a write (async in nature typically) */ + int (*write)(struct dump_dev *, void *, unsigned long); + /* not usually used during dump, but option available */ + int (*read)(struct dump_dev *, void *, unsigned long); + /* use to poll for completion */ + int (*ready)(struct dump_dev *, void *); + int (*ioctl)(struct dump_dev *, unsigned int, unsigned long); +}; + +struct dump_dev { + char type_name[32]; /* block, net-poll etc */ + unsigned long device_id; /* interpreted differently for various types */ + struct dump_dev_ops *ops; + struct list_head list; + loff_t curr_offset; +}; + +/* + * dump_dev type variations: + */ + +/* block */ +struct dump_blockdev { + struct dump_dev ddev; + kdev_t kdev_id; + struct block_device *bdev; + struct bio *bio; + loff_t start_offset; + loff_t limit; + int err; +}; + +static inline struct dump_blockdev *DUMP_BDEV(struct dump_dev *dev) +{ + return container_of(dev, struct dump_blockdev, ddev); +} + +/* Dump device / target operation wrappers */ +/* These assume that dump_dev is initialized to dump_config.dumper->dev */ + +extern struct dump_dev *dump_dev; + +static inline int dump_dev_open(unsigned long arg) +{ + return dump_dev->ops->open(dump_dev, arg); +} + +static inline int dump_dev_release(void) +{ + return dump_dev->ops->release(dump_dev); +} + +static inline int dump_dev_silence(void) +{ + return dump_dev->ops->silence(dump_dev); +} + +static inline int dump_dev_resume(void) +{ + return dump_dev->ops->resume(dump_dev); +} + +static inline int dump_dev_seek(loff_t offset) +{ + return dump_dev->ops->seek(dump_dev, offset); +} + +static inline int dump_dev_write(void *buf, unsigned long len) +{ + return dump_dev->ops->write(dump_dev, buf, len); +} + +static inline int dump_dev_ready(void *buf) +{ + return dump_dev->ops->ready(dump_dev, buf); +} + +static inline int dump_dev_ioctl(unsigned int cmd, unsigned long arg) +{ + if (!dump_dev->ops->ioctl) + return -EINVAL; + return dump_dev->ops->ioctl(dump_dev, cmd, arg); +} + +extern int dump_register_device(struct dump_dev *); +extern void dump_unregister_device(struct dump_dev *); + +#endif /* _LINUX_DUMPDEV_H */ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/linux/early_printk.h 900-mjb3/include/linux/early_printk.h --- 000-bk3/include/linux/early_printk.h Wed Dec 31 16:00:00 1969 +++ 900-mjb3/include/linux/early_printk.h Sat Mar 8 22:03:43 2003 @@ -0,0 +1,47 @@ +#ifndef __EARLY_PRINTK_H_ +#define __EARLY_PRINTK_H_ + +#ifdef CONFIG_EARLY_PRINTK +#include <linux/config.h> +#include <linux/init.h> +#include <linux/console.h> +#include <linux/string.h> +#include <asm/io.h> +#include <asm/early_printk.h> + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +/* Simple serial port output */ + +#define DEFAULT_BAUD 57600 +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /*
Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + + +void early_printk(const char *fmt, ...); +int __init setup_early_printk(char *opt); + +#else + +#define early_printk(...) do {} while(0) +#define setup_early_printk(X) do {} while(0) + +#endif + +#endif diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/linux/file.h 900-mjb3/include/linux/file.h --- 000-bk3/include/linux/file.h Sun Nov 17 20:29:20 2002 +++ 900-mjb3/include/linux/file.h Wed Mar 12 19:45:37 2003 @@ -40,6 +40,9 @@ extern void FASTCALL(set_close_on_exec(u extern void put_filp(struct file *); extern int get_unused_fd(void); extern void FASTCALL(put_unused_fd(unsigned int fd)); +struct kmem_cache_s; +extern void filp_ctor(void * objp, struct kmem_cache_s *cachep, unsigned long cflags); +extern void filp_dtor(void * objp, struct kmem_cache_s *cachep, unsigned long dflags); extern struct file ** alloc_fd_array(int); extern int expand_fd_array(struct files_struct *, int nr); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/linux/gdb.h 900-mjb3/include/linux/gdb.h --- 000-bk3/include/linux/gdb.h Wed Dec 31 16:00:00 1969 +++ 900-mjb3/include/linux/gdb.h Sat Mar 8 22:04:47 2003 @@ -0,0 +1,67 @@ +#ifndef _GDB_H_ +#define _GDB_H_ + +/* + * Copyright (C) 2001 Amit S. Kale + */ + +/* gdb locks */ +#define KGDB_MAX_NO_CPUS NR_CPUS + +extern int gdb_enter; /* 1 = enter debugger on boot */ +extern int gdb_ttyS; +extern int gdb_baud; +extern int gdb_initialized; + +extern int gdb_hook(void); +extern void breakpoint(void); + +typedef int gdb_debug_hook(int trapno, + int signo, + int err_code, + struct pt_regs *regs); +extern gdb_debug_hook *linux_debug_hook; + +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +extern spinlock_t kgdb_nmispinlock; +#else +extern unsigned kgdb_spinlock; +extern unsigned kgdb_nmispinlock; +#endif + +extern volatile int kgdb_memerr_expected; + +struct console; +void gdb_console_write(struct console *co, const char *s, + unsigned count); +void gdb_console_init(void); + +extern volatile int procindebug[KGDB_MAX_NO_CPUS]; + +#define KGDB_ASSERT(message, condition) do { \ + if (!(condition)) { \ + printk("kgdb assertion failed: %s\n", message); \ + asm ("int $0x3"); \ + } \ +} while (0) + +#ifdef CONFIG_KERNEL_ASSERTS +#define KERNEL_ASSERT(message, condition) KGDB_ASSERT(message, condition) +#else +#define KERNEL_ASSERT(message, condition) +#endif + +#define KA_VALID_ERRNO(errno) ((errno) > 0 && (errno) <= EMEDIUMTYPE) + +#define KA_VALID_PTR_ERR(ptr) KA_VALID_ERRNO(-PTR_ERR(ptr)) + +#define KA_VALID_KPTR(ptr) (!(ptr) || \ + ((void *)(ptr) >= (void *)PAGE_OFFSET && \ + (void *)(ptr) < ERR_PTR(-EMEDIUMTYPE))) + +#define KA_VALID_PTRORERR(errptr) (KA_VALID_KPTR(errptr) || KA_VALID_PTR_ERR(errptr)) + +#define KA_HELD_GKL() (current->lock_depth >= 0) + +#endif /* _GDB_H_ */ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/linux/kprobes.h 900-mjb3/include/linux/kprobes.h --- 000-bk3/include/linux/kprobes.h Wed Dec 31 16:00:00 1969 +++ 900-mjb3/include/linux/kprobes.h Sat Mar 8 22:04:56 2003 @@ -0,0 +1,60 @@ +#ifndef _LINUX_KPROBES_H +#define _LINUX_KPROBES_H +#include <linux/config.h> +#include <linux/list.h> +#include <linux/notifier.h> +#include <linux/smp.h> +#include <asm/kprobes.h> + +struct kprobe; +struct pt_regs; + +typedef void (*kprobe_pre_handler_t)(struct kprobe *, struct pt_regs *); +typedef void (*kprobe_post_handler_t)(struct kprobe *, struct pt_regs *, + unsigned long flags); +typedef int (*kprobe_fault_handler_t)(struct kprobe *, struct pt_regs *, + int trapnr); + +struct kprobe { + struct list_head list; + + /* location of the probe point */ + 
kprobe_opcode_t *addr; + + /* Called before addr is executed. */ + kprobe_pre_handler_t pre_handler; + + /* Called after addr is executed, unless... */ + kprobe_post_handler_t post_handler; + + /* ... called if executing addr causes a fault (eg. page fault). + * Return 1 if it handled fault, otherwise kernel will see it. */ + kprobe_fault_handler_t fault_handler; + + /* Saved opcode (which has been replaced with breakpoint) */ + kprobe_opcode_t opcode; +}; + +#ifdef CONFIG_KPROBES +/* Locks kprobe: irq must be disabled */ +void lock_kprobes(void); +void unlock_kprobes(void); + +/* kprobe running now on this CPU? */ +static inline int kprobe_running(void) +{ + extern unsigned int kprobe_cpu; + return kprobe_cpu == smp_processor_id(); +} + +/* Get the kprobe at this addr (if any). Must have called lock_kprobes */ +struct kprobe *get_kprobe(void *addr); + +int register_kprobe(struct kprobe *p); +void unregister_kprobe(struct kprobe *p); +#else +static inline int kprobe_running(void) { return 0; } +static inline int register_kprobe(struct kprobe *p) { return -ENOSYS; } +static inline void unregister_kprobe(struct kprobe *p) { } +#endif +#endif /* _LINUX_KPROBES_H */ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/linux/major.h 900-mjb3/include/linux/major.h --- 000-bk3/include/linux/major.h Fri Dec 13 23:18:13 2002 +++ 900-mjb3/include/linux/major.h Sat Mar 8 22:05:11 2003 @@ -165,6 +165,8 @@ #define OSST_MAJOR 206 /* OnStream-SCx0 SCSI tape */ +#define CRASH_DUMP_MAJOR 221 /* crash dump interface */ + #define IBM_TTY3270_MAJOR 227 /* Official allocations now */ #define IBM_FS3270_MAJOR 228 diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/linux/mm.h 900-mjb3/include/linux/mm.h --- 000-bk3/include/linux/mm.h Wed Mar 5 07:37:07 2003 +++ 900-mjb3/include/linux/mm.h Sat Mar 8 22:04:43 2003 @@ -171,6 +171,7 @@ struct page { struct pte_chain *chain;/* Reverse pte mapping pointer. * protected by PG_chainlock */ pte_addr_t direct; + int mapcount; } pte; unsigned long private; /* mapping-private opaque data */ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/linux/netdevice.h 900-mjb3/include/linux/netdevice.h --- 000-bk3/include/linux/netdevice.h Fri Jan 17 09:18:32 2003 +++ 900-mjb3/include/linux/netdevice.h Sat Mar 8 22:05:11 2003 @@ -422,6 +422,9 @@ struct net_device unsigned char *haddr); int (*neigh_setup)(struct net_device *dev, struct neigh_parms *); int (*accept_fastpath)(struct net_device *, struct dst_entry*); +#define HAVE_POLL_CONTROLLER + void (*poll_controller)(struct net_device *dev); + int (*rx_hook)(struct sk_buff *skb); /* open/release and usage marking */ struct module *owner; diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/linux/page-flags.h 900-mjb3/include/linux/page-flags.h --- 000-bk3/include/linux/page-flags.h Thu Feb 13 11:08:14 2003 +++ 900-mjb3/include/linux/page-flags.h Sat Mar 8 22:04:41 2003 @@ -74,6 +74,7 @@ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ #define PG_compound 19 /* Part of a compound page */ +#define PG_anon 20 /* Anonymous page */ /* * Global page accounting. One instance per CPU. 
Only unsigned longs are @@ -255,6 +256,10 @@ extern void get_full_page_state(struct p #define PageCompound(page) test_bit(PG_compound, &(page)->flags) #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) + +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) /* * The PageSwapCache predicate doesn't use a PG_flag at this time, diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/linux/sched.h 900-mjb3/include/linux/sched.h --- 000-bk3/include/linux/sched.h Sat Mar 8 10:16:06 2003 +++ 900-mjb3/include/linux/sched.h Sat Mar 8 22:05:12 2003 @@ -69,7 +69,10 @@ struct exec_domain; * the EXP_n values would be 1981, 2034 and 2043 if still using only * 11 bit fractions. */ -extern unsigned long avenrun[]; /* Load averages */ +extern unsigned long avenrun[]; /* Load averages */ +extern unsigned long tasks_running[3]; /* Real load averages */ +extern unsigned long cpu_tasks_running[3][NR_CPUS]; /* Real load averages per cpu */ + #define FSHIFT 11 /* nr of bits of precision */ #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */ static inline void task_lock(struct task_struct *p) diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/linux/sysctl.h 900-mjb3/include/linux/sysctl.h --- 000-bk3/include/linux/sysctl.h Tue Feb 25 23:03:51 2003 +++ 900-mjb3/include/linux/sysctl.h Sat Mar 8 22:05:11 2003 @@ -66,7 +66,8 @@ enum CTL_DEV=7, /* Devices */ CTL_BUS=8, /* Busses */ CTL_ABI=9, /* Binary emulation */ - CTL_CPU=10 /* CPU stuff (speed scaling, etc) */ + CTL_CPU=10, /* CPU stuff (speed scaling, etc) */ + CTL_SCHED=11, /* scheduler tunables */ }; /* CTL_BUS names: */ @@ -129,6 +130,8 @@ enum KERN_CADPID=54, /* int: PID of the process to notify on CAD */ KERN_PIDMAX=55, /* int: PID # limit */ KERN_CORE_PATTERN=56, /* string: pattern for core-file names */ + + KERN_DUMP=60, /* directory: dump parameters */ }; @@ -157,6 +160,21 @@ enum VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */ }; +/* Tunable scheduler parameters in /proc/sys/sched/ */ +enum { + SCHED_MIN_TIMESLICE=1, /* minimum process timeslice */ + SCHED_MAX_TIMESLICE=2, /* maximum process timeslice */ + SCHED_CHILD_PENALTY=3, /* penalty on fork to child */ + SCHED_PARENT_PENALTY=4, /* penalty on fork to parent */ + SCHED_EXIT_WEIGHT=5, /* penalty to parent of CPU hog child */ + SCHED_PRIO_BONUS_RATIO=6, /* percent of max prio given as bonus */ + SCHED_INTERACTIVE_DELTA=7, /* delta used to scale interactivity */ + SCHED_MAX_SLEEP_AVG=8, /* maximum sleep avg attainable */ + SCHED_STARVATION_LIMIT=9, /* no re-activation if expired is starved */ + SCHED_NODE_THRESHOLD=10, /* NUMA node rebalance threshold */ + SCHED_IDLE_NODE_REBALANCE_RATIO=11, /* how often to global balance */ + SCHED_BUSY_NODE_REBALANCE_RATIO=12, /* how often to global balance */ +}; /* CTL_NET names: */ enum diff -urpN -X /home/fletch/.diff.exclude 000-bk3/include/linux/timex.h 900-mjb3/include/linux/timex.h --- 000-bk3/include/linux/timex.h Sun Nov 17 20:29:21 2002 +++ 900-mjb3/include/linux/timex.h Sat Mar 8 22:03:44 2003 @@ -76,7 +76,7 @@ #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 #else -# error You lose.
+# error Please use a HZ value which is between 12 and 1536 #endif /* diff -urpN -X /home/fletch/.diff.exclude 000-bk3/init/Makefile 900-mjb3/init/Makefile --- 000-bk3/init/Makefile Wed Mar 5 07:37:08 2003 +++ 900-mjb3/init/Makefile Sat Mar 8 22:05:11 2003 @@ -1,6 +1,10 @@ # # Makefile for the linux kernel. # +ifdef CONFIG_CRASH_DUMP +EXTRA_TARGETS := kerntypes.o +CFLAGS_kerntypes.o := -gstabs +endif obj-y := main.o version.o mounts.o initramfs.o mounts-y := do_mounts.o @@ -23,3 +27,4 @@ $(obj)/version.o: include/linux/compile. include/linux/compile.h: FORCE @echo -n ' GEN $@' @sh $(srctree)/scripts/mkcompile_h $@ "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CC) $(CFLAGS)" + diff -urpN -X /home/fletch/.diff.exclude 000-bk3/init/kerntypes.c 900-mjb3/init/kerntypes.c --- 000-bk3/init/kerntypes.c Wed Dec 31 16:00:00 1969 +++ 900-mjb3/init/kerntypes.c Sat Mar 8 22:05:11 2003 @@ -0,0 +1,24 @@ +/* + * kerntypes.c + * + * Copyright (C) 2000 Tom Morano (tjm@sgi.com) and + * Matt D. Robinson (yakker@alacritech.com) + * + * Dummy module that includes headers for all kernel types of interest. + * The kernel type information is used by the lcrash utility when + * analyzing system crash dumps or the live system. Using the type + * information for the running system, rather than kernel header files, + * makes for a more flexible and robust analysis tool. + * + * This source code is released under version 2 of the GNU GPL. + */ +#include +#include +#include +#include +#include + +void +kerntypes_dummy(void) +{ +} diff -urpN -X /home/fletch/.diff.exclude 000-bk3/init/main.c 900-mjb3/init/main.c --- 000-bk3/init/main.c Wed Mar 5 07:37:08 2003 +++ 900-mjb3/init/main.c Sat Mar 8 22:05:11 2003 @@ -35,6 +35,7 @@ #include #include #include +#include <linux/early_printk.h> #include #include @@ -43,6 +44,10 @@ #include #endif +#ifdef CONFIG_X86_REMOTE_DEBUG +#include <linux/gdb.h> +#endif + /* * Versions of gcc older than that listed below may actually compile * and link okay, but the end product can have subtle run time bugs. @@ -88,6 +93,16 @@ extern void ipc_init(void); int system_running = 0; /* + * The kernel_magic value represents the address of _end, which allows + * namelist tools to "match" each other respectively. That way a tool + * that looks at /dev/mem can verify that it is using the right System.map + * file -- if kernel_magic doesn't equal the namelist value of _end, + * something's wrong. + */ +extern unsigned long _end; +unsigned long *kernel_magic = &_end; + +/* * Boot command-line arguments */ #define MAX_INIT_ARGS 8 @@ -375,6 +390,7 @@ asmlinkage void __init start_kernel(void */ lock_kernel(); printk(linux_banner); + setup_early_printk(&command_line); setup_arch(&command_line); setup_per_cpu_areas(); @@ -445,6 +461,12 @@ asmlinkage void __init start_kernel(void * make syscalls (and thus be locked).
*/ init_idle(current, smp_processor_id()); + +#ifdef CONFIG_X86_REMOTE_DEBUG + if (gdb_enter) { + gdb_hook(); /* right at boot time */ + } +#endif /* Do the rest non-__init'ed, we're now alive */ rest_init(); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/kernel/Makefile 900-mjb3/kernel/Makefile --- 000-bk3/kernel/Makefile Tue Feb 25 23:03:51 2003 +++ 900-mjb3/kernel/Makefile Sat Mar 8 22:04:56 2003 @@ -18,6 +18,8 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o obj-$(CONFIG_COMPAT) += compat.o +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_KPROBES) += kprobes.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff -urpN -X /home/fletch/.diff.exclude 000-bk3/kernel/early_printk.c 900-mjb3/kernel/early_printk.c --- 000-bk3/kernel/early_printk.c Wed Dec 31 16:00:00 1969 +++ 900-mjb3/kernel/early_printk.c Sat Mar 8 22:03:43 2003 @@ -0,0 +1,209 @@ +#include <linux/early_printk.h> +#include <linux/console.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/string.h> +#include <asm/processor.h> + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +static int current_ypos = 1, current_xpos = 0; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= MAX_YPOS) { + /* scroll 1 line up */ + for(k = 1, j = 0; k < MAX_YPOS; k++, j++) { + for(i = 0; i < MAX_XPOS; i++) { + writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), + VGABASE + 2*(MAX_XPOS*j + i)); + } + } + for(i = 0; i < MAX_XPOS; i++) { + writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); + } + current_ypos = MAX_YPOS-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(MAX_XPOS*current_ypos + current_xpos++)); + if (current_xpos >= MAX_XPOS) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ + +int early_serial_base; /* ttyS0 */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + rep_nop(); + outb(ch, early_serial_base + TXR); + return timeout ?
0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + early_serial_putc(*s); + if (*s == '\n') + early_serial_putc('\r'); + s++; + } +} + +static __init void early_serial_init(char *opt) +{ + unsigned char c; + unsigned divisor, baud = DEFAULT_BAUD; + static int bases[] = SERIAL_BASES; + char *s, *e; + + early_serial_base = bases[0]; + + if (*opt == ',') + ++opt; + + s = strsep(&opt, ","); + if (s != NULL) { + unsigned port; + if (!strncmp(s,"0x",2)) + early_serial_base = simple_strtoul(s, &e, 16); + else { + if (!strncmp(s,"ttyS",4)) + s+=4; + port = simple_strtoul(s, &e, 10); + if (port > (SERIAL_BASES_LEN-1) || s == e) + port = 0; + early_serial_base = bases[port]; + } + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + s = strsep(&opt, ","); + if (s != NULL) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Direct interface for emergencies */ +struct console *early_console = &early_vga_console; +static int early_console_initialized = 0; + +void early_printk(const char *fmt, ...) +{ + char buf[512]; + int n; + va_list ap; + va_start(ap,fmt); + n = vsnprintf(buf,512,fmt,ap); + early_console->write(early_console,buf,n); + va_end(ap); +} + +static int keep_early; + +int __init setup_early_printk(char *opt) +{ + char *space, *s; + char buf[256]; + + s = strstr(opt, "earlyprintk="); + if (s == NULL) + return -1; + opt = s+12; + + if (early_console_initialized) + return -1; + + strncpy(buf,opt,256); + buf[255] = 0; + space = strchr(buf, ' '); + if (space) + *space = 0; + + if (strstr(buf,"keep")) + keep_early = 1; + + if (!strncmp(buf, "serial", 6)) { + early_serial_init(buf + 6); + early_console = &early_serial_console; + } else if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf); + early_console = &early_serial_console; + } else if (!strncmp(buf, "vga", 3)) { + early_console = &early_vga_console; + } else { + early_console = NULL; + return -1; + } + early_console_initialized = 1; + register_console(early_console); + early_printk( "early printk console registered\n" ); + return 0; +} + +void __init disable_early_printk(void) +{ + if (!early_console_initialized || !early_console) + return; + if (!keep_early) { + printk("disabling early console...\n"); + unregister_console(early_console); + early_console_initialized = 0; + } else { + printk("keeping early console.\n"); + } +} + +/* syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. 
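+   Examples (illustrative only, exercising the parser above):
+	earlyprintk=vga
+	earlyprintk=serial,ttyS1,115200
+	earlyprintk=serial,0x3f8,57600,keep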
*/ +__setup("earlyprintk=", setup_early_printk); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/kernel/fork.c 900-mjb3/kernel/fork.c --- 000-bk3/kernel/fork.c Sat Mar 8 10:16:06 2003 +++ 900-mjb3/kernel/fork.c Sat Mar 8 22:05:09 2003 @@ -197,7 +197,10 @@ void __init fork_init(unsigned long memp * value: the thread structures can take up at most half * of memory. */ - max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; + if (THREAD_SIZE >= PAGE_SIZE) + max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; + else + max_threads = (mempages * (PAGE_SIZE/THREAD_SIZE)) / 8; /* * we need to allow at least 20 threads to boot a system */ diff -urpN -X /home/fletch/.diff.exclude 000-bk3/kernel/kprobes.c 900-mjb3/kernel/kprobes.c --- 000-bk3/kernel/kprobes.c Wed Dec 31 16:00:00 1969 +++ 900-mjb3/kernel/kprobes.c Sat Mar 8 22:04:56 2003 @@ -0,0 +1,89 @@ +/* Support for kernel probes. + (C) 2002 Vamsi Krishna S . +*/ +#include +#include +#include +#include +#include +#include +#include + +#define KPROBE_HASH_BITS 6 +#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) + +static struct list_head kprobe_table[KPROBE_TABLE_SIZE]; + +unsigned int kprobe_cpu = NR_CPUS; +static spinlock_t kprobe_lock = SPIN_LOCK_UNLOCKED; + +/* Locks kprobe: irqs must be disabled */ +void lock_kprobes(void) +{ + spin_lock(&kprobe_lock); + kprobe_cpu = smp_processor_id(); +} + +void unlock_kprobes(void) +{ + kprobe_cpu = NR_CPUS; + spin_unlock(&kprobe_lock); +} + +/* You have to be holding the kprobe_lock */ +struct kprobe *get_kprobe(void *addr) +{ + struct list_head *head, *tmp; + + head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; + list_for_each(tmp, head) { + struct kprobe *p = list_entry(tmp, struct kprobe, list); + if (p->addr == addr) + return p; + } + return NULL; +} + +int register_kprobe(struct kprobe *p) +{ + int ret = 0; + + spin_lock_irq(&kprobe_lock); + if (get_kprobe(p->addr)) { + ret = -EEXIST; + goto out; + } + list_add(&p->list, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); + + p->opcode = *p->addr; + *p->addr = BREAKPOINT_INSTRUCTION; + flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t)); + out: + spin_unlock_irq(&kprobe_lock); + return ret; +} + +void unregister_kprobe(struct kprobe *p) +{ + spin_lock_irq(&kprobe_lock); + *p->addr = p->opcode; + list_del(&p->list); + flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t)); + spin_unlock_irq(&kprobe_lock); +} + +static int __init init_kprobes(void) +{ + int i; + + /* FIXME allocate the probe table, currently defined statically */ + /* initialize all list heads */ + for (i = 0; i < KPROBE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kprobe_table[i]); + + return 0; +} +__initcall(init_kprobes); + +EXPORT_SYMBOL_GPL(register_kprobe); +EXPORT_SYMBOL_GPL(unregister_kprobe); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/kernel/ksyms.c 900-mjb3/kernel/ksyms.c --- 000-bk3/kernel/ksyms.c Wed Mar 5 07:37:08 2003 +++ 900-mjb3/kernel/ksyms.c Sat Mar 8 22:05:11 2003 @@ -57,6 +57,8 @@ #include #include #include +#include +#include #include #if defined(CONFIG_PROC_FS) @@ -471,7 +473,10 @@ EXPORT_SYMBOL(sleep_on); EXPORT_SYMBOL(sleep_on_timeout); EXPORT_SYMBOL(interruptible_sleep_on); EXPORT_SYMBOL(interruptible_sleep_on_timeout); -EXPORT_SYMBOL(schedule); +EXPORT_SYMBOL(do_schedule); +#ifdef CONFIG_KGDB_THREAD +EXPORT_SYMBOL(kern_schedule); +#endif #ifdef CONFIG_PREEMPT EXPORT_SYMBOL(preempt_schedule); #endif @@ -620,3 +625,8 @@ EXPORT_SYMBOL(dump_stack); EXPORT_SYMBOL(ptrace_notify); EXPORT_SYMBOL(current_kernel_time); + +#ifdef 
CONFIG_CRASH_DUMP_MODULE +EXPORT_SYMBOL(min_low_pfn); +EXPORT_SYMBOL(dump_oncpu); +#endif diff -urpN -X /home/fletch/.diff.exclude 000-bk3/kernel/panic.c 900-mjb3/kernel/panic.c --- 000-bk3/kernel/panic.c Thu Jan 9 19:16:15 2003 +++ 900-mjb3/kernel/panic.c Sat Mar 8 22:05:11 2003 @@ -53,6 +53,7 @@ NORET_TYPE void panic(const char * fmt, va_start(args, fmt); vsprintf(buf, fmt, args); va_end(args); + printk(KERN_EMERG "Kernel panic: %s\n",buf); if (in_interrupt()) printk(KERN_EMERG "In interrupt handler - not syncing\n"); @@ -62,14 +63,13 @@ NORET_TYPE void panic(const char * fmt, sys_sync(); bust_spinlocks(0); + notifier_call_chain(&panic_notifier_list, 0, buf); + #ifdef CONFIG_SMP smp_send_stop(); #endif - notifier_call_chain(&panic_notifier_list, 0, buf); - - if (panic_timeout > 0) - { + if (panic_timeout > 0) { /* * Delay timeout seconds before rebooting the machine. * We can't use the "normal" timers since we just panicked.. diff -urpN -X /home/fletch/.diff.exclude 000-bk3/kernel/sched.c 900-mjb3/kernel/sched.c --- 000-bk3/kernel/sched.c Sat Mar 8 10:16:06 2003 +++ 900-mjb3/kernel/sched.c Sat Mar 8 22:05:12 2003 @@ -33,6 +33,15 @@ #include #include +#ifdef CONFIG_NUMA +#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) +#else +#define cpu_to_node_mask(cpu) (cpu_online_map) +#endif + +/* used to soft spin in sched while dump is in progress */ +int dump_oncpu; + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -58,16 +67,27 @@ * maximum timeslice is 200 msecs. Timeslices get refilled after * they expire. */ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (200 * HZ / 1000) -#define CHILD_PENALTY 50 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (10*HZ) -#define STARVATION_LIMIT (10*HZ) -#define NODE_THRESHOLD 125 +int min_timeslice = (10 * HZ) / 1000; +int max_timeslice = (200 * HZ) / 1000; +int child_penalty = 50; +int parent_penalty = 100; +int exit_weight = 3; +int prio_bonus_ratio = 25; +int interactive_delta = 2; +int max_sleep_avg = 10 * HZ; +int starvation_limit = 10 * HZ; +int node_threshold = 125; + +#define MIN_TIMESLICE (min_timeslice) +#define MAX_TIMESLICE (max_timeslice) +#define CHILD_PENALTY (child_penalty) +#define PARENT_PENALTY (parent_penalty) +#define EXIT_WEIGHT (exit_weight) +#define PRIO_BONUS_RATIO (prio_bonus_ratio) +#define INTERACTIVE_DELTA (interactive_delta) +#define MAX_SLEEP_AVG (max_sleep_avg) +#define STARVATION_LIMIT (starvation_limit) +#define NODE_THRESHOLD (node_threshold) /* * If a task is 'interactive' then we reinsert it in the active @@ -154,10 +174,9 @@ struct runqueue { task_t *curr, *idle; struct mm_struct *prev_mm; prio_array_t *active, *expired, arrays[2]; - int prev_nr_running[NR_CPUS]; + int prev_cpu_load[NR_CPUS]; #ifdef CONFIG_NUMA atomic_t *node_nr_running; - unsigned int nr_balanced; int prev_node_load[MAX_NUMNODES]; #endif task_t *migration_thread; @@ -225,6 +244,111 @@ __init void node_nr_running_init(void) #endif /* CONFIG_NUMA */ + +struct schedstat { + /* sys_sched_yield stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule stats */ + unsigned long sched_noswitch; + unsigned long sched_switch; + unsigned long sched_cnt; + + /* load_balance stats */ + unsigned long lb_imbalance; + unsigned long lb_idle; + unsigned long lb_busy; + unsigned long lb_resched; + unsigned long lb_cnt; 
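+	/* lb_nobusy: no busier queue was found; lb_bnode: cross-node
+	 * balance passes started from balance_node() */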
+ unsigned long lb_nobusy; + unsigned long lb_bnode; + + /* pull_task stats */ + unsigned long pt_gained; + unsigned long pt_lost; + unsigned long pt_node_gained; + unsigned long pt_node_lost; + + /* balance_node stats */ + unsigned long bn_cnt; + unsigned long bn_idle; +} ____cacheline_aligned; + +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 2 + +struct schedstat schedstats[NR_CPUS]; + +/* + * This could conceivably exceed a page's worth of output on machines with + * large number of cpus, where large == about 4096/100 or 40ish. Start + * worrying when we pass 32, probably. Then this has to stop being a + * "simple" entry in proc/proc_misc.c and needs to be an actual seq_file. + */ +int schedstats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct schedstat sums; + int i, len; + + memset(&sums, 0, sizeof(sums)); + len = sprintf(page, "version %d\n", SCHEDSTAT_VERSION); + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) continue; + sums.yld_exp_empty += schedstats[i].yld_exp_empty; + sums.yld_act_empty += schedstats[i].yld_act_empty; + sums.yld_both_empty += schedstats[i].yld_both_empty; + sums.yld_cnt += schedstats[i].yld_cnt; + sums.sched_noswitch += schedstats[i].sched_noswitch; + sums.sched_switch += schedstats[i].sched_switch; + sums.sched_cnt += schedstats[i].sched_cnt; + sums.lb_idle += schedstats[i].lb_idle; + sums.lb_busy += schedstats[i].lb_busy; + sums.lb_resched += schedstats[i].lb_resched; + sums.lb_cnt += schedstats[i].lb_cnt; + sums.lb_imbalance += schedstats[i].lb_imbalance; + sums.lb_nobusy += schedstats[i].lb_nobusy; + sums.lb_bnode += schedstats[i].lb_bnode; + sums.pt_node_gained += schedstats[i].pt_node_gained; + sums.pt_node_lost += schedstats[i].pt_node_lost; + sums.pt_gained += schedstats[i].pt_gained; + sums.pt_lost += schedstats[i].pt_lost; + sums.bn_cnt += schedstats[i].bn_cnt; + sums.bn_idle += schedstats[i].bn_idle; + len += sprintf(page + len, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu " + "%lu %lu %lu %lu %lu %lu %lu %lu\n", + i, schedstats[i].yld_both_empty, + schedstats[i].yld_act_empty, schedstats[i].yld_exp_empty, + schedstats[i].yld_cnt, schedstats[i].sched_noswitch, + schedstats[i].sched_switch, schedstats[i].sched_cnt, + schedstats[i].lb_idle, schedstats[i].lb_busy, + schedstats[i].lb_resched, + schedstats[i].lb_cnt, schedstats[i].lb_imbalance, + schedstats[i].lb_nobusy, schedstats[i].lb_bnode, + schedstats[i].pt_gained, schedstats[i].pt_lost, + schedstats[i].pt_node_gained, schedstats[i].pt_node_lost, + schedstats[i].bn_cnt, schedstats[i].bn_idle); + } + len += sprintf(page + len, + "totals %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu " + "%lu %lu %lu %lu %lu %lu %lu\n", + sums.yld_both_empty, sums.yld_act_empty, sums.yld_exp_empty, + sums.yld_cnt, sums.sched_noswitch, sums.sched_switch, + sums.sched_cnt, sums.lb_idle, sums.lb_busy, sums.lb_resched, + sums.lb_cnt, sums.lb_imbalance, sums.lb_nobusy, sums.lb_bnode, + sums.pt_gained, sums.pt_lost, sums.pt_node_gained, + sums.pt_node_lost, sums.bn_cnt, sums.bn_idle); + + return len; +} + /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. 
Note the ordering: we can safely lookup the task_rq without @@ -685,7 +809,6 @@ static inline task_t * context_switch(ru return prev; } - /* * nr_running, nr_uninterruptible and nr_context_switches: * @@ -703,6 +826,11 @@ unsigned long nr_running(void) return sum; } +unsigned long nr_running_cpu(int cpu) +{ + return cpu_rq(cpu)->nr_running; +} + unsigned long nr_uninterruptible(void) { unsigned long i, sum = 0; @@ -867,31 +995,11 @@ static int find_busiest_node(int this_no return node; } -static inline unsigned long cpus_to_balance(int this_cpu, runqueue_t *this_rq) -{ - int this_node = cpu_to_node(this_cpu); - /* - * Avoid rebalancing between nodes too often. - * We rebalance globally once every NODE_BALANCE_RATE load balances. - */ - if (++(this_rq->nr_balanced) == NODE_BALANCE_RATE) { - int node = find_busiest_node(this_node); - this_rq->nr_balanced = 0; - if (node >= 0) - return (node_to_cpumask(node) | (1UL << this_cpu)); - } - return node_to_cpumask(this_node); -} - -#else /* !CONFIG_NUMA */ - -static inline unsigned long cpus_to_balance(int this_cpu, runqueue_t *this_rq) -{ - return cpu_online_map; -} - #endif /* CONFIG_NUMA */ +int idle_node_rebalance_ratio = 10; +int busy_node_rebalance_ratio = 100; + #if CONFIG_SMP /* @@ -909,10 +1017,10 @@ static inline unsigned int double_lock_b spin_lock(&busiest->lock); spin_lock(&this_rq->lock); /* Need to recalculate nr_running */ - if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu])) nr_running = this_rq->nr_running; else - nr_running = this_rq->prev_nr_running[this_cpu]; + nr_running = this_rq->prev_cpu_load[this_cpu]; } else spin_lock(&busiest->lock); } @@ -949,10 +1057,10 @@ static inline runqueue_t *find_busiest_q * that case we are less picky about moving a task across CPUs and * take what can be taken. */ - if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu])) nr_running = this_rq->nr_running; else - nr_running = this_rq->prev_nr_running[this_cpu]; + nr_running = this_rq->prev_cpu_load[this_cpu]; busiest = NULL; max_load = 1; @@ -961,11 +1069,11 @@ static inline runqueue_t *find_busiest_q continue; rq_src = cpu_rq(i); - if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i])) + if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i])) load = rq_src->nr_running; else - load = this_rq->prev_nr_running[i]; - this_rq->prev_nr_running[i] = rq_src->nr_running; + load = this_rq->prev_cpu_load[i]; + this_rq->prev_cpu_load[i] = rq_src->nr_running; if ((load > max_load) && (rq_src != this_rq)) { busiest = rq_src; @@ -1003,6 +1111,12 @@ out: */ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu) { + if (cpu_to_node(this_cpu) != cpu_to_node(src_rq - runqueues)) { + schedstats[this_cpu].pt_node_gained++; + schedstats[src_rq - runqueues].pt_node_lost++; + } + schedstats[this_cpu].pt_gained++; + schedstats[src_rq - runqueues].pt_lost++; dequeue_task(p, src_array); nr_running_dec(src_rq); set_task_cpu(p, this_cpu); @@ -1029,7 +1143,7 @@ static inline void pull_task(runqueue_t * We call this with the current runqueue locked, * irqs disabled. 
*/ -static void load_balance(runqueue_t *this_rq, int idle) +static void load_balance(runqueue_t *this_rq, int idle, unsigned long cpumask) { int imbalance, idx, this_cpu = smp_processor_id(); runqueue_t *busiest; @@ -1037,11 +1151,14 @@ static void load_balance(runqueue_t *thi struct list_head *head, *curr; task_t *tmp; - busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, - cpus_to_balance(this_cpu, this_rq)); - if (!busiest) + schedstats[this_cpu].lb_cnt++; + busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask); + if (!busiest) { + schedstats[this_cpu].lb_nobusy++; goto out; + } + schedstats[this_cpu].lb_imbalance += imbalance; /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to @@ -1113,21 +1230,80 @@ out: * frequency and balancing agressivity depends on whether the CPU is * idle or not. * - * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on + * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on * systems with HZ=100, every 10 msecs.) + * + * On NUMA, do a node-rebalance every 400 msecs. */ -#define BUSY_REBALANCE_TICK (HZ/4 ?: 1) #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) +#define BUSY_REBALANCE_TICK (HZ/5 ?: 1) + +#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * idle_node_rebalance_ratio) +#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * busy_node_rebalance_ratio) -static inline void idle_tick(runqueue_t *rq) +#if CONFIG_NUMA +static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) { - if (jiffies % IDLE_REBALANCE_TICK) - return; - spin_lock(&rq->lock); - load_balance(rq, 1); - spin_unlock(&rq->lock); + int node = find_busiest_node(cpu_to_node(this_cpu)); + unsigned long cpumask, this_cpumask = 1UL << this_cpu; + + schedstats[this_cpu].bn_cnt++; + if (idle) + schedstats[this_cpu].bn_idle++; + if (node >= 0) { + cpumask = node_to_cpumask(node) | this_cpumask; + spin_lock(&this_rq->lock); + schedstats[this_cpu].lb_bnode++; + load_balance(this_rq, idle, cpumask); + spin_unlock(&this_rq->lock); + } } +#endif + +static void rebalance_tick(runqueue_t *this_rq, int idle) +{ + int this_cpu = smp_processor_id(); + unsigned long j = jiffies; + /* + * First do inter-node rebalancing, then intra-node rebalancing, + * if both events happen in the same tick. The inter-node + * rebalancing does not necessarily have to create a perfect + * balance within the node, since we load-balance the most loaded + * node with the current CPU. (ie. other CPUs in the local node + * are not balanced.) 
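+ *
+ * For example, with HZ=1000 and the default ratios above (idle 10,
+ * busy 100), an idle CPU balances within its node every tick and
+ * across nodes every 10 ticks, while a busy CPU balances within its
+ * node every 200 msecs and across nodes every 20 seconds.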
+ */ + if (idle) { +#if CONFIG_NUMA + if (!(j % IDLE_NODE_REBALANCE_TICK)) + balance_node(this_rq, idle, this_cpu); +#endif + if (!(j % IDLE_REBALANCE_TICK)) { + spin_lock(&this_rq->lock); + schedstats[this_cpu].lb_idle++; + load_balance(this_rq, 0, cpu_to_node_mask(this_cpu)); + spin_unlock(&this_rq->lock); + } + return; + } +#if CONFIG_NUMA + if (!(j % BUSY_NODE_REBALANCE_TICK)) + balance_node(this_rq, idle, this_cpu); +#endif + if (!(j % BUSY_REBALANCE_TICK)) { + spin_lock(&this_rq->lock); + schedstats[this_cpu].lb_busy++; + load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); + spin_unlock(&this_rq->lock); + } +} +#else +/* + * on UP we do not need to balance between CPUs: + */ +static inline void rebalance_tick(runqueue_t *this_rq, int idle) +{ +} #endif DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } }; @@ -1170,9 +1346,7 @@ void scheduler_tick(int user_ticks, int kstat_cpu(cpu).cpustat.iowait += sys_ticks; else kstat_cpu(cpu).cpustat.idle += sys_ticks; -#if CONFIG_SMP - idle_tick(rq); -#endif + rebalance_tick(rq, 1); return; } if (TASK_NICE(p) > 0) @@ -1228,11 +1402,8 @@ void scheduler_tick(int user_ticks, int enqueue_task(p, rq->active); } out: -#if CONFIG_SMP - if (!(jiffies % BUSY_REBALANCE_TICK)) - load_balance(rq, 0); -#endif spin_unlock(&rq->lock); + rebalance_tick(rq, 0); } void scheduling_functions_start_here(void) { } @@ -1240,19 +1411,28 @@ void scheduling_functions_start_here(voi /* * schedule() is the main scheduler function. */ -asmlinkage void schedule(void) +asmlinkage void do_schedule(void) { task_t *prev, *next; runqueue_t *rq; prio_array_t *array; struct list_head *queue; - int idx; + int idx, mycpu = smp_processor_id(); + + /* + * If a crash dump is in progress, the other CPUs + * need to wait until it completes. + * NB: this code is optimized away for kernels without dumping enabled. + */ + if (unlikely(dump_oncpu)) + goto dump_scheduling_disabled; /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ + schedstats[mycpu].sched_cnt++; if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { if (unlikely(in_atomic())) { printk(KERN_ERR "bad: scheduling while atomic!\n"); @@ -1291,7 +1471,8 @@ need_resched: pick_next_task: if (unlikely(!rq->nr_running)) { #if CONFIG_SMP - load_balance(rq, 1); + schedstats[mycpu].lb_resched++; + load_balance(rq, 1, cpu_to_node_mask(smp_processor_id())); if (rq->nr_running) goto pick_next_task; #endif @@ -1305,11 +1486,13 @@ pick_next_task: /* * Switch the active and expired arrays.
*/ + schedstats[mycpu].sched_switch++; rq->active = rq->expired; rq->expired = array; array = rq->active; rq->expired_timestamp = 0; } + schedstats[mycpu].sched_noswitch++; idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; @@ -1336,6 +1519,16 @@ switch_tasks: preempt_enable_no_resched(); if (test_thread_flag(TIF_NEED_RESCHED)) goto need_resched; + + return; + + dump_scheduling_disabled: + /* allow scheduling only if this is the dumping cpu */ + if (dump_oncpu != smp_processor_id()+1) { + while (dump_oncpu) + cpu_relax(); + } + return; } #ifdef CONFIG_PREEMPT @@ -1474,6 +1667,22 @@ void complete_all(struct completion *x) spin_unlock_irqrestore(&x->wait.lock, flags); } +asmlinkage void user_schedule(void) +{ +#ifdef CONFIG_KGDB_THREAD + current->thread.kgdbregs = NULL; +#endif + do_schedule(); +} + +#ifdef CONFIG_KGDB_THREAD +asmlinkage void kern_do_schedule(struct pt_regs regs) +{ + current->thread.kgdbregs = &regs; + do_schedule(); +} +#endif + void wait_for_completion(struct completion *x) { might_sleep(); @@ -1966,6 +2175,7 @@ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); prio_array_t *array = current->array; + int mycpu = smp_processor_id(); /* * We implement yielding by moving the task into the expired @@ -1974,7 +2184,15 @@ asmlinkage long sys_sched_yield(void) * (special rule: RT tasks will just roundrobin in the active * array.) */ + schedstats[mycpu].yld_cnt++; if (likely(!rt_task(current))) { + if (current->array->nr_active == 1) { + schedstats[mycpu].yld_act_empty++; + if (!rq->expired->nr_active) + schedstats[mycpu].yld_both_empty++; + } else if (!rq->expired->nr_active) { + schedstats[mycpu].yld_exp_empty++; + } dequeue_task(current, array); enqueue_task(current, rq->expired); } else { diff -urpN -X /home/fletch/.diff.exclude 000-bk3/kernel/sysctl.c 900-mjb3/kernel/sysctl.c --- 000-bk3/kernel/sysctl.c Mon Dec 16 21:50:51 2002 +++ 900-mjb3/kernel/sysctl.c Sat Mar 8 22:04:05 2003 @@ -55,6 +55,18 @@ extern char core_pattern[]; extern int cad_pid; extern int pid_max; extern int sysctl_lower_zone_protection; +extern int min_timeslice; +extern int max_timeslice; +extern int child_penalty; +extern int parent_penalty; +extern int exit_weight; +extern int prio_bonus_ratio; +extern int interactive_delta; +extern int max_sleep_avg; +extern int starvation_limit; +extern int node_threshold; +extern int idle_node_rebalance_ratio; +extern int busy_node_rebalance_ratio; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -112,6 +124,7 @@ static struct ctl_table_header root_tabl static ctl_table kern_table[]; static ctl_table vm_table[]; +static ctl_table sched_table[]; #ifdef CONFIG_NET extern ctl_table net_table[]; #endif @@ -156,6 +169,7 @@ static ctl_table root_table[] = { {CTL_FS, "fs", NULL, 0, 0555, fs_table}, {CTL_DEBUG, "debug", NULL, 0, 0555, debug_table}, {CTL_DEV, "dev", NULL, 0, 0555, dev_table}, + {CTL_SCHED, "sched", NULL, 0, 0555, sched_table}, {0} }; @@ -358,7 +372,49 @@ static ctl_table debug_table[] = { static ctl_table dev_table[] = { {0} -}; +}; + +static ctl_table sched_table[] = { + {SCHED_MAX_TIMESLICE, "max_timeslice", &max_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_MIN_TIMESLICE, "min_timeslice", &min_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_CHILD_PENALTY, "child_penalty", &child_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+ &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PARENT_PENALTY, "parent_penalty", &parent_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_EXIT_WEIGHT, "exit_weight", &exit_weight, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PRIO_BONUS_RATIO, "prio_bonus_ratio", &prio_bonus_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_INTERACTIVE_DELTA, "interactive_delta", &interactive_delta, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_MAX_SLEEP_AVG, "max_sleep_avg", &max_sleep_avg, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_STARVATION_LIMIT, "starvation_limit", &starvation_limit, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_NODE_THRESHOLD, "node_threshold", &node_threshold, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_IDLE_NODE_REBALANCE_RATIO, "idle_node_rebalance_ratio", + &idle_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_BUSY_NODE_REBALANCE_RATIO, "busy_node_rebalance_ratio", + &busy_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {0} +}; extern void init_irq_proc (void); diff -urpN -X /home/fletch/.diff.exclude 000-bk3/kernel/timer.c 900-mjb3/kernel/timer.c --- 000-bk3/kernel/timer.c Wed Mar 5 07:37:08 2003 +++ 900-mjb3/kernel/timer.c Sat Mar 8 22:05:12 2003 @@ -736,6 +736,8 @@ static unsigned long count_active_tasks( * Requires xtime_lock to access. */ unsigned long avenrun[3]; +unsigned long tasks_running[3]; +unsigned long cpu_tasks_running[3][NR_CPUS]; /* * calc_load - given tick count, update the avenrun load estimates.
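For reference, tasks_running[] and cpu_tasks_running[] above decay with the
same CALC_LOAD fixed-point average used for avenrun[]. A minimal user-space
sketch of that arithmetic (an editor's illustration, not part of the patch;
FSHIFT, FIXED_1 and EXP_1 mirror the usual <linux/sched.h> constants, and
the constant input of three runnable tasks is made up):

	#include <stdio.h>

	#define FSHIFT	11			/* bits of fixed-point precision */
	#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed-point */
	#define EXP_1	1884			/* 1/exp(5sec/1min) in fixed-point */

	#define CALC_LOAD(load, exp, n) \
		load *= (exp); \
		load += (n) * (FIXED_1 - (exp)); \
		load >>= FSHIFT;

	int main(void)
	{
		unsigned long avg = 0;
		int i;

		/* feed twelve 5-second samples of 3 runnable tasks */
		for (i = 0; i < 12; i++) {
			CALC_LOAD(avg, EXP_1, 3UL * FIXED_1);
			printf("sample %2d: %lu.%02lu\n", i,
			       avg >> FSHIFT,
			       (avg & (FIXED_1 - 1)) * 100 / FIXED_1);
		}
		return 0;
	}

The average climbs toward 3.00, exactly as the one-minute field of
/proc/loadavg would for a sustained load of three runnable tasks.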
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/kernel/timer.c 900-mjb3/kernel/timer.c
--- 000-bk3/kernel/timer.c	Wed Mar 5 07:37:08 2003
+++ 900-mjb3/kernel/timer.c	Sat Mar 8 22:05:12 2003
@@ -736,6 +736,8 @@ static unsigned long count_active_tasks(
  * Requires xtime_lock to access.
  */
 unsigned long avenrun[3];
+unsigned long tasks_running[3];
+unsigned long cpu_tasks_running[3][NR_CPUS];

 /*
  * calc_load - given tick count, update the avenrun load estimates.
@@ -743,8 +745,9 @@ unsigned long avenrun[3];
  */
 static inline void calc_load(unsigned long ticks)
 {
-	unsigned long active_tasks; /* fixed-point */
+	unsigned long active_tasks, running_tasks; /* fixed-point */
 	static int count = LOAD_FREQ;
+	int cpu;

 	count -= ticks;
 	if (count < 0) {
@@ -753,6 +756,19 @@ static inline void calc_load(unsigned lo
 		CALC_LOAD(avenrun[0], EXP_1, active_tasks);
 		CALC_LOAD(avenrun[1], EXP_5, active_tasks);
 		CALC_LOAD(avenrun[2], EXP_15, active_tasks);
+		running_tasks = nr_running() * FIXED_1;
+		CALC_LOAD(tasks_running[0], EXP_1, running_tasks);
+		CALC_LOAD(tasks_running[1], EXP_5, running_tasks);
+		CALC_LOAD(tasks_running[2], EXP_15, running_tasks);
+		for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+			if (!cpu_online(cpu))
+				continue;
+			running_tasks = nr_running_cpu(cpu) * FIXED_1;
+			CALC_LOAD(cpu_tasks_running[0][cpu], EXP_1, running_tasks);
+			CALC_LOAD(cpu_tasks_running[1][cpu], EXP_5, running_tasks);
+			CALC_LOAD(cpu_tasks_running[2][cpu], EXP_15, running_tasks);
+		}
+
 	}
 }
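CALC_LOAD is the kernel's usual fixed-point exponential decay, here applied to
the new tasks_running[] and cpu_tasks_running[] arrays in addition to avenrun[].
A standalone demonstration of the arithmetic (FSHIFT, FIXED_1 and EXP_1 copied
from <linux/sched.h>; the constant 3-task input is invented):

#include <stdio.h>

#define FSHIFT	11			/* bits of fixed-point fraction */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1	1884			/* 1/exp(5sec/1min), fixed point */

#define CALC_LOAD(load, exp, n) \
	load *= exp; \
	load += n*(FIXED_1 - exp); \
	load >>= FSHIFT;

int main(void)
{
	unsigned long avg = 0;
	int i;

	/* five LOAD_FREQ intervals with a constant 3 runnable tasks:
	 * the average decays toward 3.00, starting at 0.24 */
	for (i = 0; i < 5; i++) {
		CALC_LOAD(avg, EXP_1, 3 * FIXED_1);
		printf("load: %lu.%02lu\n", avg >> FSHIFT,
		       ((avg & (FIXED_1 - 1)) * 100) >> FSHIFT);
	}
	return 0;
}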
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/lib/Kconfig 900-mjb3/lib/Kconfig
--- 000-bk3/lib/Kconfig	Sun Nov 17 20:29:47 2002
+++ 900-mjb3/lib/Kconfig	Sat Mar 8 22:05:11 2003
@@ -17,13 +17,13 @@ config CRC32
 #
 config ZLIB_INFLATE
 	tristate
-	default y if CRAMFS=y || PPP_DEFLATE=y || JFFS2_FS=y || ZISOFS_FS=y || BINFMT_ZFLAT=y
-	default m if CRAMFS=m || PPP_DEFLATE=m || JFFS2_FS=m || ZISOFS_FS=m || BINFMT_ZFLAT=m
+	default y if CRAMFS=y || PPP_DEFLATE=y || JFFS2_FS=y || ZISOFS_FS=y || BINFMT_ZFLAT=y || CRASH_DUMP_COMPRESS_GZIP=y
+	default m if CRAMFS=m || PPP_DEFLATE=m || JFFS2_FS=m || ZISOFS_FS=m || BINFMT_ZFLAT=m || CRASH_DUMP_COMPRESS_GZIP=m

 config ZLIB_DEFLATE
 	tristate
-	default m if PPP_DEFLATE!=y && JFFS2_FS!=y && (PPP_DEFLATE=m || JFFS2_FS=m)
-	default y if PPP_DEFLATE=y || JFFS2_FS=y
+	default m if PPP_DEFLATE!=y && JFFS2_FS!=y && (PPP_DEFLATE=m || JFFS2_FS=m) || CRASH_DUMP_COMPRESS_GZIP=m
+	default y if PPP_DEFLATE=y || JFFS2_FS=y || CRASH_DUMP_COMPRESS_GZIP=y

 endmenu
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/mm/memory.c 900-mjb3/mm/memory.c
--- 000-bk3/mm/memory.c	Wed Mar 5 07:37:08 2003
+++ 900-mjb3/mm/memory.c	Sat Mar 8 22:04:41 2003
@@ -101,8 +101,7 @@ static inline void free_one_pmd(struct m
 static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir)
 {
-	int j;
-	pmd_t * pmd;
+	pmd_t * pmd, * md, * emd;

 	if (pgd_none(*dir))
 		return;
@@ -113,8 +112,21 @@ static inline void free_one_pgd(struct m
 	}
 	pmd = pmd_offset(dir, 0);
 	pgd_clear(dir);
-	for (j = 0; j < PTRS_PER_PMD ; j++)
-		free_one_pmd(tlb, pmd+j);
+	/*
+	 * Beware if changing the loop below.  It once used int j,
+	 *	for (j = 0; j < PTRS_PER_PMD; j++)
+	 *		free_one_pmd(pmd+j);
+	 * but some older i386 compilers (e.g. egcs-2.91.66, gcc-2.95.3)
+	 * terminated the loop with a _signed_ address comparison
+	 * using "jle", when configured for HIGHMEM64GB (X86_PAE).
+	 * If also configured for 3GB of kernel virtual address space,
+	 * if page at physical 0x3ffff000 virtual 0x7ffff000 is used as
+	 * a pmd, when that mm exits the loop goes on to free "entries"
+	 * found at 0x80000000 onwards.  The loop below compiles instead
+	 * to be terminated by unsigned address comparison using "jb".
+	 */
+	for (md = pmd, emd = pmd + PTRS_PER_PMD; md < emd; md++)
+		free_one_pmd(tlb,md);
 	pmd_free_tlb(tlb, pmd);
 }
@@ -286,9 +298,16 @@ skip_copy_pte_range:
 				goto cont_copy_pte_range_noset;
 			}
 			pfn = pte_pfn(pte);
+			/* the pte points outside of valid memory, the
+			 * mapping is assumed to be good, meaningful
+			 * and not mapped via rmap - duplicate the
+			 * mapping as is.
+			 */
+			if (!pfn_valid(pfn)) {
+				set_pte(dst_pte, pte);
+				goto cont_copy_pte_range_noset;
+			}
 			page = pfn_to_page(pfn);
-			if (!pfn_valid(pfn))
-				goto cont_copy_pte_range;
 			if (PageReserved(page))
 				goto cont_copy_pte_range;
@@ -986,6 +1005,7 @@ static int do_wp_page(struct mm_struct *
 			++mm->rss;
 		page_remove_rmap(old_page, page_table);
 		break_cow(vma, new_page, address, page_table);
+		SetPageAnon(new_page);
 		pte_chain = page_add_rmap(new_page, page_table, pte_chain);
 		lru_cache_add_active(new_page);
@@ -1195,6 +1215,7 @@ static int do_swap_page(struct mm_struct
 	flush_page_to_ram(page);
 	flush_icache_page(vma, page);
 	set_pte(page_table, pte);
+	SetPageAnon(page);
 	pte_chain = page_add_rmap(page, page_table, pte_chain);

 	/* No need to invalidate - it was non-present before */
@@ -1261,6 +1282,7 @@ do_anonymous_page(struct mm_struct *mm,
 		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 		lru_cache_add_active(page);
 		mark_page_accessed(page);
+		SetPageAnon(page);
 	}

 	set_pte(page_table, entry);
@@ -1332,6 +1354,7 @@ do_no_page(struct mm_struct *mm, struct
 		copy_user_highpage(page, new_page, address);
 		page_cache_release(new_page);
 		lru_cache_add_active(page);
+		SetPageAnon(page);
 		new_page = page;
 	}
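The free_one_pgd fix relies only on the loop shape: bounding the walk with a
pointer comparison rather than an int index keeps the generated code on an
unsigned address compare.  A trivial standalone illustration of the same idiom
(plain ints; the addresses involved are meaningless here):

#include <stdio.h>

#define NENTRIES 4

int main(void)
{
	int table[NENTRIES], *md, *emd;

	/* bounded by pointer comparison: compiles to an unsigned
	 * compare ("jb"), so the walk cannot run past the end even if
	 * the array sits just below the sign bit of the address space */
	for (md = table, emd = table + NENTRIES; md < emd; md++)
		*md = 0;

	printf("%d entries cleared\n", NENTRIES);
	return 0;
}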
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/mm/mmap.c 900-mjb3/mm/mmap.c
--- 000-bk3/mm/mmap.c	Wed Mar 5 07:37:08 2003
+++ 900-mjb3/mm/mmap.c	Wed Mar 12 19:47:38 2003
@@ -1258,20 +1258,24 @@ int do_munmap(struct mm_struct *mm, unsi

 	/*
 	 * If we need to split any vma, do it now to save pain later.
+	 *
+	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
+	 * unmapped vm_area_struct will remain in use: so lower split_vma
+	 * places tmp vma above, and higher split_vma places tmp vma below.
 	 */
 	if (start > mpnt->vm_start) {
 		if (split_vma(mm, mpnt, start, 0))
 			return -ENOMEM;
 		prev = mpnt;
-		mpnt = mpnt->vm_next;
 	}

 	/* Does it split the last one? */
 	last = find_vma(mm, end);
 	if (last && end > last->vm_start) {
-		if (split_vma(mm, last, end, 0))
+		if (split_vma(mm, last, end, 1))
 			return -ENOMEM;
 	}
+	mpnt = prev? prev->vm_next: mm->mmap;

 	/*
 	 * Remove the vma's, and unmap the actual pages
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/mm/page_alloc.c 900-mjb3/mm/page_alloc.c
--- 000-bk3/mm/page_alloc.c	Thu Feb 13 11:08:15 2003
+++ 900-mjb3/mm/page_alloc.c	Sat Mar 8 22:05:11 2003
@@ -85,7 +85,8 @@ static void bad_page(const char *functio
 	page->mapping = NULL;
 }

-#ifndef CONFIG_HUGETLB_PAGE
+#if !defined(CONFIG_HUGETLB_PAGE) && !defined(CONFIG_CRASH_DUMP) \
+	&& !defined(CONFIG_CRASH_DUMP_MODULE)
 #define prep_compound_page(page, order) do { } while (0)
 #define destroy_compound_page(page, order) do { } while (0)
 #else
@@ -220,6 +221,8 @@ static inline void free_pages_check(cons
 		bad_page(function, page);
 	if (PageDirty(page))
 		ClearPageDirty(page);
+	if (PageAnon(page))
+		ClearPageAnon(page);
 }

 /*
@@ -265,6 +268,7 @@ void __free_pages_ok(struct page *page,
 	mod_page_state(pgfree, 1 << order);

 	free_pages_check(__FUNCTION__, page);
+	list_add(&page->list, &list);
 	free_pages_bulk(page_zone(page), 1, &list, order);
 }
@@ -447,6 +451,7 @@ static void free_hot_cold_page(struct pa
 	inc_page_state(pgfree);

 	free_pages_check(__FUNCTION__, page);
+	pcp = &zone->pageset[get_cpu()].pcp[cold];
 	local_irq_save(flags);
 	if (pcp->count >= pcp->high)
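The object-based rmap code below never walks a pte_chain for file-backed
pages: it recomputes the mapped address in each vma from page->index and
vm_pgoff, then looks that address up in the page tables.  The arithmetic,
pulled out into a standalone example (all values invented):

#include <stdio.h>

#define PAGE_SHIFT		12
#define PAGE_CACHE_SHIFT	12

int main(void)
{
	unsigned long vm_start = 0x40000000;	/* vma base address */
	unsigned long vm_pgoff = 0x10;		/* vma starts at file page 16 */
	unsigned long index    = 0x18;		/* page 24 of the file */
	unsigned long loffset, address;

	loffset = index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	address = vm_start + ((loffset - vm_pgoff) << PAGE_SHIFT);
	printf("page index %#lx maps at %#lx\n", index, address);
	/* prints 0x40008000: eight pages past vm_start */
	return 0;
}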
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/mm/rmap.c 900-mjb3/mm/rmap.c
--- 000-bk3/mm/rmap.c	Thu Jan 9 19:16:15 2003
+++ 900-mjb3/mm/rmap.c	Sat Mar 8 22:04:43 2003
@@ -86,6 +86,89 @@ kmem_cache_t *pte_chain_cache;
  * If the page has a single-entry pte_chain, collapse that back to a PageDirect
  * representation.  This way, it's only done under memory pressure.
  */
+static inline int
+page_referenced_obj_one(struct vm_area_struct *vma, struct page *page)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte;
+	unsigned long loffset;
+	unsigned long address;
+	int referenced = 0;
+
+	loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
+	if (loffset < vma->vm_pgoff)
+		goto out;
+
+	address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT);
+
+	if (address >= vma->vm_end)
+		goto out;
+
+	if (!spin_trylock(&mm->page_table_lock)) {
+		referenced = 1;
+		goto out;
+	}
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd)) {
+		goto out_unlock;
+	}
+	pmd = pmd_offset(pgd, address);
+	if (!pmd_present(*pmd)) {
+		goto out_unlock;
+	}
+	pte = pte_offset_map(pmd, address);
+	if (!pte_present(*pte)) {
+		goto out_unmap;
+	}
+	if (page_to_pfn(page) != pte_pfn(*pte)) {
+		goto out_unmap;
+	}
+	if (ptep_test_and_clear_young(pte))
+		referenced++;
+out_unmap:
+	pte_unmap(pte);
+
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
+
+out:
+	return referenced;
+}
+
+static int
+page_referenced_obj(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct vm_area_struct *vma;
+	int referenced = 0;
+
+	if (!page->pte.mapcount)
+		return 0;
+
+	if (!mapping)
+		BUG();
+
+	if (PageSwapCache(page))
+		BUG();
+
+	if (down_trylock(&mapping->i_shared_sem))
+		return 1;
+
+	list_for_each_entry(vma, &mapping->i_mmap, shared) {
+		referenced += page_referenced_obj_one(vma, page);
+	}
+
+	list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+		referenced += page_referenced_obj_one(vma, page);
+	}
+
+	up(&mapping->i_shared_sem);
+
+	return referenced;
+}
+
 int page_referenced(struct page * page)
 {
 	struct pte_chain * pc;
@@ -94,6 +177,10 @@ int page_referenced(struct page * page)
 	if (TestClearPageReferenced(page))
 		referenced++;

+	if (!PageAnon(page)) {
+		referenced += page_referenced_obj(page);
+		goto out;
+	}
 	if (PageDirect(page)) {
 		pte_t *pte = rmap_ptep_map(page->pte.direct);

 		if (ptep_test_and_clear_young(pte))
@@ -127,6 +214,7 @@ int page_referenced(struct page * page)
 			__pte_chain_free(pc);
 		}
 	}
+out:
 	return referenced;
 }
@@ -159,6 +247,18 @@ page_add_rmap(struct page *page, pte_t *

 	pte_chain_lock(page);

+	if (!PageAnon(page)) {
+		if (!page->mapping)
+			BUG();
+		if (PageSwapCache(page))
+			BUG();
+		if (!page->pte.mapcount)
+			inc_page_state(nr_mapped);
+		page->pte.mapcount++;
+		pte_chain_unlock(page);
+		return pte_chain;
+	}
+
 #ifdef DEBUG_RMAP
 	/*
 	 * This stuff needs help to get up to highmem speed.
@@ -247,6 +347,20 @@ void page_remove_rmap(struct page * page

 	pte_chain_lock(page);

+	if (!PageAnon(page)) {
+		if (!page->mapping)
+			BUG();
+		if (PageSwapCache(page))
+			BUG();
+		if (!page->pte.mapcount)
+			BUG();
+		page->pte.mapcount--;
+		if (!page->pte.mapcount)
+			dec_page_state(nr_mapped);
+		pte_chain_unlock(page);
+		return;
+	}
+
 	if (PageDirect(page)) {
 		if (page->pte.direct == pte_paddr) {
 			page->pte.direct = 0;
@@ -310,6 +424,112 @@ out:
 	return;
 }

+static inline int
+try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte;
+	pte_t pteval;
+	unsigned long loffset;
+	unsigned long address;
+	int ret = SWAP_SUCCESS;
+
+	loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
+	if (loffset < vma->vm_pgoff)
+		goto out;
+
+	address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT);
+
+	if (address >= vma->vm_end)
+		goto out;
+
+	if (!spin_trylock(&mm->page_table_lock)) {
+		ret = SWAP_AGAIN;
+		goto out;
+	}
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd)) {
+		goto out_unlock;
+	}
+	pmd = pmd_offset(pgd, address);
+	if (!pmd_present(*pmd)) {
+		goto out_unlock;
+	}
+	pte = pte_offset_map(pmd, address);
+	if (!pte_present(*pte)) {
+		goto out_unmap;
+	}
+	if (page_to_pfn(page) != pte_pfn(*pte)) {
+		goto out_unmap;
+	}
+
+	if (vma->vm_flags & VM_LOCKED) {
+		ret = SWAP_FAIL;
+		goto out_unmap;
+	}
+
+	flush_cache_page(vma, address);
+	pteval = ptep_get_and_clear(pte);
+	flush_tlb_page(vma, address);
+
+	if (pte_dirty(pteval))
+		set_page_dirty(page);
+
+	if (!page->pte.mapcount)
+		BUG();
+
+	mm->rss--;
+	page->pte.mapcount--;
+	page_cache_release(page);
+
+out_unmap:
+	pte_unmap(pte);
+
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
+
+out:
+	return ret;
+}
+
+static int
+try_to_unmap_obj(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct vm_area_struct *vma;
+	int ret = SWAP_SUCCESS;
+
+	if (!mapping)
+		BUG();
+
+	if (PageSwapCache(page))
+		BUG();
+
+	if (down_trylock(&mapping->i_shared_sem))
+		return SWAP_AGAIN;
+
+	list_for_each_entry(vma, &mapping->i_mmap, shared) {
+		ret = try_to_unmap_obj_one(vma, page);
+		if (ret != SWAP_SUCCESS)
+			goto out;
+	}
+
+	list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+		ret = try_to_unmap_obj_one(vma, page);
+		if (ret != SWAP_SUCCESS)
+			goto out;
+	}
+
+	if (page->pte.mapcount)
+		BUG();
+
+out:
+	up(&mapping->i_shared_sem);
+	return ret;
+}
+
 /**
  * try_to_unmap_one - worker function for try_to_unmap
  * @page: page to unmap
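For file-backed pages the pte_chain is gone entirely: page->pte.mapcount
carries the mapping count, and nr_mapped only moves on the 0-to-1 and 1-to-0
transitions.  A toy userspace model of that bookkeeping (the names mirror the
kernel, but nothing below is kernel code):

#include <assert.h>
#include <stdio.h>

static unsigned long nr_mapped;

struct page { unsigned long mapcount; };

static void page_add_file_rmap(struct page *page)
{
	if (!page->mapcount)
		nr_mapped++;		/* 0 -> 1: page becomes "mapped" */
	page->mapcount++;
}

static void page_remove_file_rmap(struct page *page)
{
	assert(page->mapcount);		/* the kernel BUG()s here */
	page->mapcount--;
	if (!page->mapcount)
		nr_mapped--;		/* last mapping gone */
}

int main(void)
{
	struct page p = { 0 };

	page_add_file_rmap(&p);		/* first process maps the page */
	page_add_file_rmap(&p);		/* a second process maps it too */
	page_remove_file_rmap(&p);
	page_remove_file_rmap(&p);
	printf("nr_mapped back to %lu\n", nr_mapped);
	return 0;
}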
@@ -413,6 +633,11 @@ int try_to_unmap(struct page * page)
 	/* We need backing store to swap out a page. */
 	if (!page->mapping)
 		BUG();
+
+	if (!PageAnon(page)) {
+		ret = try_to_unmap_obj(page);
+		goto out;
+	}

 	if (PageDirect(page)) {
 		ret = try_to_unmap_one(page, page->pte.direct);
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/mm/slab.c 900-mjb3/mm/slab.c
--- 000-bk3/mm/slab.c	Wed Mar 5 07:37:08 2003
+++ 900-mjb3/mm/slab.c	Wed Mar 12 19:47:38 2003
@@ -1960,11 +1960,16 @@ kfree_percpu(const void *objp)

 unsigned int kmem_cache_size(kmem_cache_t *cachep)
 {
+	unsigned int objlen = cachep->objsize;
+
 #if DEBUG
 	if (cachep->flags & SLAB_RED_ZONE)
-		return (cachep->objsize - 2*BYTES_PER_WORD);
+		objlen -= 2*BYTES_PER_WORD;
+	if (cachep->flags & SLAB_STORE_USER)
+		objlen -= BYTES_PER_WORD;
 #endif
-	return cachep->objsize;
+
+	return objlen;
 }

 kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags)
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/mm/swapfile.c 900-mjb3/mm/swapfile.c
--- 000-bk3/mm/swapfile.c	Thu Jan 9 19:16:15 2003
+++ 900-mjb3/mm/swapfile.c	Sat Mar 8 22:04:41 2003
@@ -390,6 +390,7 @@ unuse_pte(struct vm_area_struct *vma, un
 		return;
 	get_page(page);
 	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
+	SetPageAnon(page);
 	*pte_chainp = page_add_rmap(page, dir, *pte_chainp);
 	swap_free(entry);
 	++vma->vm_mm->rss;
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/net/core/dev.c 900-mjb3/net/core/dev.c
--- 000-bk3/net/core/dev.c	Wed Mar 5 07:37:08 2003
+++ 900-mjb3/net/core/dev.c	Sat Mar 8 22:05:11 2003
@@ -1250,8 +1250,6 @@ int netif_rx(struct sk_buff *skb)
 	struct softnet_data *queue;
 	unsigned long flags;

-	if (!skb->stamp.tv_sec)
-		do_gettimeofday(&skb->stamp);

 	/*
 	 * The code is rearranged so that the path is the most
@@ -1261,6 +1259,13 @@ int netif_rx(struct sk_buff *skb)
 	this_cpu = smp_processor_id();
 	queue = &softnet_data[this_cpu];

+	if (skb->dev->rx_hook)
+		goto rx_hook;
+rx_hook_continue:
+
+	if (!skb->stamp.tv_sec)
+		do_gettimeofday(&skb->stamp);
+
 	netdev_rx_stat[this_cpu].total++;
 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
 		if (queue->input_pkt_queue.qlen) {
@@ -1303,6 +1308,15 @@ drop:
 	kfree_skb(skb);
 	return NET_RX_DROP;
+
+rx_hook:
+	{
+		int ret;
+
+		ret = skb->dev->rx_hook(skb);
+		if (ret == NET_RX_DROP)
+			goto drop;
+		goto rx_hook_continue;
+	}
 }
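The control flow of the new hook: if dev->rx_hook returns NET_RX_DROP the skb
is freed and counted as a drop, anything else falls through to the normal
backlog path.  A userspace model of that dispatch (my_rx_hook and
netif_rx_model are invented names; the rx_hook member itself is added to
struct net_device elsewhere in this patch series):

#include <stdio.h>

#define NET_RX_SUCCESS	0
#define NET_RX_DROP	1

struct sk_buff { int len; };
struct net_device { int (*rx_hook)(struct sk_buff *); };

/* hypothetical hook: claim runt frames, pass everything else on */
static int my_rx_hook(struct sk_buff *skb)
{
	return skb->len < 14 ? NET_RX_DROP : NET_RX_SUCCESS;
}

static int netif_rx_model(struct net_device *dev, struct sk_buff *skb)
{
	if (dev->rx_hook && dev->rx_hook(skb) == NET_RX_DROP) {
		printf("skb consumed by hook, dropped\n");
		return NET_RX_DROP;
	}
	printf("skb queued for normal input processing\n");
	return NET_RX_SUCCESS;
}

int main(void)
{
	struct net_device dev = { my_rx_hook };
	struct sk_buff runt = { 10 }, frame = { 1500 };

	netif_rx_model(&dev, &runt);
	netif_rx_model(&dev, &frame);
	return 0;
}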
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/scripts/schedcapture 900-mjb3/scripts/schedcapture
--- 000-bk3/scripts/schedcapture	Wed Dec 31 16:00:00 1969
+++ 900-mjb3/scripts/schedcapture	Sat Mar 8 22:04:02 2003
@@ -0,0 +1,6 @@
+while true
+do
+	cat /proc/schedstat
+	echo
+	sleep 20
+done
diff -urpN -X /home/fletch/.diff.exclude 000-bk3/scripts/schedstat 900-mjb3/scripts/schedstat
--- 000-bk3/scripts/schedstat	Wed Dec 31 16:00:00 1969
+++ 900-mjb3/scripts/schedstat	Sat Mar 8 22:04:02 2003
@@ -0,0 +1,168 @@
+#!/usr/bin/perl
+
+$slice = 20;	# seconds
+while (<>) {
+	@curr = split;
+	if ($curr[0] =~ /cpu(\d)/) {
+		$per_cpu_curr[$1] = [ @curr ];
+		$max_cpu = $1 if ($1 > $max_cpu);
+		next;
+	}
+	next if (/^$/);
+	if ($curr[0] eq "version") {
+		if ($curr[1] != 2) {
+			die "Version mismatch. Update this tool.\n";
+		}
+		next;
+	}
+	#
+	# format of line in /proc/schedstat
+	#
+	# tag 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
+	#
+	# tag is "cpuN" or "cpu".  The "cpuN" lines are used only for the
+	# per-cpu pull_task() breakdown below; all other statistics are
+	# computed from the aggregate "cpu" line.
+	#
+	# version == 2
+	# NOTE: the active queue is considered empty if it has only one process
+	# in it, since obviously the process calling sched_yield is that process.
+	#
+	# First four are sched_yield statistics:
+	#     1) # of times both the active and the expired queue were empty
+	#     2) # of times just the active queue was empty
+	#     3) # of times just the expired queue was empty
+	#     4) # of times sched_yield() was called
+	#
+	# Next three are schedule() statistics:
+	#     5) # of times the active queue had at least one other process on it
+	#     6) # of times we switched to the expired queue and reused it
+	#     7) # of times schedule() was called
+	#
+	# Next seven are statistics dealing with load balancing:
+	#     8) # of times load_balance was called at an idle tick
+	#     9) # of times load_balance was called at a busy tick
+	#    10) # of times load_balance was called from schedule()
+	#    11) # of times load_balance was called
+	#    12) sum of imbalances discovered (if any) with each call to
+	#        load_balance
+	#    13) # of times load_balance was called when we did not find a
+	#        "busiest" queue
+	#    14) # of times load_balance was called from balance_node()
+	#
+	# Next four are statistics dealing with pull_task():
+	#    15) # of times pull_task was called at an idle tick
+	#    16) # of times pull_task was called at a busy tick
+	#    17) # of times pull_task was called from schedule()
+	#    18) # of times pull_task was called
+	#
+	# Next two are statistics dealing with balance_node():
+	#    19) # of times balance_node was called
+	#    20) # of times balance_node was called at an idle tick
+	#
+	#$curr[7] = $sched_cnt;
+	foreach $i (1..20) {
+		$diff[$i] = $curr[$i] - $prev[$i];
+	}
+
+	for ($cpu = 0; $cpu <= $max_cpu; $cpu++) {
+		@arr_curr = @{$per_cpu_curr[$cpu]};
+		@arr_prev = @{$per_cpu_prev[$cpu]};
+		foreach $i (1..20) {
+			$arr_diff[$i] = $arr_curr[$i] - $arr_prev[$i];
+		}
+		$per_cpu_diff[$cpu] = [ @arr_diff ];
+	}
+
+#	for ($cpu = 0; $cpu <= $max_cpu; $cpu++) {
+#		print "@{$per_cpu_curr[$cpu]}\n";
+#	}
+#	print "@curr\n";
+	printf "%02d:%02d:%02d--------------------------------------------------------------\n",
+		$tick*$slice/3600, ($tick*$slice/60)%60, ($tick*$slice)%60;

+	#
+	# sched_yield() stats
+	#
+	printf "    %7d sys_sched_yield()\n", $diff[4];
+	printf "    %7d(%6.2f%%) found (only) active queue empty on current cpu\n",
+		$diff[2]-$diff[1], $diff[4] ? (100*($diff[2]-$diff[1])/$diff[4]) : 0;
+	printf "    %7d(%6.2f%%) found (only) expired queue empty on current cpu\n",
+		$diff[3], $diff[4] ? (100*$diff[3]/$diff[4]) : 0;
+	printf "    %7d(%6.2f%%) found both queues empty on current cpu\n",
+		$diff[1], $diff[4] ? (100*$diff[1]/$diff[4]) : 0;
+	printf "    %7d(%6.2f%%) found neither queue empty on current cpu\n\n",
+		$diff[4]-($diff[3]+$diff[2]),
+		$diff[4] ? 100*($diff[4]-($diff[3]+$diff[2]))/$diff[4] : 0;
+
+	#
+	# schedule() stats
+	#
+	printf "    %7d schedule()\n", $diff[7];
+	printf "    %7d(%6.2f%%) switched active and expired queues\n",
+		$diff[6], $diff[7] ? (100*$diff[6]/$diff[7]) : 0;
+	printf "    %7d(%6.2f%%) used existing active queue\n\n",
+		$diff[5]-$diff[6], $diff[7] ? (100*($diff[5]-$diff[6])/$diff[7]) : 0;
+	#
+	# load_balance() stats
+	#
+	printf "    %7d load_balance()\n", $diff[11];
+	printf "    %7d(%6.2f%%) called while idle\n", $diff[8],
+		$diff[11] ? 100*$diff[8]/$diff[11] : 0;
+	printf "    %7d(%6.2f%%) called while busy\n", $diff[9],
+		$diff[11] ? 100*$diff[9]/$diff[11] : 0;
+	printf "    %7d(%6.2f%%) called from schedule()\n", $diff[10],
+		$diff[11] ? 100*$diff[10]/$diff[11] : 0;
+	printf "    %7d(%6.2f%%) called from balance_node()\n", $diff[14],
+		$diff[11] ? 100*$diff[14]/$diff[11] : 0;
+	printf "    %7d no \"busiest\" queue found\n", $diff[13];
+	if ($diff[11]-$diff[13]) {
+		$imbalance = $diff[12] / ($diff[11]-$diff[13]);
+		if ($imbalance < 10) {
+			printf "    %7.3f average imbalance (over %d)\n",
+				$imbalance, $diff[11]-$diff[13];
+		} elsif ($imbalance < 100) {
+			printf "    %8.2f average imbalance (over %d)\n",
+				$imbalance, $diff[11]-$diff[13];
+		} else {
+			printf "    %9.1f average imbalance (over %d)\n",
+				$imbalance, $diff[11]-$diff[13];
+		}
+	} else {
+		printf "            no imbalances\n";
+	}
+
+	#
+	# pull_task() stats
+	#
+	print "\n";
+	printf "    %7d pull_task()\n", $diff[15];
+	for ($cpu = 0; $cpu <= $max_cpu; $cpu++) {
+		@arr = @{$per_cpu_diff[$cpu]};
+		if ($arr[15] || $arr[16]) {
+			printf "    %7d/%-7d cpu %d lost/gained task to/from another cpu\n",
+				$arr[15], $arr[16], $cpu;
+		}
+		if ($arr[17] || $arr[18]) {
+			printf "    %7d/%-7d cpu %d lost/gained task to/from another node\n",
+				$arr[17], $arr[18], $cpu;
+		}
+	}
+	print "\n";
+
+	#
+	# balance_node() stats
+	#
+	printf "    %7d balance_node()\n", $diff[19];
+	printf "    %7d(%6.2f%%) called while idle\n", $diff[20],
+		$diff[19] ? 100*$diff[20]/$diff[19] : 0;
+	printf "    %7d(%6.2f%%) called while busy\n", $diff[19] - $diff[20],
+		$diff[19] ? 100*($diff[19]-$diff[20])/$diff[19] : 0;
+
+	printf("\n");
+	@prev = @curr;
+	@per_cpu_prev = @per_cpu_curr;
+	$tick++;
+}
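Note that the 20-second sleep in scripts/schedcapture matches the $slice = 20
setting at the top of scripts/schedstat, so the intended use is presumably to
pipe the capture loop straight into the parser, or to replay a previously
captured log through it later.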