diff -urpN -X /home/fletch/.diff.exclude 001-bk10/Documentation/filesystems/proc.txt 900-mjb5/Documentation/filesystems/proc.txt --- 001-bk10/Documentation/filesystems/proc.txt Sun Mar 16 13:38:20 2003 +++ 900-mjb5/Documentation/filesystems/proc.txt Sun Mar 16 13:38:53 2003 @@ -37,6 +37,7 @@ Table of Contents 2.8 /proc/sys/net/ipv4 - IPV4 settings 2.9 Appletalk 2.10 IPX + 2.11 /proc/sys/sched - scheduler tunables ------------------------------------------------------------------------------ Preface @@ -1662,6 +1663,104 @@ IPX. The /proc/net/ipx_route table holds a list of IPX routes. For each route it gives the destination network, the router node (or Directly) and the network address of the router (or Connected) for internal networks. + +2.11 /proc/sys/sched - scheduler tunables +----------------------------------------- + +Useful knobs for tuning the scheduler live in /proc/sys/sched. + +child_penalty +------------- + +Percentage of the parent's sleep_avg that children inherit. sleep_avg is +a running average of the time a process spends sleeping. Tasks with high +sleep_avg values are considered interactive and given a higher dynamic +priority and a larger timeslice. You typically want this set to some value just +under 100. + +exit_weight +----------- + +When a CPU hog task exits, its parent's sleep_avg is reduced by a factor of +exit_weight against the exiting task's sleep_avg. + +interactive_delta +----------------- + +If a task is "interactive" it is reinserted into the active array after it +has expired its timeslice, instead of being inserted into the expired array. +How "interactive" a task must be in order to receive this treatment is a +function of its nice value. This interactive limit is scaled linearly by nice +value and is offset by the interactive_delta. + +max_sleep_avg +------------- + +max_sleep_avg is the largest value (in ms) stored for a task's running sleep +average. The larger this value, the longer a task needs to sleep to be +considered interactive (maximum interactive bonus is a function of +max_sleep_avg). + +max_timeslice +------------- + +Maximum timeslice, in milliseconds. This is the value given to tasks of the +highest dynamic priority. + +min_timeslice +------------- + +Minimum timeslice, in milliseconds. This is the value given to tasks of the +lowest dynamic priority. Every task gets at least this slice of the processor +per array switch. + +parent_penalty +-------------- + +Percentage of the parent's sleep_avg that it retains across a fork(). +sleep_avg is a running average of the time a process spends sleeping. Tasks +with high sleep_avg values are considered interactive and given a higher +dynamic priority and a larger timeslice. Normally, this value is 100 and thus +tasks retain their sleep_avg on fork. If you want to punish interactive +tasks for forking, set this below 100. + +prio_bonus_ratio +---------------- + +Middle percentage of the priority range that tasks can receive as a dynamic +priority. The default value of 25% ensures that nice values at the +extremes are still enforced. For example, nice +19 interactive tasks will +never be able to preempt a nice 0 CPU hog. Setting this higher will increase +the size of the priority range the tasks can receive as a bonus. Setting +this lower will decrease this range, making the interactivity bonus less +apparent and user nice values more applicable.
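All of these knobs are plain sysctl files, so they can be inspected and adjusted at run time with ordinary shell commands. A minimal sketch (the values are purely illustrative, and the files exist only with this patch applied):

    cat /proc/sys/sched/min_timeslice
    echo 300 > /proc/sys/sched/max_timeslice
    echo 95 > /proc/sys/sched/child_penalty

Writes take effect immediately; no reboot or reload is needed for the new values to be used.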
+ +starvation_limit +---------------- + +Sufficiently interactive tasks are reinserted into the active array when they +run out of timeslice. Normally, tasks are inserted into the expired array. +Reinserting interactive tasks into the active array allows them to remain +runnable, which is important to interactive performance. This could starve +expired tasks, however, since the interactive task could prevent the array +switch. To keep the tasks on the expired array from starving for too long, +starvation_limit is the longest time (in ms) we will let the expired array starve +at the expense of reinserting interactive tasks back into the active array. Higher +values here give more preference to running interactive tasks, at the expense +of expired tasks. Lower values provide more fair scheduling behavior, at the +expense of interactivity. + +idle_node_rebalance_ratio +------------------------- + +On NUMA machines, we normally rebalance within nodes, but we also rebalance +globally every N idle rebalance ticks, where N = idle_node_rebalance_ratio. + +busy_node_rebalance_ratio +------------------------- + +On NUMA machines, we normally rebalance within nodes, but we also rebalance +globally every N busy rebalance ticks, where N = busy_node_rebalance_ratio. ------------------------------------------------------------------------------ Summary diff -urpN -X /home/fletch/.diff.exclude 001-bk10/Documentation/i386/gdb-serial.txt 900-mjb5/Documentation/i386/gdb-serial.txt --- 001-bk10/Documentation/i386/gdb-serial.txt Wed Dec 31 16:00:00 1969 +++ 900-mjb5/Documentation/i386/gdb-serial.txt Sun Mar 16 13:38:57 2003 @@ -0,0 +1,386 @@ +Version +======= + +This version of the gdbstub package was developed and tested on +kernel version 2.3.48. It will not install on a 2.2 kernel. It may +not work on earlier versions of 2.3 kernels. It is possible that +it will continue to work on later versions of 2.3 and then +versions of 2.4 (I hope). + + +Debugging Setup +=============== + +Designate one machine as the "development" machine. This is the +machine on which you run your compiles and which has your source +code for the kernel. Designate a second machine as the "target" +machine. This is the machine that will run your experimental +kernel. + +The two machines will be connected together via a serial line out +one or the other of the COM ports of the PC. You will need a modem +eliminator and the appropriate cables. + +On the DEVELOPMENT machine you need to apply the patch for the gdb +hooks. You have probably already done that if you are reading this +file. + +On your DEVELOPMENT machine, go to your kernel source directory and +do "make menuconfig". Go down to the kernel hacking menu item and +open it up. Enable the kernel gdb stub code by selecting that item. + +Save and exit the menuconfig program. Then do "make clean" and +"make bzImage" (or whatever target you want to make). This gets +the kernel compiled with the "-g" option set -- necessary for +debugging. + +You have just built the kernel on your DEVELOPMENT machine that you +intend to run on your TARGET machine. + +To install this new kernel, use the following installation procedure. +Remember, you are on the DEVELOPMENT machine patching the kernel source +for the kernel that you intend to run on the TARGET machine. + +Copy this kernel to your target machine using your usual procedures. +I usually arrange to copy development:/usr/src/linux/arch/i386/boot/zImage +to /vmlinuz on the TARGET machine via a LAN based NFS access. That is, +I run the cp command on the target and copy from the development machine +via the LAN.
Run Lilo on the new kernel on the target machine so that it +will boot! Then boot the kernel on the target machine. + +There is a utility program named "gdbstart" in the +development:/usr/src/linux/arch/i386/kernel directory. +You should copy this program over to your target machine, probably into +/sbin. This utility program is run on the target machine to +activate the kernel hooks for the debugger. It is invoked as follows: + + gdbstart [-s speed] [-t tty-dev] + defaults: /dev/ttyS0 with speed unmodified by gdbstart + +Don't run the program just yet. We'll get to that in a bit. + +Decide which tty port you want the machines to communicate over, then +cable them up back-to-back using the null modem. COM1 is /dev/ttyS0 +and COM2 is /dev/ttyS1. + +On the DEVELOPMENT machine, create a file called .gdbinit in the +directory /usr/src/linux. An example .gdbinit file looks like this: + +define rmt +set remotebaud 38400 +target remote /dev/ttyS0 +end + +Assuming that you added my gdbinit stuff to your .gdbinit, edit .gdbinit +and find the section that looks like this: + + define rmt + set remotebaud 38400 + target remote /dev/ttyS0 + end + +Change the "target" definition so that it specifies the tty port that +you intend to use. Change the "remotebaud" definition to match the +data rate that you are going to use for the com line. + +On the TARGET machine I find it helpful to create a shell script file +named "debug" in the root home directory with the following contents: + + gdbstart -s 38400 -t /dev/ttyS0 <<EOF + EOF + +This runs the gdbstart program and gives it the carriage return that +it prompts for. This sets the data rate from the target machine's side. + +You are now ready to try it out. + +On your TARGET machine, freshly rebooted with your gdbstub-equipped +kernel, type "debug" in the root home directory. The system will appear +to hang with some messages on the screen from the debug stub. What +it is doing is waiting for contact from the development machine. + +On your DEVELOPMENT machine, cd /usr/src/linux and enter "gdb vmlinux". +When gdb gets the symbols loaded and prompts you, enter "rmt" (that's +the macro from the .gdbinit file that you just edited). If everything +is working correctly you should see gdb print out a few lines indicating +that a breakpoint has been taken. It will actually show a line of +code in the target kernel inside the gdbstub activation code. + +The gdb interaction should look something like this: + + linux-dev:/usr/src/linux# gdb vmlinux + GDB is free software and you are welcome to distribute copies of it + under certain conditions; type "show copying" to see the conditions. + There is absolutely no warranty for GDB; type "show warranty" for details. + GDB 4.15.1 (i486-slackware-linux), + Copyright 1995 Free Software Foundation, Inc... + (gdb) rmt + breakpoint () at i386-stub.c:750 + 750 } + (gdb) + + +You can now use whatever gdb commands you like to set breakpoints. +Enter "continue" to start your target machine executing again. At this +point the target system will run at full speed until it encounters +your breakpoint or gets a segment violation in the kernel, or whatever.
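As a concrete (and purely illustrative) example, a first session might set a breakpoint on a well-known kernel entry point and then let the target run:

    (gdb) break sys_sync
    (gdb) continue

When the target next executes sys_sync (for instance because someone runs "sync" on it), gdb on the development machine regains control, and the usual commands -- bt, print, step and so on -- behave much as they would on a local program.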
+Triggering gdbstub at Kernel Boot Time +====================================== + +The gdbstub patch now has the ability for gdb to connect to the kernel during +bootup (as opposed to waiting for the system to come all the way up and then +running the gdbstart program on the target machine). This new functionality was +added by Scott Foehner at SGI. + +To force a kernel that has been compiled with gdbstub to pause during the boot +process and wait for a connection from gdb, the parameter "gdb" should be passed +to the kernel. This can be done by typing "gdb" after the name of the kernel +on the LILO command line. The patch defaults to using ttyS1 at a baud rate of +38400. These parameters can be changed by using "gdbttyS=" and +"gdbbaud=" on the command line. + +Example: + +LILO boot: linux gdb gdbttyS=1 gdbbaud=38400 + +Note that this command is entered on the TARGET machine as it is booting +the kernel that was compiled on the DEVELOPMENT machine. + +An alternate approach is to place a line in the /etc/lilo.conf file on +your TARGET machine. Under the heading for the kernel that you intend +to boot, place a line that looks like this: + + append = "gdb gdbttyS=1 gdbbaud=38400" + +This will cause the kernel to enter the gdbstub automatically at boot +time. + +BE SURE to run "lilo" after changing the /etc/lilo.conf file. + + +The "gdbstart" Program +====================== + +This utility program is used to set up the com port and data rate +for the connection from the target system to the development system. +Its usage has been described above. + +This version of the patch uses the same tty ioctl for kernel versions +2.0.30 onwards. Thus, the gdbstart utility does not need to be re-compiled +to install the patch in a later version of the kernel. The ioctl added +to the kernel for this purpose is far enough "off the end" of existing +ioctls (as of 2.1.120) that it should not interfere with any new kernel +tty ioctls for quite some time (famous last words). + +The source for the gdbstart program resides in the arch/i386/kernel directory. + + +Debugging hints +=============== + +You can break into the target machine at any time from the development +machine by typing ^C. If the target machine has interrupts enabled +this will stop it in the kernel and enter the debugger. + +There is unfortunately no way of breaking into the kernel if it is +in a loop with interrupts disabled, so if this happens to you then +you need to place exploratory breakpoints or printk's into the kernel +to find out where it is looping. + +There is a copy of an e-mail in the kgdb distribution directory which +describes how to create an NMI on an ISA bus machine using a paper +clip. I have a sophisticated version of this made by wiring a push +button switch into a PC104/ISA bus adapter card. The adapter card +nicely furnishes wire wrap pins for all the ISA bus signals. + +When you are done debugging the kernel on the target machine it is +a good idea to leave it in a running state. This makes reboots +faster, bypassing the fsck. So do a gdb "continue" as the last gdb +command if this is possible. To terminate gdb itself on the development +machine and leave the target machine running, type ^Z to suspend gdb +and then kill it with "kill %1" or something similar. + +If gdbstub Does Not Work +======================== + +If it doesn't work, you will have to troubleshoot it. Do the easy things +first like double checking your cabling and data rates. You might +try some non-kernel based programs to see if the back-to-back connection +works properly. Just something simple like cat /etc/hosts >/dev/ttyS0 +on one machine and cat /dev/ttyS0 on the other will tell you if you +can send data from one machine to the other. There is no point in tearing +out your hair in the kernel if the line doesn't work.
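A minimal sketch of such a test, assuming both ends use their first serial port and you intend to run the link at 38400 baud (adjust the device and speed to match your setup):

    stty 38400 raw -echo < /dev/ttyS0     (run this on both machines first)
    cat /dev/ttyS0                        (on the receiving machine)
    cat /etc/hosts > /dev/ttyS0           (on the sending machine)

If the contents of /etc/hosts show up on the receiving side, the cable and data rate are fine and any remaining problem is on the kernel side of the setup.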
+ +All of the real action takes place in the file +/usr/src/linux/arch/i386/kernel/gdbstub.c. That is the code on the target +machine that interacts with gdb on the development machine. In gdb you can +turn on a debug switch with the following command: + + set remotedebug + +This will print out the protocol messages that gdb is exchanging with +the target machine. + +Another place to look is /usr/src/linux/drivers/char/gdbserial.c. +That is the code that talks to the serial port on the target side. +There might be a problem there. + +If you are really desperate you can use printk debugging in the +gdbstub code in the target kernel until you get it working. In particular, +there is a global variable in /usr/src/linux/arch/i386/kernel/gdbstub.c +named "remote_debug". Compile your kernel with this set to 1, rather +than 0, and the debug stub will print out lots of stuff as it does +what it does. + + +Debugging Loadable Modules +========================== + +This technique comes courtesy of Edouard Parmelan + + +When you run gdb, enter the command + +source gdbinit-modules + +This will read in a file of gdb macros that was installed in your +kernel source directory when kgdb was installed. This file implements +the following commands: + +mod-list + Lists the loaded modules along with their load addresses. + +mod-print-symbols + Prints all the symbols in the indicated module. + +mod-add-symbols + Loads the symbols from the object file and associates them + with the indicated module. + +After you have loaded the module that you want to debug, use the command +mod-list to find the address of your module. Then use that +address in the mod-add-symbols command to load your module's symbols. +From that point onward you can debug your module as if it were a part +of the kernel. + +The file gdbinit-modules also contains a command named mod-add-lis as +an example of how to construct a command of your own to load your +favorite module. The idea is to "can" the pathname of the module +in the command so you don't have to type so much. + +Threads +======= + +Each process in a target machine is seen as a gdb thread. gdb thread related +commands (info threads, thread n) can be used. + +ia-32 hardware breakpoints +========================== + +The gdb stub contains support for hardware breakpoints using the debugging features +of ia-32 (x86) processors. These breakpoints do not need code modification; +they use debugging registers. Four hardware breakpoints are available in ia-32 +processors. + +Each hardware breakpoint can be of one of the following three types. +1. Execution breakpoint - An execution breakpoint is triggered when code at the + breakpoint address is executed. + + As a limited number of hardware breakpoints are available, it is advisable + to use software breakpoints ( break command ) instead of execution + hardware breakpoints, unless modification of code is to be avoided. + +2. Write breakpoint - A write breakpoint is triggered when the memory location at the + breakpoint address is written. + + A write breakpoint can be placed for data of variable length. The length of a write + breakpoint indicates the length of the datatype to be watched. Length is 1 + for 1 byte data, 2 for 2 byte data, 3 for 4 byte data. + +3. Access breakpoint - An access breakpoint is triggered when the memory location at + the breakpoint address is either read or written. + + Access breakpoints also have lengths similar to write breakpoints. + +IO breakpoints in ia-32 are not supported.
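To make the length encoding concrete: watching writes to a 4-byte variable uses length 3, while watching a single byte uses length 1. Using the hwwbrk macro described in the next section (the addresses here are only examples):

    (gdb) hwwbrk 0 3 c015e9bc
    (gdb) hwwbrk 1 1 c015e9c0

The first command uses debug register 0 to watch 4-byte writes at c015e9bc; the second uses debug register 1 to watch single-byte writes at c015e9c0.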
+ +Since the gdb stub at present does not use the protocol used by gdb for hardware +breakpoints, hardware breakpoints are accessed through the gdb macros +described below. + +hwebrk - Places an execution breakpoint + hwebrk breakpointno address +hwwbrk - Places a write breakpoint + hwwbrk breakpointno length address +hwabrk - Places an access breakpoint + hwabrk breakpointno length address +hwrmbrk - Removes a breakpoint + hwrmbrk breakpointno +exinfo - Tells whether a software or hardware breakpoint has occurred. + Prints the number of the hardware breakpoint if a hardware breakpoint has + occurred. + +Arguments required by these commands are as follows: +breakpointno - 0 to 3 +length - 1 to 3 +address - Memory location in hex digits ( without 0x ) e.g c015e9bc + +MP support +========== + +When a breakpoint occurs or the user issues a break ( Ctrl + C ) to the gdb client, +all the processors are forced to enter the debugger. The current thread +corresponds to the thread running on the processor where the breakpoint occurred. +Threads running on other processor(s) appear similar to other non-running +threads in the 'info threads' output. + +The ia-32 hardware debugging registers on all processors are set to the same values, +so a hardware breakpoint may trigger on any processor. + +gdb troubleshooting +=================== + +1. gdb hangs +Kill it. Restart gdb. Connect to the target machine. + +2. gdb cannot connect to target machine (after killing a gdb and restarting +another) +If the target machine was not inside the debugger when you killed gdb, gdb cannot +connect because the target machine won't respond. +In this case, echo a Ctrl+C (ASCII 3) into the serial line, +e.g. echo -e "\003" > /dev/ttyS1 +This forces the target machine into the debugger, after which you can connect. + +3. gdb cannot connect even after echoing Ctrl+C into serial line +Try changing the serial line settings min to 1 and time to 0, +e.g. stty min 1 time 0 < /dev/ttyS1 +Try echoing again. + +Check the serial line speed and set it to the correct value if required, +e.g. stty ispeed 115200 ospeed 115200 < /dev/ttyS1 + +Final Items +=========== + +I picked up this code from Dave Grothe and enhanced it. + +If you make some really cool modification to this stuff, or if you +fix a bug, please let me know. + +Amit S. Kale + + +(First kgdb by David Grothe ) + +(modified by Tigran Aivazian ) + Putting gdbstub into the kernel config menu. + +(modified by Scott Foehner ) + Hooks for entering gdbstub at boot time. + +(modified by Amit S. Kale ) + Threads, ia-32 hw debugging, mp support, console support, + nmi watchdog handling. diff -urpN -X /home/fletch/.diff.exclude 001-bk10/Documentation/sysrq.txt 900-mjb5/Documentation/sysrq.txt --- 001-bk10/Documentation/sysrq.txt Tue Feb 25 23:03:43 2003 +++ 900-mjb5/Documentation/sysrq.txt Sun Mar 16 13:38:57 2003 @@ -73,6 +73,8 @@ On other - If you know of the key combos 'l' - Send a SIGKILL to all processes, INCLUDING init. (Your system will be non-functional after this.) +'g' - Enter the kernel debugger (if configured and supported). + 'h' - Will display help ( actually any other key than those listed above will display help.
but 'h' is easy to remember :-) diff -urpN -X /home/fletch/.diff.exclude 001-bk10/Makefile 900-mjb5/Makefile --- 001-bk10/Makefile Sun Mar 16 13:38:20 2003 +++ 900-mjb5/Makefile Sun Mar 16 19:41:58 2003 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 5 SUBLEVEL = 64 -EXTRAVERSION = -bk10 +EXTRAVERSION = -mjb5 # *DOCUMENTATION* # To see a list of typical targets execute "make help" @@ -281,6 +281,10 @@ ifneq ($(KBUILD_BUILTIN),1) KBUILD_BUILTIN := 1 endif endif +endif + +ifdef CONFIG_X86_REMOTE_DEBUG +CFLAGS += -g endif # diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/Kconfig 900-mjb5/arch/i386/Kconfig --- 001-bk10/arch/i386/Kconfig Sun Mar 16 13:38:20 2003 +++ 900-mjb5/arch/i386/Kconfig Sun Mar 16 18:59:37 2003 @@ -373,6 +373,11 @@ config X86_SSE2 depends on MK8 || MPENTIUM4 default y +config X86_CMOV + bool + depends on M686 || MPENTIUMII || MPENTIUMIII || MPENTIUM4 || MK8 || MCRUSOE + default y + config HUGETLB_PAGE bool "Huge TLB Page Support" help @@ -476,21 +481,6 @@ config NR_CPUS This is purely to save memory - each supported CPU adds approximately eight kilobytes to the kernel image. -# Common NUMA Features -config NUMA - bool "Numa Memory Allocation Support" - depends on (HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT && ACPI && !ACPI_HT_ONLY))) || X86_PC - -config DISCONTIGMEM - bool - depends on NUMA - default y - -config HAVE_ARCH_BOOTMEM_NODE - bool - depends on NUMA - default y - config X86_TSC bool depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2) && !X86_NUMAQ @@ -670,6 +660,44 @@ config HIGHMEM64G endchoice +choice + help + On i386, a process can only virtually address 4GB of memory. This + lets you select how much of that virtual space you would like to + devoted to userspace, and how much to the kernel. + + Some userspace programs would like to address as much as possible and + have few demands of the kernel other than it get out of the way. These + users may opt to use the 3.5GB option to give their userspace program + as much room as possible. Due to alignment issues imposed by PAE, + the "3.5GB" option is unavailable if "64GB" high memory support is + enabled. + + Other users (especially those who use PAE) may be running out of + ZONE_NORMAL memory. Those users may benefit from increasing the + kernel's virtual address space size by taking it away from userspace, + which may not need all of its space. An indicator that this is + happening is when /proc/Meminfo's "LowFree:" is a small percentage of + "LowTotal:" while "HighFree:" is very large. 
+ + If unsure, say "3GB" + prompt "User address space size" + default 1GB + +config 05GB + bool "3.5 GB" + depends on !HIGHMEM64G + +config 1GB + bool "3 GB" + +config 2GB + bool "2 GB" + +config 3GB + bool "1 GB" +endchoice + config HIGHMEM bool depends on HIGHMEM64G || HIGHMEM4G @@ -680,6 +708,30 @@ config X86_PAE depends on HIGHMEM64G default y +# Common NUMA Features +config NUMA + bool "Numa Memory Allocation Support" + depends on SMP && HIGHMEM64G && (X86_PC || X86_NUMAQ || (X86_SUMMIT && ACPI && !ACPI_HT_ONLY)) + default n if X86_PC + default y if (X86_NUMAQ || X86_SUMMIT) + +# Need comments to help the hapless user trying to turn on NUMA support +comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" + depends on X86_NUMAQ && (!HIGHMEM64G || !SMP) + +comment "NUMA (Summit) requires SMP, 64GB highmem support, full ACPI" + depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI || ACPI_HT_ONLY) + +config DISCONTIGMEM + bool + depends on NUMA + default y + +config HAVE_ARCH_BOOTMEM_NODE + bool + depends on NUMA + default y + config HIGHPTE bool "Allocate 3rd-level pagetables from highmem" depends on HIGHMEM4G || HIGHMEM64G @@ -689,6 +741,25 @@ config HIGHPTE low memory. Setting this option will put user-space page table entries in high memory. +config 4K_STACK + bool "Use smaller 4k per-task stacks" + help + This option will shrink the kernel's per-task stack from 8k to + 4k. This will greatly increase your chance of overflowing it. + But, if you use the per-cpu interrupt stacks as well, your chances + go way down. Also try the CONFIG_X86_STACK_CHECK overflow + detection. It is much more reliable than the currently in-kernel + version. + +config SHAREPTE + bool "Share 3rd-level pagetables between processes" + help + Normally each address space has its own complete page table for all + its mappings. This can mean many mappings of a set of shared data + pages. With this option, the VM will attempt to share the bottom + level of the page table between address spaces that are sharing data + pages. + config MATH_EMULATION bool "Math emulation" ---help--- @@ -748,6 +819,25 @@ config MTRR See for more information. +choice + help + This is unrelated to your processor's speed. This variable alters + how often the system is asked to generate timer interrupts. A larger + value can lead to a more responsive system, but also causes extra + overhead from the increased number of context switches. + + If in doubt, leave it at the default of 1000. + + prompt "Kernel HZ" + default 1000HZ + +config 100HZ + bool "100 Hz" + +config 1000HZ + bool "1000 Hz" +endchoice + config HAVE_DEC_LOCK bool depends on (SMP || PREEMPT) && X86_CMPXCHG @@ -1436,12 +1526,54 @@ source "arch/i386/oprofile/Kconfig" menu "Kernel hacking" +config CRASH_DUMP + tristate "Crash dump support (EXPERIMENTAL)" + depends on EXPERIMENTAL + default n + ---help--- + Say Y here to enable saving an image of system memory when a panic + or other error occurs. Dumps can also be forced with the SysRq+d + key if MAGIC_SYSRQ is enabled. + +config CRASH_DUMP_BLOCKDEV + tristate "Crash dump block device driver" + depends on CRASH_DUMP + help + Say Y to allow saving crash dumps directly to a disk device. + +config CRASH_DUMP_NETDEV + tristate "Crash dump network device driver" + depends on CRASH_DUMP + help + Say Y to allow saving crash dumps over a network device. + +config CRASH_DUMP_COMPRESS_RLE + tristate "Crash dump RLE compression" + depends on CRASH_DUMP + help + Say Y to allow saving dumps with Run Length Encoding compression. 
+ +config CRASH_DUMP_COMPRESS_GZIP + tristate "Crash dump GZIP compression" + depends on CRASH_DUMP + help + Say Y to allow saving dumps with Gnu Zip compression. + config DEBUG_KERNEL bool "Kernel debugging" help Say Y here if you are developing drivers or trying to debug and identify kernel problems. +config KPROBES + bool "Kprobes" + depends on DEBUG_KERNEL + help + Kprobes allows you to trap at almost any kernel address, using + register_kprobe(), and providing a callback function. This is useful + for kernel debugging, non-intrusive instrumentation and testing. If + in doubt, say "N". + config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL @@ -1454,6 +1586,17 @@ config DEBUG_SLAB allocation as well as poisoning memory on free to catch use of freed memory. +config X86_REMOTE_DEBUG + bool "KGDB: Remote (serial) kernel debugging with gdb" + +config KGDB_THREAD + bool "KGDB: Thread analysis" + depends on X86_REMOTE_DEBUG + +config GDB_CONSOLE + bool "KGDB: Console messages through gdb" + depends on X86_REMOTE_DEBUG + config DEBUG_IOVIRT bool "Memory mapped I/O debugging" depends on DEBUG_KERNEL @@ -1479,6 +1622,26 @@ config MAGIC_SYSRQ keys are documented in . Don't say Y unless you really know what this hack does. +config EARLY_PRINTK + bool "Early console support" + default n + depends on DEBUG_KERNEL + help + Write kernel log output directly into the VGA buffer or serial port. + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. For normal operation + it is not recommended because it looks ugly and doesn't cooperate + with klogd/syslogd or the X server.You should normally N here, + unless you want to debug such a crash. + + Syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. + config DEBUG_SPINLOCK bool "Spinlock debugging" depends on DEBUG_KERNEL @@ -1488,6 +1651,22 @@ config DEBUG_SPINLOCK best used in conjunction with the NMI watchdog so that spinlock deadlocks are also debuggable. +config SPINLINE + bool "Spinlock inlining" + depends on DEBUG_KERNEL + help + This will change spinlocks from out of line to inline, making them + account cost to the callers in readprofile, rather than the lock + itself (as ".text.lock.filename"). This can be helpful for finding + the callers of locks. + +config LOCKMETER + bool "Kernel lock metering" + depends on SMP + help + Say Y to enable kernel lock metering, which adds overhead to SMP locks, + but allows you to see various statistics using the lockstat command. + config DEBUG_HIGHMEM bool "Highmem debugging" depends on DEBUG_KERNEL && HIGHMEM @@ -1509,12 +1688,33 @@ config DEBUG_SPINLOCK_SLEEP noisy if they are called with a spinlock held. config FRAME_POINTER - bool "Compile the kernel with frame pointers" + bool + default y if X86_REMOTE_DEBUG + default n if !X86_REMOTE_DEBUG help If you say Y here the resulting kernel image will be slightly larger and slower, but it will give very useful debugging information. If you don't debug the kernel, you can say N, but we may not be able to solve problems without frame pointers. 
+ +config X86_STACK_CHECK + bool "Detect stack overflows" + depends on FRAME_POINTER + help + Say Y here to have the kernel attempt to detect when the per-task + kernel stack overflows. This is much more robust checking than + the above overflow check, which will only occasionally detect + an overflow. The level of guarantee here is much greater. + + Some older versions of gcc don't handle the -p option correctly. + Kernprof is affected by the same problem, which is described here: + http://oss.sgi.com/projects/kernprof/faq.html#Q9 + + Basically, if you get oopses in __free_pages_ok during boot when + you have this turned on, you need to fix gcc. The Redhat 2.96 + version and gcc-3.x seem to work. + + If not debugging a stack overflow problem, say N config X86_EXTRA_IRQS bool diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/Makefile 900-mjb5/arch/i386/Makefile --- 001-bk10/arch/i386/Makefile Sun Mar 16 13:38:20 2003 +++ 900-mjb5/arch/i386/Makefile Sun Mar 16 13:38:59 2003 @@ -76,6 +76,10 @@ mcore-$(CONFIG_X86_SUMMIT) := mach-defa # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default +ifdef CONFIG_X86_STACK_CHECK +CFLAGS += -p +endif + head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o libs-y += arch/i386/lib/ @@ -89,6 +93,7 @@ drivers-$(CONFIG_OPROFILE) += arch/i386 CFLAGS += $(mflags-y) AFLAGS += $(mflags-y) +AFLAGS_vmlinux.lds.o += -imacros $(TOPDIR)/include/asm-i386/page.h boot := arch/i386/boot diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/boot/Makefile 900-mjb5/arch/i386/boot/Makefile --- 001-bk10/arch/i386/boot/Makefile Sun Mar 16 13:38:20 2003 +++ 900-mjb5/arch/i386/boot/Makefile Sun Mar 16 13:39:02 2003 @@ -102,3 +102,4 @@ zlilo: $(BOOTIMAGE) install: $(BOOTIMAGE) sh $(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)" + if [ -f init/kerntypes.o ]; then cp init/kerntypes.o $(INSTALL_PATH)/Kerntypes; fi diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/boot/compressed/misc.c 900-mjb5/arch/i386/boot/compressed/misc.c --- 001-bk10/arch/i386/boot/compressed/misc.c Thu Jan 2 22:04:58 2003 +++ 900-mjb5/arch/i386/boot/compressed/misc.c Sun Mar 16 13:38:59 2003 @@ -377,3 +377,7 @@ asmlinkage int decompress_kernel(struct if (high_loaded) close_output_buffer_if_we_run_high(mv); return high_loaded; } + +/* We don't actually check for stack overflows this early. 
*/ +__asm__(".globl mcount ; mcount: ret\n"); + diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/Makefile 900-mjb5/arch/i386/kernel/Makefile --- 001-bk10/arch/i386/kernel/Makefile Wed Mar 5 07:36:57 2003 +++ 900-mjb5/arch/i386/kernel/Makefile Sun Mar 16 13:39:05 2003 @@ -17,6 +17,7 @@ obj-$(CONFIG_MCA) += mca.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_X86_REMOTE_DEBUG) += gdbstub.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o @@ -25,10 +26,20 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o n obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o suspend_asm.o obj-$(CONFIG_X86_NUMAQ) += numaq.o +obj-$(CONFIG_X86_SUMMIT) += summit.o obj-$(CONFIG_EDD) += edd.o +obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_MODULES) += module.o obj-y += sysenter.o obj-$(CONFIG_ACPI_SRAT) += srat.o + +ifdef CONFIG_X86_REMOTE_DEBUG +GDBSTART=gdbstart +GDBCLEAN= -rm -f gdbstart /sbin/gdbstart +else +GDBSTART= +GDBCLEAN= +endif EXTRA_AFLAGS := -traditional diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/apic.c 900-mjb5/arch/i386/kernel/apic.c --- 001-bk10/arch/i386/kernel/apic.c Wed Mar 5 07:36:57 2003 +++ 900-mjb5/arch/i386/kernel/apic.c Sun Mar 16 13:38:58 2003 @@ -1053,7 +1053,8 @@ inline void smp_local_timer_interrupt(st * interrupt as well. Thus we cannot inline the local irq ... ] */ -void smp_apic_timer_interrupt(struct pt_regs regs) +struct pt_regs * IRQHANDLER(smp_apic_timer_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_apic_timer_interrupt(struct pt_regs* regs) { int cpu = smp_processor_id(); @@ -1073,14 +1074,16 @@ void smp_apic_timer_interrupt(struct pt_ * interrupt lock, which is the WrongThing (tm) to do. 
*/ irq_enter(); - smp_local_timer_interrupt(®s); + smp_local_timer_interrupt(regs); irq_exit(); + return regs; } /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -asmlinkage void smp_spurious_interrupt(void) +struct pt_regs * IRQHANDLER(smp_spurious_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_spurious_interrupt(struct pt_regs* regs) { unsigned long v; @@ -1098,13 +1101,15 @@ asmlinkage void smp_spurious_interrupt(v printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", smp_processor_id()); irq_exit(); + return regs; } /* * This interrupt should never happen with our APIC/SMP architecture */ -asmlinkage void smp_error_interrupt(void) +struct pt_regs * IRQHANDLER(smp_error_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_error_interrupt(struct pt_regs* regs) { unsigned long v, v1; @@ -1129,6 +1134,7 @@ asmlinkage void smp_error_interrupt(void printk (KERN_INFO "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); irq_exit(); + return regs; } /* diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/cpu/mcheck/p4.c 900-mjb5/arch/i386/kernel/cpu/mcheck/p4.c --- 001-bk10/arch/i386/kernel/cpu/mcheck/p4.c Thu Jan 2 22:04:58 2003 +++ 900-mjb5/arch/i386/kernel/cpu/mcheck/p4.c Sun Mar 16 13:38:58 2003 @@ -61,11 +61,13 @@ static void intel_thermal_interrupt(stru /* Thermal interrupt handler for this CPU setup */ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; -asmlinkage void smp_thermal_interrupt(struct pt_regs regs) +struct pt_regs * IRQHANDLER(smp_thermal_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_thermal_interrupt(struct pt_regs* regs) { irq_enter(); vendor_thermal_interrupt(®s); irq_exit(); + return regs; } /* P4/Xeon Thermal regulation detect and init */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/entry.S 900-mjb5/arch/i386/kernel/entry.S --- 001-bk10/arch/i386/kernel/entry.S Sun Mar 16 13:38:20 2003 +++ 900-mjb5/arch/i386/kernel/entry.S Sun Mar 16 18:34:49 2003 @@ -49,6 +49,10 @@ #include #include "irq_vectors.h" +#ifndef CONFIG_KGDB_THREAD +#define user_schedule schedule +#endif + EBX = 0x00 ECX = 0x04 EDX = 0x08 @@ -160,7 +164,7 @@ do_lcall: movl %eax,EFLAGS(%ebp) # movl %edx,EIP(%ebp) # Now we move them to their "normal" places movl %ecx,CS(%ebp) # - andl $-8192, %ebp # GET_THREAD_INFO + GET_THREAD_INFO_WITH_ESP(%ebp) # GET_THREAD_INFO movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution domain call *4(%edx) # Call the lcall7 handler for the domain addl $4, %esp @@ -224,7 +228,7 @@ need_resched: jz restore_all movl $PREEMPT_ACTIVE,TI_PRE_COUNT(%ebp) sti - call schedule + call user_schedule movl $0,TI_PRE_COUNT(%ebp) cli jmp need_resched @@ -306,7 +310,7 @@ work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: - call schedule + call user_schedule cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret @@ -394,17 +398,78 @@ ENTRY(irq_entries_start) vector=vector+1 .endr + +# lets play optimizing compiler... +#ifdef CONFIG_X86_CMOV +#define COND_MOVE cmovnz %esi,%esp; +#else +#define COND_MOVE \ + jz 1f; \ + mov %esi,%esp; \ +1: +#endif + +# These macros will switch you to, and from a per-cpu interrupt stack +# They take the pt_regs arg and move it from the normal place on the +# stack to %eax. Any handler function can retrieve it using regparm(1). +# The handlers are expected to return the stack to switch back to in +# the same register. 
+# +# This means that the irq handlers need to return their arg +# +# SWITCH_TO_IRQSTACK clobbers %ebx, %ecx, %edx, %esi +# old stack gets put in %eax + +.macro SWITCH_TO_IRQSTACK + GET_THREAD_INFO(%ebx); + movl TI_IRQ_STACK(%ebx),%ecx; + movl TI_TASK(%ebx),%edx; + movl %esp,%eax; + + # %ecx+THREAD_SIZE is next stack -4 keeps us in the right one + leal (THREAD_SIZE-4)(%ecx),%esi; + + # is there a valid irq_stack? + testl %ecx,%ecx; + COND_MOVE; + + # update the task pointer in the irq stack + GET_THREAD_INFO(%esi); + movl %edx,TI_TASK(%esi); + + # update the preempt count in the irq stack + movl TI_PRE_COUNT(%ebx),%ecx; + movl %ecx,TI_PRE_COUNT(%esi); +.endm + +# copy flags from the irq stack back into the task's thread_info +# %esi is saved over the irq handler call and contains the irq stack's +# thread_info pointer +# %eax was returned from the handler, as described above +# %ebx contains the original thread_info pointer + +.macro RESTORE_FROM_IRQSTACK + movl %eax,%esp; + movl TI_FLAGS(%esi),%eax; + movl $0,TI_FLAGS(%esi); + LOCK orl %eax,TI_FLAGS(%ebx); +.endm + ALIGN common_interrupt: SAVE_ALL + SWITCH_TO_IRQSTACK call do_IRQ + RESTORE_FROM_IRQSTACK jmp ret_from_intr #define BUILD_INTERRUPT(name, nr) \ ENTRY(name) \ pushl $nr-256; \ SAVE_ALL \ - call smp_/**/name; \ + SWITCH_TO_IRQSTACK; \ + call smp_/**/name; \ + RESTORE_FROM_IRQSTACK; \ jmp ret_from_intr; /* The include is where all of the SMP etc. interrupts come from */ @@ -490,9 +555,16 @@ device_not_available_emulate: ENTRY(debug) CHECK_SYSENTER_EIP + pushl $-1 # mark this as an int + SAVE_ALL + movl %esp,%edx pushl $0 - pushl $do_debug - jmp error_code + pushl %edx + call do_debug + addl $8,%esp + testl %eax,%eax + jnz restore_all + jmp ret_from_exception ENTRY(nmi) CHECK_SYSENTER_EIP @@ -506,9 +578,16 @@ ENTRY(nmi) RESTORE_ALL ENTRY(int3) + pushl $-1 # mark this as an int + SAVE_ALL + movl %esp,%edx pushl $0 - pushl $do_int3 - jmp error_code + pushl %edx + call do_int3 + addl $8,%esp + testl %eax,%eax + jnz restore_all + jmp ret_from_exception ENTRY(overflow) pushl $0 @@ -534,6 +613,31 @@ ENTRY(invalid_TSS) pushl $do_invalid_TSS jmp error_code +#ifdef CONFIG_KGDB_THREAD +ENTRY(kern_schedule) + pushl %ebp + movl %esp, %ebp + pushl %ss + pushl %ebp + pushfl + pushl %cs + pushl 4(%ebp) + pushl %eax + pushl %es + pushl %ds + pushl %eax + pushl (%ebp) + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + call kern_do_schedule + movl %ebp, %esp + pop %ebp + ret +#endif + ENTRY(segment_not_present) pushl $do_segment_not_present jmp error_code @@ -565,6 +669,61 @@ ENTRY(spurious_interrupt_bug) pushl $0 pushl $do_spurious_interrupt_bug jmp error_code + + +#ifdef CONFIG_X86_STACK_CHECK +.data + .globl stack_overflowed +stack_overflowed: + .long 0 +.text + +ENTRY(mcount) + push %eax + movl $(THREAD_SIZE - 1),%eax + andl %esp,%eax + cmpl $STACK_WARN,%eax /* more than half the stack is used*/ + jle 1f +2: + popl %eax + ret +1: + lock; btsl $0,stack_overflowed + jc 2b + + # switch to overflow stack + movl %esp,%eax + movl $(stack_overflow_stack + THREAD_SIZE - 4),%esp + + pushf + cli + pushl %eax + + # push eip then esp of error for stack_overflow_panic + pushl 4(%eax) + pushl %eax + + # update the task pointer and cpu in the overflow stack's thread_info. 
+ GET_THREAD_INFO_WITH_ESP(%eax) + movl TI_TASK(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_TASK + movl TI_CPU(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_CPU + + call stack_overflow + + # pop off call arguments + addl $8,%esp + + popl %eax + popf + movl %eax,%esp + popl %eax + movl $0,stack_overflowed + ret + +#warning stack check enabled +#endif .data ENTRY(sys_call_table) diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/gdbstart.c 900-mjb5/arch/i386/kernel/gdbstart.c --- 001-bk10/arch/i386/kernel/gdbstart.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/arch/i386/kernel/gdbstart.c Sun Mar 16 13:38:57 2003 @@ -0,0 +1,147 @@ +/* + * This program opens a tty file and issues the GDB stub activating + * ioctl on it. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +char *tty_name = "/dev/ttyS0" ; /* COM1 port */ +int speed = 9600 ; /* default speed */ +struct termios save_ts ; /* original term struct */ + +void print_usage(void) +{ + printf("gdbstub [-s speed] [-t tty-dev]\n") ; + printf(" defaults: /dev/ttyS0 with speed unmodified by this program\n"); + +} /* print_usage */ + +void tty_err(char *msg) +{ + char buf[100] ; + + strcpy(buf, msg) ; + strcat(buf, ": ") ; + strcat(buf, tty_name) ; + perror(buf) ; + exit(1) ; + +} /* tty_err */ + + +void setup_term(int fd) +{ + struct termios ts ; + int speed_code ; + + if (tcgetattr(fd, &ts) < 0) tty_err("tcgetattr") ; + + save_ts = ts ; + switch (speed) + { + case 4800: + speed_code = B4800 ; + break ; + case 9600: + speed_code = B9600 ; + break ; + case 19200: + speed_code = B19200 ; + break ; + case 38400: + speed_code = B38400 ; + break ; + case 57600: + speed_code = B57600 ; + break ; + case 115200: + speed_code = B115200 ; + break ; + case 230400: + speed_code = B230400 ; + break ; + default: + printf("Invalid speed: %d\n", speed) ; + exit(1) ; + } + + ts.c_cflag = CS8 | CREAD | CLOCAL ; + if (cfsetospeed(&ts, speed_code) < 0) tty_err("cfsetospeed") ; + if (cfsetispeed(&ts, speed_code) < 0) tty_err("cfsetispeed") ; + + if (tcsetattr(fd, TCSANOW, &ts) < 0) tty_err("tcsetattr") ; + +} /* setup_term */ + +int main(int argc, char **argv) +{ + int opt ; + int fil ; + int rslt ; + + while ((opt = getopt(argc, argv, "hs:t:")) > 0) + { + switch (opt) + { + case 's': + speed = atol(optarg) ; + break ; + case 't': + tty_name = optarg ; + break ; + case ':': + printf("Invalid option\n") ; + break ; + case '?': + case 'h': + default: + print_usage() ; + return 1; + } + } + + fil = open(tty_name, O_RDWR) ; + if (fil < 0) + { + perror(tty_name) ; + return 1; + } + + + setup_term(fil) ; + + /* + * When we issue this ioctl, control will not return until + * the debugger running on the remote host machine says "go". 
+ */ + printf("\nAbout to activate GDB stub in the kernel on %s\n", tty_name) ; + printf("Hit CR to continue, kill program to abort -- ") ; + getchar() ; + sync() ; + rslt = ioctl(fil, TIOCGDB, 0) ; + if (rslt < 0) + { + perror("TIOCGDB ioctl") ; + return 1; + } + + printf("\nGDB stub successfully activated\n") ; + + for (;;) + { + pause() ; + } + + if (tcsetattr(fil, TCSANOW, &save_ts) < 0) tty_err("tcsetattr") ; + + exit(0); +} /* main */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/gdbstub.c 900-mjb5/arch/i386/kernel/gdbstub.c --- 001-bk10/arch/i386/kernel/gdbstub.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/arch/i386/kernel/gdbstub.c Sun Mar 16 13:38:57 2003 @@ -0,0 +1,1208 @@ +/* + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +/* + * Copyright (C) 2000-2001 VERITAS Software Corporation. + */ +/**************************************************************************** + * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ + * + * Module name: remcom.c $ + * Revision: 1.34 $ + * Date: 91/03/09 12:29:49 $ + * Contributor: Lake Stevens Instrument Division$ + * + * Description: low level support for gdb debugger. $ + * + * Considerations: only works on target hardware $ + * + * Written by: Glenn Engel $ + * Updated by: Amit Kale + * ModuleState: Experimental $ + * + * NOTES: See Below $ + * + * Modified for 386 by Jim Kingdon, Cygnus Support. + * Origianl kgdb, compatibility with 2.1.xx kernel by David Grothe + * Integrated into 2.2.5 kernel by Tigran Aivazian + * thread support, + * support for multiple processors, + * support for ia-32(x86) hardware debugging, + * Console support, + * handling nmi watchdog + * Amit S. Kale ( akale@veritas.com ) + * + * + * To enable debugger support, two things need to happen. One, a + * call to set_debug_traps() is necessary in order to allow any breakpoints + * or error conditions to be properly intercepted and reported to gdb. + * Two, a breakpoint needs to be generated to begin communication. This + * is most easily accomplished by a call to breakpoint(). Breakpoint() + * simulates a breakpoint by executing an int 3. + * + ************* + * + * The following gdb commands are supported: + * + * command function Return value + * + * g return the value of the CPU registers hex data or ENN + * G set the value of the CPU registers OK or ENN + * + * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN + * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN + * + * c Resume at current address SNN ( signal NN) + * cAA..AA Continue at address AA..AA SNN + * + * s Step one instruction SNN + * sAA..AA Step one instruction from AA..AA SNN + * + * k kill + * + * ? What was the last sigval ? SNN (signal NN) + * + * All commands and responses are sent with a packet which includes a + * checksum. A packet consists of + * + * $#. + * + * where + * :: + * :: < two hex digits computed as modulo 256 sum of > + * + * When a packet is received, it is first acknowledged with either '+' or '-'. + * '+' indicates a successful transfer. '-' indicates a failed transfer. 
+ * + * Example: + * + * Host: Reply: + * $m0,10#2a +$00010203040506070809101112131415#42 + * + ****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for linux pt_regs struct */ +#include +#ifdef CONFIG_GDB_CONSOLE +#include +#endif +#include + +/************************************************************************ + * + * external low-level support routines + */ +typedef void (*Function) (void); /* pointer to a function */ + +/* Thread reference */ +typedef unsigned char threadref[8]; + +extern int putDebugChar(int); /* write a single character */ +extern int getDebugChar(void); /* read and return a single char */ + +extern int pid_max; + +/************************************************************************/ +/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ +/* at least NUMREGBYTES*2 are needed for register packets */ +/* Longer buffer is needed to list all threads */ +#define BUFMAX 1024 + +static char initialized; /* boolean flag. != 0 means we've been initialized */ + +static const char hexchars[] = "0123456789abcdef"; + +/* Number of bytes of registers. */ +#define NUMREGBYTES 64 +/* + * Note that this register image is in a different order than + * the register image that Linux produces at interrupt time. + * + * Linux's register image is defined by struct pt_regs in ptrace.h. + * Just why GDB uses a different order is a historical mystery. + */ +enum regnames { _EAX, /* 0 */ + _ECX, /* 1 */ + _EDX, /* 2 */ + _EBX, /* 3 */ + _ESP, /* 4 */ + _EBP, /* 5 */ + _ESI, /* 6 */ + _EDI, /* 7 */ + _PC /* 8 also known as eip */ , + _PS /* 9 also known as eflags */ , + _CS, /* 10 */ + _SS, /* 11 */ + _DS, /* 12 */ + _ES, /* 13 */ + _FS, /* 14 */ + _GS +}; /* 15 */ + +/*************************** ASSEMBLY CODE MACROS *************************/ +/* */ + +#define BREAKPOINT() asm(" int $3"); + +/* Put the error code here just in case the user cares. */ +int gdb_i386errcode; +/* Likewise, the vector number here (since GDB only gets the signal + number through the usual means, and that's not very specific). 
*/ +int gdb_i386vector = -1; + +static spinlock_t slavecpulocks[KGDB_MAX_NO_CPUS]; +volatile int procindebug[KGDB_MAX_NO_CPUS]; + +#ifdef CONFIG_SMP +spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +spinlock_t kgdb_nmispinlock = SPIN_LOCK_UNLOCKED; +#else +unsigned kgdb_spinlock = 0; +unsigned kgdb_nmispinlock = 0; +#endif + +static void +kgdb_usercode(void) +{ +} + +int +hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return (ch - 'a' + 10); + if ((ch >= '0') && (ch <= '9')) + return (ch - '0'); + if ((ch >= 'A') && (ch <= 'F')) + return (ch - 'A' + 10); + return (-1); +} + +/* scan for the sequence $# */ +void +getpacket(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int i; + int count; + char ch; + + do { + /* wait around for the start character, ignore all other characters */ + while ((ch = (getDebugChar() & 0x7f)) != '$') ; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* now, read until a # or end of buffer is found */ + while (count < BUFMAX) { + ch = getDebugChar() & 0x7f; + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(getDebugChar() & 0x7f) << 4; + xmitcsum += hex(getDebugChar() & 0x7f); + + if (checksum != xmitcsum) + putDebugChar('-'); /* failed checksum */ + else { + putDebugChar('+'); /* successful transfer */ + /* if a sequence char is present, reply the sequence ID */ + if (buffer[2] == ':') { + putDebugChar(buffer[0]); + putDebugChar(buffer[1]); + /* remove sequence chars from buffer */ + count = strlen(buffer); + for (i = 3; i <= count; i++) + buffer[i - 3] = buffer[i]; + } + } + } + } while (checksum != xmitcsum); + +} + +/* send the packet in buffer. */ + +void +putpacket(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* $#. */ + do { + putDebugChar('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + if (!putDebugChar(ch)) + return; + checksum += ch; + count += 1; + } + + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + + } while ((getDebugChar() & 0x7f) != '+'); + +} + +static char remcomInBuffer[BUFMAX]; +static char remcomOutBuffer[BUFMAX]; +static short error; + +static void +regs_to_gdb_regs(int *gdb_regs, struct pt_regs *regs) +{ + gdb_regs[_EAX] = regs->eax; + gdb_regs[_EBX] = regs->ebx; + gdb_regs[_ECX] = regs->ecx; + gdb_regs[_EDX] = regs->edx; + gdb_regs[_ESI] = regs->esi; + gdb_regs[_EDI] = regs->edi; + gdb_regs[_EBP] = regs->ebp; + gdb_regs[_DS] = regs->xds; + gdb_regs[_ES] = regs->xes; + gdb_regs[_PS] = regs->eflags; + gdb_regs[_CS] = regs->xcs; + gdb_regs[_PC] = regs->eip; + gdb_regs[_ESP] = (int) (®s->esp); + gdb_regs[_SS] = __KERNEL_DS; + gdb_regs[_FS] = 0xFFFF; + gdb_regs[_GS] = 0xFFFF; +} /* regs_to_gdb_regs */ + +static void +gdb_regs_to_regs(int *gdb_regs, struct pt_regs *regs) +{ + regs->eax = gdb_regs[_EAX]; + regs->ebx = gdb_regs[_EBX]; + regs->ecx = gdb_regs[_ECX]; + regs->edx = gdb_regs[_EDX]; + regs->esi = gdb_regs[_ESI]; + regs->edi = gdb_regs[_EDI]; + regs->ebp = gdb_regs[_EBP]; + regs->xds = gdb_regs[_DS]; + regs->xes = gdb_regs[_ES]; + regs->eflags = gdb_regs[_PS]; + regs->xcs = gdb_regs[_CS]; + regs->eip = gdb_regs[_PC]; +#if 0 /* can't change these */ + regs->esp = gdb_regs[_ESP]; + regs->xss = gdb_regs[_SS]; + regs->fs = gdb_regs[_FS]; + regs->gs = gdb_regs[_GS]; +#endif + +} /* gdb_regs_to_regs */ + +/* Indicate to caller of mem2hex or hex2mem that there has been an + error. 
*/ +static volatile int kgdb_memerr = 0; +volatile int kgdb_memerr_expected = 0; +static volatile int kgdb_memerr_cnt = 0; +static int garbage_loc = -1; + +int +get_char(char *addr) +{ + return *addr; +} + +void +set_char(char *addr, int val) +{ + *addr = val; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* return a pointer to the last char put in buf (null) */ +/* If MAY_FAULT is non-zero, then we should set kgdb_memerr in response to + a fault; if zero treat a fault like any other fault in the stub. */ +char * +mem2hex(char *mem, char *buf, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + } + for (i = 0; i < count; i++) { + + ch = get_char(mem++); + + if (may_fault && kgdb_memerr) { + *buf = 0; /* truncate buffer */ + return (buf); + } + *buf++ = hexchars[ch >> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (may_fault) + kgdb_memerr_expected = 0; + return (buf); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return a pointer to the character AFTER the last byte written */ +char * +hex2mem(char *buf, char *mem, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + } + for (i = 0; i < count; i++) { + ch = hex(*buf++) << 4; + ch = ch + hex(*buf++); + set_char(mem++, ch); + + if (may_fault && kgdb_memerr) { + return (mem); + } + } + if (may_fault) + kgdb_memerr_expected = 0; + return (mem); +} + +/**********************************************/ +/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ +/* RETURN NUMBER OF CHARS PROCESSED */ +/**********************************************/ +int +hexToInt(char **ptr, int *intValue) +{ + int numChars = 0; + int hexValue; + + *intValue = 0; + + while (**ptr) { + hexValue = hex(**ptr); + if (hexValue >= 0) { + *intValue = (*intValue << 4) | hexValue; + numChars++; + } else + break; + + (*ptr)++; + } + + return (numChars); +} + +#ifdef CONFIG_KGDB_THREAD +static int +stubhex(int ch) +{ + if (ch >= 'a' && ch <= 'f') + return ch - 'a' + 10; + if (ch >= '0' && ch <= '9') + return ch - '0'; + if (ch >= 'A' && ch <= 'F') + return ch - 'A' + 10; + return -1; +} + +static int +stub_unpack_int(char *buff, int fieldlength) +{ + int nibble; + int retval = 0; + + while (fieldlength) { + nibble = stubhex(*buff++); + retval |= nibble; + fieldlength--; + if (fieldlength) + retval = retval << 4; + } + return retval; +} +#endif + +static char * +pack_hex_byte(char *pkt, int byte) +{ + *pkt++ = hexchars[(byte >> 4) & 0xf]; + *pkt++ = hexchars[(byte & 0xf)]; + return pkt; +} + +#define BUF_THREAD_ID_SIZE 16 + +#ifdef CONFIG_KGDB_THREAD +static char * +pack_threadid(char *pkt, threadref * id) +{ + char *limit; + unsigned char *altid; + + altid = (unsigned char *) id; + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *altid++); + return pkt; +} + +static char * +unpack_byte(char *buf, int *value) +{ + *value = stub_unpack_int(buf, 2); + return buf + 2; +} + +static char * +unpack_threadid(char *inbuf, threadref * id) +{ + char *altref; + char *limit = inbuf + BUF_THREAD_ID_SIZE; + int x, y; + + altref = (char *) id; + + while (inbuf < limit) { + x = stubhex(*inbuf++); + y = stubhex(*inbuf++); + *altref++ = (x << 4) | y; + } + return inbuf; +} +#endif + +void +int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; 
+ } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} + +#ifdef CONFIG_KGDB_THREAD +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + unsigned char *scan; + + scan = (char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} + +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } +#if 0 + thread = init_tasks[0]; + do { + if (thread->pid == pid) { + return thread; + } + thread = thread->next_task; + } while (thread != init_tasks[0]); +#endif + return NULL; +} +#endif + +struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned addr; +} breakinfo[4] = { { +enabled:0}, { +enabled:0}, { +enabled:0}, { +enabled:0}}; + +void +correct_hw_break(void) +{ + int breakno; + int correctit; + int breakbit; + unsigned dr7; + + asm volatile ("movl %%db7, %0\n":"=r" (dr7) + :); + do { + unsigned addr0, addr1, addr2, addr3; + asm volatile ("movl %%db0, %0\n" + "movl %%db1, %1\n" + "movl %%db2, %2\n" + "movl %%db3, %3\n":"=r" (addr0), "=r"(addr1), + "=r"(addr2), "=r"(addr3):); + } while (0); + correctit = 0; + for (breakno = 0; breakno < 3; breakno++) { + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + dr7 |= (((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << 16) << + (breakno << 2); + switch (breakno) { + case 0: + asm volatile ("movl %0, %%dr0\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 1: + asm volatile ("movl %0, %%dr1\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 2: + asm volatile ("movl %0, %%dr2\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 3: + asm volatile ("movl %0, %%dr3\n"::"r" + (breakinfo[breakno].addr)); + break; + } + } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + } + } + if (correctit) { + asm volatile ("movl %0, %%db7\n"::"r" (dr7)); + } +} + +int +remove_hw_break(unsigned breakno) +{ + if (!breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 0; + return 0; +} + +int +set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr) +{ + if (breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 1; + breakinfo[breakno].type = type; + breakinfo[breakno].len = len; + breakinfo[breakno].addr = addr; + return 0; +} + +void +gdb_wait(void *arg) +{ + unsigned flags; + int processor; + + local_irq_save(flags); + processor = smp_processor_id(); + procindebug[processor] = 1; + current->thread.kgdbregs = arg; + spin_lock(slavecpulocks + processor); + correct_hw_break(); + procindebug[processor] = 0; + local_irq_restore(flags); +} + +void +printexceptioninfo(int exceptionNo, int errorcode, char *buffer) +{ + unsigned dr6; + int i; + switch (exceptionNo) { + case 1: /* debug exception */ + break; + case 3: /* breakpoint */ + sprintf(buffer, "Software breakpoint"); + return; + default: + sprintf(buffer, "Details not available"); + return; + } + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (dr6 & 0x4000) { + sprintf(buffer, "Single step"); + return; + } + for (i = 0; i < 4; ++i) { + if (dr6 & (1 << i)) { + sprintf(buffer, "Hardware breakpoint %d", i); + return; + } + } + sprintf(buffer, "Unknown trap"); + 
return; +} + +/* + * This function does all command procesing for interfacing to gdb. + * + * NOTE: The INT nn instruction leaves the state of the interrupt + * enable flag UNCHANGED. That means that when this routine + * is entered via a breakpoint (INT 3) instruction from code + * that has interrupts enabled, then interrupts will STILL BE + * enabled when this routine is entered. The first thing that + * we do here is disable interrupts so as to prevent recursive + * entries and bothersome serial interrupts while we are + * trying to run the serial port in polled mode. + * + * For kernel version 2.1.xx the cli() actually gets a spin lock so + * it is always necessary to do a restore_flags before returning + * so as to let go of that lock. + */ +int +handle_exception(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs) +{ + struct task_struct *usethread = NULL; + int addr, length; + int breakno, breaktype; + char *ptr; + int newPC; + unsigned long flags = ~0UL; + int gdb_regs[NUMREGBYTES / 4]; + int i; + int dr6; + int reboot = 0; +#ifdef CONFIG_KGDB_THREAD + int nothreads; + int maxthreads; + int threadid; + threadref thref; + struct task_struct *thread = NULL; +#endif +#define regs (*linux_regs) + + /* + * If the entry is not from the kernel then return to the Linux + * trap handler and let it process the interrupt normally. + */ + if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->xcs)) { + return (0); + } + + if (kgdb_memerr_expected) { + /* + * This fault occured because of the get_char or set_char + * routines. These two routines use either eax of edx to + * indirectly reference the location in memory that they + * are working with. For a page fault, when we return + * the instruction will be retried, so we have to make + * sure that these registers point to valid memory. 
+ */ + kgdb_memerr = 1; /* set mem error flag */ + kgdb_memerr_expected = 0; + kgdb_memerr_cnt++; /* helps in debugging */ + regs.eax = (long) &garbage_loc; /* make valid address */ + regs.edx = (long) &garbage_loc; /* make valid address */ + return (0); + } +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_nmispinlock)) +#else + if (!kgdb_nmispinlock) +#endif + { + + /* Get kgdb spinlock */ +#ifdef CONFIG_SMP + _raw_spin_lock(&kgdb_spinlock); +#else + kgdb_spinlock = 1; +#endif + + local_irq_save(flags); + + /* Disable hardware debugging while we are in kgdb */ + __asm__("movl %0,%%db7": /* no output */ + :"r"(0)); + + for (i = 0; i < NR_CPUS; i++) { + spin_lock_init(&slavecpulocks[i]); + _raw_spin_lock(&slavecpulocks[i]); + } + + if (num_online_cpus() > 1) { + /* Force other cpus in debugger */ + if (smp_call_function(gdb_wait, NULL, 0, 99) != 0) { + return (1); + } + } + + procindebug[smp_processor_id()] = 1; + } + + gdb_i386vector = exceptionVector; + gdb_i386errcode = err_code; + + /* reply to host that an exception has occurred */ + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + + putpacket(remcomOutBuffer); + + while (1 == 1) { + error = 0; + remcomOutBuffer[0] = 0; + getpacket(remcomInBuffer); + switch (remcomInBuffer[0]) { + case '?': + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + break; + case 'g': /* return the value of the CPU registers */ + if (!usethread || usethread == current) { + regs_to_gdb_regs(gdb_regs, ®s); + } else { + memset(gdb_regs, 0, NUMREGBYTES); + if (usethread->thread.kgdbregs) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + get_char((char *) usethread->thread. + kgdbregs); + kgdb_memerr_expected = 0; + if (kgdb_memerr) { + gdb_regs[_PC] = + (int) kgdb_usercode; + } else { + regs_to_gdb_regs(gdb_regs, + usethread-> + thread. + kgdbregs); + } + } else { + gdb_regs[_PC] = (int) kgdb_usercode; + } + } + mem2hex((char *) gdb_regs, remcomOutBuffer, NUMREGBYTES, + 0); + break; + case 'G': /* set the value of the CPU registers - return OK */ + hex2mem(&remcomInBuffer[1], (char *) gdb_regs, + NUMREGBYTES, 0); + if (!usethread || usethread == current) { + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "E00"); + } + break; + + /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + case 'm': + /* TRY TO READ %x,%x. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) + if (*(ptr++) == ',') + if (hexToInt(&ptr, &length)) { + ptr = 0; + mem2hex((char *) addr, + remcomOutBuffer, length, + 1); + if (kgdb_memerr) { + strcpy(remcomOutBuffer, + "E03"); + } + } + + if (ptr) { + strcpy(remcomOutBuffer, "E01"); + } + break; + + /* MAA..AA,LLLL: Write LLLL bytes at address AA.AA return OK */ + case 'M': + /* TRY TO READ '%x,%x:'. 
IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) + if (*(ptr++) == ',') + if (hexToInt(&ptr, &length)) + if (*(ptr++) == ':') { + hex2mem(ptr, + (char *) addr, + length, 1); + + if (kgdb_memerr) { + strcpy + (remcomOutBuffer, + "E03"); + } else { + strcpy + (remcomOutBuffer, + "OK"); + } + + ptr = 0; + } + if (ptr) { + strcpy(remcomOutBuffer, "E02"); + } + break; + + /* cAA..AA Continue at address AA..AA(optional) */ + /* sAA..AA Step one instruction from AA..AA(optional) */ + case 'c': + case 's': +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_nmispinlock)) +#else + if (kgdb_nmispinlock) +#endif + { + strcpy(remcomOutBuffer, "E01"); + break; + } + + /* try to read optional parameter, pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) { + regs.eip = addr; + } + + newPC = regs.eip; + + /* clear the trace bit */ + regs.eflags &= 0xfffffeff; + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') + regs.eflags |= 0x100; + + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (!(dr6 & 0x4000)) { + for (breakno = 0; breakno < 4; ++breakno) { + if (dr6 & (1 << breakno)) { + if (breakinfo[breakno].type == + 0) { + /* Set restore flag */ + regs.eflags |= 0x10000; + break; + } + } + } + } + correct_hw_break(); + asm volatile ("movl %0, %%db6\n"::"r" (0)); + for (i = 0; i < NR_CPUS; i++) { + _raw_spin_unlock(&slavecpulocks[i]); + } + + procindebug[smp_processor_id()] = 0; + /* Release kgdb spinlock */ +#ifdef CONFIG_SMP + _raw_spin_unlock(&kgdb_spinlock); +#else + kgdb_spinlock = 0; +#endif + if (flags != ~0UL) + local_irq_restore(flags); + return (0); + + /* kill the program */ + case 'k': + break; + + /* query */ + case 'q': + switch (remcomInBuffer[1]) { +#ifdef CONFIG_KGDB_THREAD + case 'L': + /* List threads */ + unpack_byte(remcomInBuffer + 3, &maxthreads); + unpack_threadid(remcomInBuffer + 5, &thref); + + remcomOutBuffer[0] = 'q'; + remcomOutBuffer[1] = 'M'; + remcomOutBuffer[4] = '0'; + pack_threadid(remcomOutBuffer + 5, &thref); + + threadid = threadref_to_int(&thref); + for (nothreads = 0; + nothreads < maxthreads + && threadid < pid_max; threadid++) { + thread = getthread(threadid); + if (thread) { + int_to_threadref(&thref, + threadid); + pack_threadid(remcomOutBuffer + + 21 + + nothreads * 16, + &thref); + nothreads++; + } + } + if (threadid == pid_max) { + remcomOutBuffer[4] = '1'; + } + pack_hex_byte(remcomOutBuffer + 2, nothreads); + remcomOutBuffer[21 + nothreads * 16] = '\0'; + break; + + case 'C': + /* Current thread id */ + remcomOutBuffer[0] = 'Q'; + remcomOutBuffer[1] = 'C'; + threadid = current->pid; + int_to_threadref(&thref, threadid); + pack_threadid(remcomOutBuffer + 2, &thref); + remcomOutBuffer[18] = '\0'; + break; +#endif + + case 'E': + /* Print exception info */ + printexceptioninfo(exceptionVector, err_code, + remcomOutBuffer); + break; + } + break; + +#ifdef CONFIG_KGDB_THREAD + /* task related */ + case 'H': + switch (remcomInBuffer[1]) { + case 'g': + ptr = &remcomInBuffer[2]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (!thread) { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + break; + } + usethread = thread; + /* follow through */ + case 'c': + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + break; + } + break; + + /* Query thread status */ + case 'T': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (thread) { + remcomOutBuffer[0] = 'O'; + 
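
Aside (illustrative sketch, not part of the patch): the getpacket()/putpacket() calls in the command loop above exchange packets in GDB's remote serial protocol, framed as '$' <payload> '#' <two hex digits of the payload's modulo-256 checksum>, with '+'/'-' acknowledgement characters handled by the stub. The framing, stand-alone:

#include <stdio.h>

/* Frame a payload as $<payload>#<checksum>, GDB remote-protocol style. */
static void frame_packet(const char *payload, char *out, size_t outlen)
{
	unsigned char sum = 0;
	const char *p;

	for (p = payload; *p; p++)
		sum += (unsigned char) *p;
	snprintf(out, outlen, "$%s#%02x", payload, sum);
}

int main(void)
{
	char pkt[64];

	frame_packet("S05", pkt, sizeof(pkt));	/* stop reply, signal 5 */
	puts(pkt);				/* prints "$S05#b8" */
	return 0;
}
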
remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + } else { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + } + break; +#endif + + case 'r': + reboot = 1; + strcpy(remcomOutBuffer, "OK"); + break; + case 'Y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + ptr++; + hexToInt(&ptr, &breaktype); + ptr++; + hexToInt(&ptr, &length); + ptr++; + hexToInt(&ptr, &addr); + if (set_hw_break + (breakno & 0x3, breaktype & 0x3, length & 0x3, addr) + == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + /* Remove hardware breakpoint */ + case 'y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + if (remove_hw_break(breakno & 0x3) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + } /* switch */ + + /* reply to the request */ + putpacket(remcomOutBuffer); + if (reboot == 1) { + static long no_idt[2]; + __asm__ __volatile__("lidt %0"::"m"(no_idt)); + __asm__ __volatile__("int3"); + } + } +} + +/* this function is used to set up exception handlers for tracing and + breakpoints */ +void +set_debug_traps(void) +{ + /* + * linux_debug_hook is defined in traps.c. We store a pointer + * to our own exception handler into it. + */ + linux_debug_hook = handle_exception; + + /* + * In case GDB is started before us, ack any packets (presumably + * "$?#xx") sitting there. */ + putDebugChar('+'); + + initialized = 1; +} + +/* This function will generate a breakpoint exception. It is used at the + beginning of a program to sync up with a debugger and can be used + otherwise as a quick means to stop program execution and "break" into + the debugger. */ + +void +breakpoint(void) +{ + if (initialized) + BREAKPOINT(); +} + +#ifdef CONFIG_GDB_CONSOLE +char gdbconbuf[BUFMAX]; + +void +gdb_console_write(struct console *co, const char *s, unsigned count) +{ + int i; + int wcount; + char *bufptr; + + if (!gdb_initialized) { + return; + } + gdbconbuf[0] = 'O'; + bufptr = gdbconbuf + 1; + while (count > 0) { + if ((count << 1) > (BUFMAX - 2)) { + wcount = (BUFMAX - 2) >> 1; + } else { + wcount = count; + } + count -= wcount; + for (i = 0; i < wcount; i++) { + bufptr = pack_hex_byte(bufptr, s[i]); + } + *bufptr = '\0'; + s += wcount; + + putpacket(gdbconbuf); + + } +} +#endif +static int __init +kgdb_opt_gdb(char *dummy) +{ + gdb_enter = 1; + return 1; +} +static int __init +kgdb_opt_gdbttyS(char *str) +{ + gdb_ttyS = simple_strtoul(str, NULL, 10); + return 1; +} +static int __init +kgdb_opt_gdbbaud(char *str) +{ + gdb_baud = simple_strtoul(str, NULL, 10); + return 1; +} + +/* + * Sequence of these lines has to be maintained because gdb option is a prefix + * of the other two options + */ + +__setup("gdbttyS=", kgdb_opt_gdbttyS); +__setup("gdbbaud=", kgdb_opt_gdbbaud); +__setup("gdb", kgdb_opt_gdb); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/head.S 900-mjb5/arch/i386/kernel/head.S --- 001-bk10/arch/i386/kernel/head.S Tue Feb 25 23:03:43 2003 +++ 900-mjb5/arch/i386/kernel/head.S Sun Mar 16 13:38:58 2003 @@ -16,6 +16,7 @@ #include #include #include +#include #define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F @@ -325,7 +326,7 @@ rp_sidt: ret ENTRY(stack_start) - .long init_thread_union+8192 + .long init_thread_union+THREAD_SIZE .long __BOOT_DS /* This is the default interrupt "handler" :-) */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/i386_ksyms.c 900-mjb5/arch/i386/kernel/i386_ksyms.c --- 001-bk10/arch/i386/kernel/i386_ksyms.c Sun Mar 16 
13:38:20 2003 +++ 900-mjb5/arch/i386/kernel/i386_ksyms.c Sun Mar 16 13:39:02 2003 @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -33,6 +34,8 @@ #include #include #include +#include +#include extern void dump_thread(struct pt_regs *, struct user *); extern spinlock_t rtc_lock; @@ -148,6 +151,20 @@ EXPORT_SYMBOL(smp_num_siblings); EXPORT_SYMBOL(cpu_sibling_map); #endif +#ifdef CONFIG_X86_REMOTE_DEBUG +void __this_fixmap_does_not_exist(void) +{ + BUG(); +} +EXPORT_SYMBOL(__this_fixmap_does_not_exist); + +void __br_lock_usage_bug(void) +{ + BUG(); +} +EXPORT_SYMBOL(__br_lock_usage_bug); +#endif + #ifdef CONFIG_SMP EXPORT_SYMBOL(cpu_data); EXPORT_SYMBOL(cpu_online_map); @@ -185,6 +202,7 @@ EXPORT_SYMBOL(rtc_lock); EXPORT_SYMBOL_GPL(set_nmi_callback); EXPORT_SYMBOL_GPL(unset_nmi_callback); +EXPORT_SYMBOL_GPL(die_chain); #undef memcpy #undef memset @@ -213,4 +231,26 @@ EXPORT_SYMBOL(kmap_atomic_to_page); #ifdef CONFIG_EDD_MODULE EXPORT_SYMBOL(edd); EXPORT_SYMBOL(eddnr); +#endif + +#ifdef CONFIG_CRASH_DUMP_MODULE +#ifdef CONFIG_SMP +extern irq_desc_t irq_desc[NR_IRQS]; +extern unsigned long irq_affinity[NR_IRQS]; +extern void stop_this_cpu(void *); +EXPORT_SYMBOL(irq_desc); +EXPORT_SYMBOL(irq_affinity); +EXPORT_SYMBOL(stop_this_cpu); +EXPORT_SYMBOL(dump_send_ipi); +#endif +extern int pfn_is_ram(unsigned long); +EXPORT_SYMBOL(pfn_is_ram); +#ifdef ARCH_HAS_NMI_WATCHDOG +EXPORT_SYMBOL(touch_nmi_watchdog); +#endif +#endif + +#ifdef CONFIG_X86_STACK_CHECK +extern void mcount(void); +EXPORT_SYMBOL(mcount); #endif diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/init_task.c 900-mjb5/arch/i386/kernel/init_task.c --- 001-bk10/arch/i386/kernel/init_task.c Thu Feb 13 11:08:02 2003 +++ 900-mjb5/arch/i386/kernel/init_task.c Sun Mar 16 13:38:59 2003 @@ -14,6 +14,14 @@ static struct signal_struct init_signals static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +union thread_union init_irq_union + __attribute__((__section__(".data.init_task"))); + +#ifdef CONFIG_X86_STACK_CHECK +union thread_union stack_overflow_stack + __attribute__((__section__(".data.init_task"))); +#endif + /* * Initial thread structure. 
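
Aside (illustrative sketch, not part of the patch): init_irq_union above gives the boot CPU a separate interrupt stack laid out like a normal thread_union, which is also why head.S now uses THREAD_SIZE instead of a hard-coded 8192. The kernel finds the thread_info (or irq_stack header) by masking the stack pointer down to a THREAD_SIZE boundary; the masking, stand-alone:

#include <stdio.h>

#define THREAD_SIZE 8192UL	/* 8K stacks assumed for the example */

static unsigned long stack_base(unsigned long esp)
{
	return esp & ~(THREAD_SIZE - 1);   /* start of the aligned stack */
}

int main(void)
{
	unsigned long esp = 0xc75f3e40UL;  /* made-up kernel stack pointer */

	printf("thread_info/irq_stack header at 0x%08lx\n", stack_base(esp));
	/* prints 0xc75f2000 */
	return 0;
}
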
* diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/io_apic.c 900-mjb5/arch/i386/kernel/io_apic.c --- 001-bk10/arch/i386/kernel/io_apic.c Sun Mar 16 13:38:20 2003 +++ 900-mjb5/arch/i386/kernel/io_apic.c Sun Mar 16 13:38:48 2003 @@ -116,40 +116,84 @@ static void __init replace_pin_at_irq(un } } -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_pin_list *entry = irq_2_pin + irq; \ - \ - for (;;) { \ - unsigned int reg; \ - pin = entry->pin; \ - if (pin == -1) \ - break; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ - if (!entry->next) \ - break; \ - entry = irq_2_pin + entry->next; \ - } \ - FINAL; \ -} - -#define DO_ACTION(name,R,ACTION, FINAL) \ - \ - static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION, FINAL) - -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ -DO_ACTION( __mask_and_edge, 0, = (reg & 0xffff7fff) | 0x00010000, ) - /* mask = 1, trigger = 0 */ -DO_ACTION( __unmask_and_level, 0, = (reg & 0xfffeffff) | 0x00008000, ) - /* mask = 0, trigger = 1 */ +/* mask = 1 */ +static void __mask_IO_APIC_irq (unsigned int irq) +{ + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + for (;;) { + unsigned int reg; + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + io_apic_modify(entry->apic, 0x10 + pin*2, reg |= 0x00010000); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + io_apic_sync(entry->apic); +} + +/* mask = 0 */ +static void __unmask_IO_APIC_irq (unsigned int irq) +{ + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + for (;;) { + unsigned int reg; + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + io_apic_modify(entry->apic, 0x10 + pin*2, reg &= 0xfffeffff); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +/* mask = 1, trigger = 0 */ +static void __mask_and_edge_IO_APIC_irq (unsigned int irq) +{ + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + for (;;) { + unsigned int reg; + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg = (reg & 0xffff7fff) | 0x00010000; + io_apic_modify(entry->apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +/* mask = 0, trigger = 1 */ +static void __unmask_and_level_IO_APIC_irq (unsigned int irq) +{ + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + for (;;) { + unsigned int reg; + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg = (reg & 0xfffeffff) | 0x00008000; + io_apic_modify(entry->apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} static void mask_IO_APIC_irq (unsigned int irq) { @@ -197,13 +241,23 @@ static void clear_IO_APIC (void) static void set_ioapic_affinity (unsigned int irq, unsigned long mask) { unsigned long flags; + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; /* * Only the first 8 bits are valid. 
*/ mask = mask << 24; spin_lock_irqsave(&ioapic_lock, flags); - __DO_ACTION(1, = mask, ) + for (;;) { + pin = entry->pin; + if (pin == -1) + break; + io_apic_write(entry->apic, 0x10 + 1 + pin*2, mask); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } spin_unlock_irqrestore(&ioapic_lock, flags); } diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/irq.c 900-mjb5/arch/i386/kernel/irq.c --- 001-bk10/arch/i386/kernel/irq.c Sun Mar 16 13:38:20 2003 +++ 900-mjb5/arch/i386/kernel/irq.c Sun Mar 16 13:38:58 2003 @@ -315,7 +315,8 @@ void enable_irq(unsigned int irq) * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage unsigned int do_IRQ(struct pt_regs regs) +struct pt_regs * IRQHANDLER(do_IRQ(struct pt_regs *regs)); +struct pt_regs * do_IRQ(struct pt_regs *regs) { /* * We ack quickly, we don't want the irq controller @@ -327,7 +328,7 @@ asmlinkage unsigned int do_IRQ(struct pt * 0 return value means that this irq is already being * handled by some other CPU. (or is disabled) */ - int irq = regs.orig_eax & 0xff; /* high bits used in ret_from_ code */ + int irq = regs->orig_eax & 0xff; /* high bits used in ret_from_ code */ int cpu = smp_processor_id(); irq_desc_t *desc = irq_desc + irq; struct irqaction * action; @@ -392,7 +393,7 @@ asmlinkage unsigned int do_IRQ(struct pt */ for (;;) { spin_unlock(&desc->lock); - handle_IRQ_event(irq, ®s, action); + handle_IRQ_event(irq, regs, action); spin_lock(&desc->lock); if (likely(!(desc->status & IRQ_PENDING))) @@ -411,7 +412,7 @@ out: irq_exit(); - return 1; + return regs; } /** @@ -869,8 +870,9 @@ static int irq_affinity_write_proc (stru return -EINVAL; irq_affinity[irq] = new_value; +#ifndef CONFIG_X86_SUMMIT irq_desc[irq].handler->set_affinity(irq, new_value); - +#endif return full_count; } diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/kprobes.c 900-mjb5/arch/i386/kernel/kprobes.c --- 001-bk10/arch/i386/kernel/kprobes.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/arch/i386/kernel/kprobes.c Sun Mar 16 13:38:57 2003 @@ -0,0 +1,160 @@ +/* + * Support for kernel probes. + * (C) 2002 Vamsi Krishna S . + */ + +#include +#include +#include +#include +#include + +/* kprobe_status settings */ +#define KPROBE_HIT_ACTIVE 0x00000001 +#define KPROBE_HIT_SS 0x00000002 + +static struct kprobe *current_kprobe; +static unsigned long kprobe_status, kprobe_old_eflags, kprobe_saved_eflags; + +/* + * returns non-zero if opcode modifies the interrupt flag. + */ +static inline int is_IF_modifier(u8 opcode) +{ + switch(opcode) { + case 0xfa: /* cli */ + case 0xfb: /* sti */ + case 0xcf: /* iret/iretd */ + case 0x9d: /* popf/popfd */ + return 1; + } + return 0; +} + +static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs) +{ + *p->addr = p->opcode; + regs->eip = (unsigned long)p->addr; +} + +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate and they + * remain disabled thorough out this function. + */ +int kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *p; + int ret = 0; + u8 *addr = (u8 *)(regs->eip-1); + + /* We're in an interrupt, but this is clear and BUG()-safe. */ + preempt_disable(); + + /* Check we're not actually recursing */ + if (kprobe_running()) { + /* We *are* holding lock here, so this is safe. + Disarm the probe we just hit, and ignore it. */ + p = get_kprobe(addr); + if (p) { + disarm_kprobe(p, regs); + ret = 1; + } + /* If it's not ours, can't be delete race, (we hold lock). 
*/ + goto no_kprobe; + } + + lock_kprobes(); + p = get_kprobe(addr); + if (!p) { + unlock_kprobes(); + /* Unregistered (on another cpu) after this hit? Ignore */ + if (*addr != BREAKPOINT_INSTRUCTION) + ret = 1; + /* Not one of ours: let kernel handle it */ + goto no_kprobe; + } + + kprobe_status = KPROBE_HIT_ACTIVE; + current_kprobe = p; + kprobe_saved_eflags = kprobe_old_eflags + = (regs->eflags & (TF_MASK|IF_MASK)); + if (is_IF_modifier(p->opcode)) + kprobe_saved_eflags &= ~IF_MASK; + + p->pre_handler(p, regs); + + regs->eflags |= TF_MASK; + regs->eflags &= ~IF_MASK; + + /* We hold lock, now we remove breakpoint and single step. */ + disarm_kprobe(p, regs); + kprobe_status = KPROBE_HIT_SS; + return 1; + +no_kprobe: + preempt_enable_no_resched(); + return ret; +} + +static void rearm_kprobe(struct kprobe *p, struct pt_regs *regs) +{ + regs->eflags &= ~TF_MASK; + *p->addr = BREAKPOINT_INSTRUCTION; +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate and they + * remain disabled thorough out this function. And we hold kprobe lock. + */ +int post_kprobe_handler(struct pt_regs *regs) +{ + if (!kprobe_running()) + return 0; + + if (current_kprobe->post_handler) + current_kprobe->post_handler(current_kprobe, regs, 0); + + /* + * We singlestepped with interrupts disabled. So, the result on + * the stack would be incorrect for "pushfl" instruction. + * Note that regs->esp is actually the top of the stack when the + * trap occurs in kernel space. + */ + if (current_kprobe->opcode == 0x9c) { /* pushfl */ + regs->esp &= ~(TF_MASK | IF_MASK); + regs->esp |= kprobe_old_eflags; + } + + rearm_kprobe(current_kprobe, regs); + regs->eflags |= kprobe_saved_eflags; + + unlock_kprobes(); + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, eflags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->eflags & TF_MASK) + return 0; + + return 1; +} + +/* Interrupts disabled, kprobe_lock held. 
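
Aside (illustrative sketch, not part of the patch; the registration interface and exact handler prototypes are assumed from the rest of the kprobes patch and are not shown in this hunk): kprobe_handler() above runs a probe's pre_handler, single-steps the original instruction with TF set, and post_kprobe_handler() then runs the post_handler. A client module would look roughly like this:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

static int my_pre(struct kprobe *p, struct pt_regs *regs)
{
	printk("probe hit at %p, eip %08lx\n", p->addr, regs->eip);
	return 0;
}

static void my_post(struct kprobe *p, struct pt_regs *regs,
		    unsigned long flags)
{
	printk("stepped past probe at %p\n", p->addr);
}

static struct kprobe kp = {
	.addr = (u8 *) 0xc0123456,	/* hypothetical probed instruction */
	.pre_handler = my_pre,
	.post_handler = my_post,
};

static int __init probe_init(void)
{
	return register_kprobe(&kp);	/* assumed API, not in this hunk */
}

static void __exit probe_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(probe_init);
module_exit(probe_exit);
MODULE_LICENSE("GPL");
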
*/ +int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + if (current_kprobe->fault_handler + && current_kprobe->fault_handler(current_kprobe, regs, trapnr)) + return 1; + + if (kprobe_status & KPROBE_HIT_SS) { + rearm_kprobe(current_kprobe, regs); + regs->eflags |= kprobe_old_eflags; + + unlock_kprobes(); + preempt_enable_no_resched(); + } + return 0; +} diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/nmi.c 900-mjb5/arch/i386/kernel/nmi.c --- 001-bk10/arch/i386/kernel/nmi.c Wed Mar 5 07:36:57 2003 +++ 900-mjb5/arch/i386/kernel/nmi.c Sun Mar 16 13:39:02 2003 @@ -20,11 +20,27 @@ #include #include #include +#include +#include #include #include #include +#ifdef CONFIG_X86_REMOTE_DEBUG +extern gdb_debug_hook * linux_debug_hook; +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ + { \ + if (linux_debug_hook != (gdb_debug_hook *) NULL && !user_mode(regs)) \ + { \ + (*linux_debug_hook)(trapnr, signr, error_code, regs) ; \ + after; \ + } \ + } +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + unsigned int nmi_watchdog = NMI_NONE; static unsigned int nmi_hz = HZ; unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ @@ -361,12 +377,59 @@ void nmi_watchdog_tick (struct pt_regs * sum = irq_stat[cpu].apic_timer_irqs; if (last_irq_sums[cpu] == sum) { +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_spinlock)) +#else + if (kgdb_spinlock) +#endif + { + /* We are inside kgdb, this isn't a stuck cpu */ + alert_counter[cpu] = 0; + } else { +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_nmispinlock)) +#else + if (kgdb_nmispinlock) +#endif + { + if (!procindebug[cpu]) { + procindebug[cpu] = 1; + current->thread.kgdbregs = regs; + while (1) { + /* nothing */ + } + } + return; + } + } +#endif /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... 
*/ alert_counter[cpu]++; if (alert_counter[cpu] == 5*nmi_hz) { +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_SMP + if (spin_trylock(&kgdb_nmispinlock)) +#else + kgdb_nmispinlock = 1; +#endif + { + procindebug[cpu] = 1; + CHK_REMOTE_DEBUG(2,SIGBUS,0,regs,) + } +#ifdef CONFIG_SMP + else { + procindebug[cpu] = 1; + current->thread.kgdbregs = regs; + while (1) { + /* nothing */ + } + } +#endif +#endif spin_lock(&nmi_print_lock); /* * We are in trouble anyway, lets at least try @@ -375,6 +438,7 @@ void nmi_watchdog_tick (struct pt_regs * bust_spinlocks(1); printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip); show_registers(regs); + notify_die(DIE_WATCHDOG, "nmi_watchdog", regs, 0); printk("console shuts up ...\n"); console_silent(); spin_unlock(&nmi_print_lock); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/process.c 900-mjb5/arch/i386/kernel/process.c --- 001-bk10/arch/i386/kernel/process.c Sun Mar 16 13:38:20 2003 +++ 900-mjb5/arch/i386/kernel/process.c Sun Mar 16 13:38:59 2003 @@ -160,7 +160,25 @@ static int __init idle_setup (char *str) __setup("idle=", idle_setup); -void show_regs(struct pt_regs * regs) +void stack_overflow(unsigned long esp, unsigned long eip) +{ + int panicing = ((esp&(THREAD_SIZE-1)) <= STACK_PANIC); + + printk( "esp: 0x%lx masked: 0x%lx STACK_PANIC:0x%x %d %d\n", + esp, (esp&(THREAD_SIZE-1)), STACK_PANIC, (((esp&(THREAD_SIZE-1)) <= STACK_PANIC)), panicing ); + + if (panicing) + print_symbol("stack overflow from %s\n", eip); + else + print_symbol("excessive stack use from %s\n", eip); + printk("esp: %p\n", (void*)esp); + show_trace((void*)esp); + + if (panicing) + panic("stack overflow\n"); +} + +asmlinkage void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; @@ -443,6 +461,7 @@ struct task_struct * __switch_to(struct /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ + next_p->thread_info->irq_stack = prev_p->thread_info->irq_stack; unlazy_fpu(prev_p); /* diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/reboot.c 900-mjb5/arch/i386/kernel/reboot.c --- 001-bk10/arch/i386/kernel/reboot.c Sun Nov 17 20:29:30 2002 +++ 900-mjb5/arch/i386/kernel/reboot.c Sun Mar 16 13:39:02 2003 @@ -8,6 +8,7 @@ #include #include #include +#include /* * Power off function, if any @@ -256,7 +257,8 @@ void machine_restart(char * __unused) * Stop all CPUs and turn off local APICs and the IO-APIC, so * other OSs see a clean IRQ state. 
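
Aside (illustrative sketch, not part of the patch; the THREAD_SIZE and STACK_PANIC values are assumed for the example): stack_overflow() in process.c above treats the low bits of %esp as the room remaining on the current THREAD_SIZE-aligned stack and panics once that drops to STACK_PANIC bytes or less, otherwise only warning about excessive use. The check, stand-alone:

#include <stdio.h>

#define THREAD_SIZE 8192UL	/* assumed */
#define STACK_PANIC  512UL	/* assumed threshold */

int main(void)
{
	unsigned long esp = 0xc75f2180UL;	/* made-up stack pointer */
	unsigned long left = esp & (THREAD_SIZE - 1);

	printf("%lu bytes of stack left: %s\n", left,
	       left <= STACK_PANIC ? "stack overflow" : "excessive use only");
	return 0;
}
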
*/ - smp_send_stop(); + if (notify_die(DIE_STOP,"cpustop",0,0) != NOTIFY_BAD) + smp_send_stop(); disable_IO_APIC(); #endif diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/setup.c 900-mjb5/arch/i386/kernel/setup.c --- 001-bk10/arch/i386/kernel/setup.c Wed Mar 5 07:36:57 2003 +++ 900-mjb5/arch/i386/kernel/setup.c Sun Mar 16 13:39:05 2003 @@ -918,6 +918,9 @@ void __init setup_arch(char **cmdline_p) if (smp_found_config) get_smp_config(); #endif +#ifdef CONFIG_X86_SUMMIT + setup_summit(); +#endif register_memory(max_low_pfn); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/smp.c 900-mjb5/arch/i386/kernel/smp.c --- 001-bk10/arch/i386/kernel/smp.c Wed Mar 5 07:36:57 2003 +++ 900-mjb5/arch/i386/kernel/smp.c Sun Mar 16 13:39:02 2003 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -144,6 +145,13 @@ static inline void __send_IPI_shortcut(u */ cfg = __prepare_ICR(shortcut, vector); + if (vector == DUMP_VECTOR) { + /* + * Setup DUMP IPI to be delivered as an NMI + */ + cfg = (cfg&~APIC_VECTOR_MASK)|APIC_DM_NMI; + } + /* * Send the IPI. The write to APIC_ICR fires this off. */ @@ -305,7 +313,8 @@ static inline void leave_mm (unsigned lo * 2) Leave the mm if we are in the lazy tlb mode. */ -asmlinkage void smp_invalidate_interrupt (void) +struct pt_regs * IRQHANDLER(smp_invalidate_interrupt(struct pt_regs *regs)); +struct pt_regs * smp_invalidate_interrupt(struct pt_regs *regs) { unsigned long cpu; @@ -336,6 +345,7 @@ asmlinkage void smp_invalidate_interrupt out: put_cpu_no_resched(); + return regs; } static void flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, @@ -450,6 +460,11 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, 0, 1, 1); } +void dump_send_ipi(void) +{ + send_IPI_allbutself(DUMP_VECTOR); +} + /* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing @@ -509,10 +524,17 @@ int smp_call_function (void (*func) (voi { struct call_data_struct data; int cpus = num_online_cpus()-1; + int count = 0; + int gdb; - if (!cpus) + if (cpus <= 0) return 0; + gdb = 0; + if (wait == 99) { + wait = 0; + gdb = 1; + } data.func = func; data.info = info; atomic_set(&data.started, 0); @@ -527,18 +549,33 @@ int smp_call_function (void (*func) (voi send_IPI_allbutself(CALL_FUNCTION_VECTOR); /* Wait for response */ - while (atomic_read(&data.started) != cpus) + while (atomic_read(&data.started) != cpus) { + if (gdb) { + if (count++ == 2000000) { + printk("%s: timeout\n", __FUNCTION__); + break; + } + if (count == 1000000) { + printk("looks bad\n"); + printk("cpus=%d, started=%d\n", cpus, + atomic_read(&data.started)); + } + if (count > 1000000) + udelay(1); + } barrier(); + } if (wait) while (atomic_read(&data.finished) != cpus) barrier(); + spin_unlock(&call_lock); return 0; } -static void stop_this_cpu (void * dummy) +void stop_this_cpu (void * dummy) { /* * Remove this CPU: @@ -569,14 +606,17 @@ void smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. 
*/ -asmlinkage void smp_reschedule_interrupt(void) +struct pt_regs * IRQHANDLER(smp_reschedule_interrupt(struct pt_regs *regs)); +struct pt_regs * smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); + return regs; } -asmlinkage void smp_call_function_interrupt(void) +struct pt_regs * IRQHANDLER(smp_call_function_interrupt(struct pt_regs *regs)); +struct pt_regs * smp_call_function_interrupt(struct pt_regs *regs) { - void (*func) (void *info) = call_data->func; + void (*func) (void *info, struct pt_regs *) = (void (*)(void *, struct pt_regs*))call_data->func; void *info = call_data->info; int wait = call_data->wait; @@ -591,12 +631,12 @@ asmlinkage void smp_call_function_interr * At this point the info structure may be out of scope unless wait==1 */ irq_enter(); - (*func)(info); + (*func)(info, regs); irq_exit(); if (wait) { mb(); atomic_inc(&call_data->finished); } + return regs; } - diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/smpboot.c 900-mjb5/arch/i386/kernel/smpboot.c --- 001-bk10/arch/i386/kernel/smpboot.c Sun Mar 16 13:38:20 2003 +++ 900-mjb5/arch/i386/kernel/smpboot.c Sun Mar 16 13:39:02 2003 @@ -48,6 +48,7 @@ #include #include #include +#include #include #include "smpboot_hooks.h" @@ -62,7 +63,7 @@ int smp_num_siblings = 1; int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */ /* Bitmask of currently online CPUs */ -unsigned long cpu_online_map; +unsigned long cpu_online_map = 1; static volatile unsigned long cpu_callin_map; volatile unsigned long cpu_callout_map; @@ -71,6 +72,11 @@ static unsigned long smp_commenced_mask; /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +/* Per CPU interrupt stacks */ +extern union thread_union init_irq_union; +union thread_union *irq_stacks[NR_CPUS] __cacheline_aligned = + { &init_irq_union, }; + /* Set when the idlers are all forked */ int smp_threads_ready; @@ -412,6 +418,8 @@ void __init smp_callin(void) */ smp_store_cpu_info(cpuid); + notify_die(DIE_CPUINIT, "cpuinit", NULL, 0); + disable_APIC_timer(); local_irq_disable(); /* @@ -770,6 +778,28 @@ wakeup_secondary_cpu(int phys_apicid, un } #endif /* WAKE_SECONDARY_VIA_INIT */ +static void __init setup_irq_stack(struct task_struct *p, int cpu) +{ + unsigned long stk; + + stk = __get_free_pages(GFP_KERNEL, THREAD_ORDER); + if (!stk) + panic("I can't seem to allocate my irq stack. Oh well, giving up."); + + irq_stacks[cpu] = (void *)stk; + memset(irq_stacks[cpu], 0, THREAD_SIZE); + irq_stacks[cpu]->thread_info.cpu = cpu; + irq_stacks[cpu]->thread_info.preempt_count = 1; + /* interrupts are not preemptable */ + p->thread_info->irq_stack = &irq_stacks[cpu]->thread_info; + + /* If we want to make the irq stack more than one unit + * deep, we can chain then off of the irq_stack pointer + * here. 
+ */ +} + + extern unsigned long cpu_initialized; static int __init do_boot_cpu(int apicid) @@ -793,6 +823,8 @@ static int __init do_boot_cpu(int apicid idle = fork_by_hand(); if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); + + setup_irq_stack(idle, cpu); /* * We remove it from the pidhash and the runqueue diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/summit.c 900-mjb5/arch/i386/kernel/summit.c --- 001-bk10/arch/i386/kernel/summit.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/arch/i386/kernel/summit.c Sun Mar 16 13:39:05 2003 @@ -0,0 +1,162 @@ +/* + * arch/i386/kernel/summit.c - IBM Summit-Specific Code + * + * Written By: Matthew Dobson, IBM Corporation + * + * Copyright (c) 2003 IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to + * + */ + +#include +#include +#include +#include + +static void __init setup_pci_node_map_for_wpeg(int wpeg_num, struct rio_table_hdr *rth, + struct scal_detail **scal_nodes, struct rio_detail **rio_nodes){ + int twst_num = 0, node = 0, first_bus = 0; + int i, bus, num_busses; + + for(i = 0; i < rth->num_rio_dev; i++){ + if (rio_nodes[i]->node_id == rio_nodes[wpeg_num]->owner_id){ + twst_num = rio_nodes[i]->owner_id; + break; + } + } + if (i == rth->num_rio_dev){ + printk("%s: Couldn't find owner Cyclone for Winnipeg!\n", __FUNCTION__); + return; + } + + for(i = 0; i < rth->num_scal_dev; i++){ + if (scal_nodes[i]->node_id == twst_num){ + node = scal_nodes[i]->node_id; + break; + } + } + if (i == rth->num_scal_dev){ + printk("%s: Couldn't find owner Twister for Cyclone!\n", __FUNCTION__); + return; + } + + switch (rio_nodes[wpeg_num]->type){ + case CompatWPEG: + /* The Compatability Winnipeg controls the legacy busses + (busses 0 & 1), the 66MHz PCI bus [2 slots] (bus 2), + and the "extra" busses in case a PCI-PCI bridge card is + used in either slot (busses 3 & 4): total 5 busses. */ + num_busses = 5; + /* The BIOS numbers the busses starting at 1, and in a + slightly wierd manner. You'll have to trust that + the math used below to determine the number of the + first bus works. */ + first_bus = (rio_nodes[wpeg_num]->first_slot - 1) * 2; + break; + case AltWPEG: + /* The Alternate/Secondary Winnipeg controls the 1st 133MHz + bus [1 slot] & its "extra" bus (busses 0 & 1), the 2nd + 133MHz bus [1 slot] & its "extra" bus (busses 2 & 3), the + 100MHz bus [2 slots] (bus 4), and the "extra" busses for + the 2 100MHz slots (busses 5 & 6): total 7 busses. 
*/ + num_busses = 7; + first_bus = (rio_nodes[wpeg_num]->first_slot * 2) - 1; + break; + case LookOutAWPEG: + case LookOutBWPEG: + printk("%s: LookOut Winnipegs not supported yet!\n", __FUNCTION__); + return; + default: + printk("%s: Unsupported Winnipeg type!\n", __FUNCTION__); + return; + } + + for(bus = first_bus; bus < first_bus + num_busses; bus++) + mp_bus_id_to_node[bus] = node; +} + +static void __init build_detail_arrays(struct rio_table_hdr *rth, + struct scal_detail **sd, struct rio_detail **rd){ + unsigned long ptr; + int i, scal_detail_size, rio_detail_size; + + switch (rth->version){ + default: + printk("%s: Bad Rio Grande Table Version: %d\n", __FUNCTION__, rth->version); + /* Fall through to default to version 2 spec */ + case 2: + scal_detail_size = 11; + rio_detail_size = 13; + break; + case 3: + scal_detail_size = 12; + rio_detail_size = 15; + break; + } + + ptr = (unsigned long)rth + 3; + for(i = 0; i < rth->num_scal_dev; i++) + sd[i] = (struct scal_detail *)(ptr + (scal_detail_size * i)); + + ptr += scal_detail_size * rth->num_scal_dev; + for(i = 0; i < rth->num_rio_dev; i++) + rd[i] = (struct rio_detail *)(ptr + (rio_detail_size * i)); +} + +void __init setup_summit(void) +{ + struct rio_table_hdr *rio_table_hdr = NULL; + struct scal_detail *scal_devs[MAX_NUMNODES]; + struct rio_detail *rio_devs[MAX_NUMNODES*2]; + unsigned long ptr; + unsigned short offset; + int i; + + memset(mp_bus_id_to_node, -1, sizeof(mp_bus_id_to_node)); + + /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */ + ptr = *(unsigned short *)phys_to_virt(0x40Eul); + ptr = (unsigned long)phys_to_virt(ptr << 4); + + offset = 0x180; + while (offset){ + /* The block id is stored in the 2nd word */ + if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){ + /* set the pointer past the offset & block id */ + rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); + break; + } + /* The next offset is stored in the 1st word. 
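
Aside (illustrative sketch, not part of the patch): the comments above ask the reader to trust the bus-numbering math, so here it is stand-alone. A Compatibility Winnipeg's first bus is (first_slot - 1) * 2 and an Alternate Winnipeg's is first_slot * 2 - 1, exactly as computed in setup_pci_node_map_for_wpeg():

#include <stdio.h>

int main(void)
{
	int first_slot = 3;	/* made-up BIOS slot number */

	printf("CompatWPEG, first_slot %d -> first bus %d (owns 5 busses)\n",
	       first_slot, (first_slot - 1) * 2);
	printf("AltWPEG,    first_slot %d -> first bus %d (owns 7 busses)\n",
	       first_slot, first_slot * 2 - 1);
	return 0;
}
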
0 means no more */ + offset = *((unsigned short *)(ptr + offset)); + } + if (!rio_table_hdr){ + printk("%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __FUNCTION__); + return; + } + + /* Deal with the ugly version 2/3 pointer arithmetic */ + build_detail_arrays(rio_table_hdr, scal_devs, rio_devs); + + for(i = 0; i < rio_table_hdr->num_rio_dev; i++) + if (is_WPEG(rio_devs[i]->type)) + /* It's a Winnipeg, it's got PCI Busses */ + setup_pci_node_map_for_wpeg(i, rio_table_hdr, scal_devs, rio_devs); +} diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/traps.c 900-mjb5/arch/i386/kernel/traps.c --- 001-bk10/arch/i386/kernel/traps.c Sun Mar 16 13:38:20 2003 +++ 900-mjb5/arch/i386/kernel/traps.c Sun Mar 16 13:39:02 2003 @@ -25,6 +25,7 @@ #include #include #include +#include #ifdef CONFIG_EISA #include @@ -42,6 +43,7 @@ #include #include #include +#include #include #include @@ -51,6 +53,24 @@ #include #include +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + +#ifdef CONFIG_X86_REMOTE_DEBUG +gdb_debug_hook * linux_debug_hook; +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ + { \ + if (linux_debug_hook != (gdb_debug_hook *) NULL && !user_mode(regs)) \ + { \ + (*linux_debug_hook)(trapnr, signr, error_code, regs) ; \ + after; \ + } \ + } +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + asmlinkage int system_call(void); asmlinkage void lcall7(void); asmlinkage void lcall27(void); @@ -244,17 +264,25 @@ bug: } spinlock_t die_lock = SPIN_LOCK_UNLOCKED; +struct notifier_block *die_chain; void die(const char * str, struct pt_regs * regs, long err) { console_verbose(); + if (notify_die(DIE_DIE, str, regs, err) == NOTIFY_BAD) + goto exit_segv; + spin_lock_irq(&die_lock); bust_spinlocks(1); handle_BUG(regs); printk("%s: %04lx\n", str, err & 0xffff); + CHK_REMOTE_DEBUG(1,SIGTRAP,err,regs,) show_registers(regs); bust_spinlocks(0); spin_unlock_irq(&die_lock); + + notify_die(DIE_OOPS, str, regs, err); + exit_segv: do_exit(SIGSEGV); } @@ -312,6 +340,7 @@ static inline void do_trap(int trapnr, i #define DO_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,)\ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -329,7 +358,9 @@ asmlinkage void do_##name(struct pt_regs #define DO_VM86_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,return)\ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ + return; \ } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ @@ -344,7 +375,6 @@ asmlinkage void do_##name(struct pt_regs } DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) -DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip) @@ -358,6 +388,13 @@ asmlinkage void do_general_protection(st { if (regs->eflags & VM_MASK) goto gp_in_vm86; + + if (kprobe_running() && kprobe_fault_handler(regs, 13)) + return; + + if (notify_die(DIE_PROTFAULT, "general protection", regs, error_code) + == NOTIFY_BAD) + return; if (!(regs->xcs & 3)) goto gp_in_kernel; @@ -372,8 +409,10 @@ gp_in_vm86: return; gp_in_kernel: - if (!fixup_exception(regs)) + if (!fixup_exception(regs)) { + CHK_REMOTE_DEBUG(13,SIGSEGV,error_code,regs,); 
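
Aside (illustrative sketch, not part of the patch; the event codes and the arguments notify_die() forwards come from this patch's kdebug header, which is not shown here): die() and the trap handlers above now publish events on the exported die_chain notifier list, so a debugger or crash-dump module can hook them with the standard notifier calls:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/notifier.h>

extern struct notifier_block *die_chain;	/* EXPORT_SYMBOL_GPL above */

static int my_die_event(struct notifier_block *nb, unsigned long event,
			void *data)
{
	printk("die_chain event %lu\n", event);
	return NOTIFY_OK;		/* let the other handlers run */
}

static struct notifier_block my_die_block = {
	.notifier_call = my_die_event,
};

static int __init hook_init(void)
{
	return notifier_chain_register(&die_chain, &my_die_block);
}

static void __exit hook_exit(void)
{
	notifier_chain_unregister(&die_chain, &my_die_block);
}

module_init(hook_init);
module_exit(hook_exit);
MODULE_LICENSE("GPL");
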
die("general protection fault", regs, error_code); + } } static void mem_parity_error(unsigned char reason, struct pt_regs * regs) @@ -402,6 +441,7 @@ static void io_check_error(unsigned char outb(reason, 0x61); } + static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) { #ifdef CONFIG_MCA @@ -436,10 +476,14 @@ static void default_do_nmi(struct pt_reg unknown_nmi_error(reason, regs); return; } + + if (notify_die(DIE_NMI, "nmi", regs, reason) == NOTIFY_BAD) + return; if (reason & 0x80) mem_parity_error(reason, regs); if (reason & 0x40) io_check_error(reason, regs); + /* * Reassert NMI in case it became active meanwhile * as it's edge-triggered. @@ -482,6 +526,20 @@ void unset_nmi_callback(void) nmi_callback = dummy_nmi_callback; } +asmlinkage int do_int3(struct pt_regs *regs, long error_code) +{ + /* This is an interrupt gate, because kprobes wants interrupts + disabled. Normal trap handlers don't. */ + if (kprobe_handler(regs)) + return 1; + if (notify_die(DIE_INT3, "int3", regs, error_code) == NOTIFY_BAD) + return 1; + + restore_interrupts(regs); + do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); + return 0; +} + /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore @@ -504,7 +562,7 @@ void unset_nmi_callback(void) * find every occurrence of the TF bit that could be saved away even * by user code) */ -asmlinkage void do_debug(struct pt_regs * regs, long error_code) +asmlinkage int do_debug(struct pt_regs * regs, long error_code) { unsigned int condition; struct task_struct *tsk = current; @@ -516,6 +574,18 @@ asmlinkage void do_debug(struct pt_regs if (regs->eflags & X86_EFLAGS_IF) local_irq_enable(); + if (post_kprobe_handler(regs)) + return 1; + + /* Interrupts not disabled for normal trap handling. */ + restore_interrupts(regs); + + if (notify_die(DIE_DEBUG, "debug", regs, error_code) == NOTIFY_BAD) + return 1; + + /* Interrupts not disabled for normal trap handling. */ + restore_interrupts(regs); + /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { if (!tsk->thread.debugreg[7]) @@ -539,8 +609,10 @@ asmlinkage void do_debug(struct pt_regs * allowing programs to debug themselves without the ptrace() * interface. */ +#ifndef CONFIG_X86_REMOTE_DEBUG if ((regs->xcs & 3) == 0) goto clear_TF_reenable; +#endif if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) goto clear_TF; } @@ -552,11 +624,13 @@ asmlinkage void do_debug(struct pt_regs info.si_errno = 0; info.si_code = TRAP_BRKPT; - /* If this is a kernel mode trap, save the user PC on entry to - * the kernel, that's what the debugger can make sense of. - */ - info.si_addr = ((regs->xcs & 3) == 0) ? (void *)tsk->thread.eip : - (void *)regs->eip; + + /* If this is a kernel mode trap, we need to reset db7 to allow us + * to continue sanely */ + if ((regs->xcs & 3) == 0) + goto clear_dr7; + + info.si_addr = (void *)regs->eip; force_sig_info(SIGTRAP, &info, tsk); /* Disable additional traps. 
They'll be re-enabled when @@ -566,17 +640,20 @@ clear_dr7: __asm__("movl %0,%%db7" : /* no output */ : "r" (0)); - return; + CHK_REMOTE_DEBUG(1,SIGTRAP,error_code,regs,) + return 0; debug_vm86: handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - return; + return 0; +#ifndef CONFIG_X86_REMOTE_DEBUG clear_TF_reenable: +#endif set_tsk_thread_flag(tsk, TIF_SINGLESTEP); clear_TF: regs->eflags &= ~TF_MASK; - return; + return 0; } /* @@ -741,6 +818,12 @@ asmlinkage void math_state_restore(struc struct task_struct *tsk = thread->task; clts(); /* Allow maths ops (or we recurse) */ + + if (kprobe_running() && kprobe_fault_handler(®s, 7)) + return; + if (notify_die(DIE_FPUTRAP, "fpu", ®s, 0) == NOTIFY_BAD) + return; + if (!tsk->used_math) init_fpu(tsk); restore_fpu(tsk); @@ -773,6 +856,39 @@ void __init trap_init_f00f_bug(void) } #endif +static inline void get_current_regs(struct pt_regs *regs) +{ + __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs->ebx)); + __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs->ecx)); + __asm__ __volatile__("movl %%edx,%0" : "=m"(regs->edx)); + __asm__ __volatile__("movl %%esi,%0" : "=m"(regs->esi)); + __asm__ __volatile__("movl %%edi,%0" : "=m"(regs->edi)); + __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs->ebp)); + __asm__ __volatile__("movl %%eax,%0" : "=m"(regs->eax)); + __asm__ __volatile__("movl %%esp,%0" : "=m"(regs->esp)); + __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(regs->xss)); + __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(regs->xcs)); + __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(regs->xds)); + __asm__ __volatile__("movw %%es, %%ax;" :"=a"(regs->xes)); + __asm__ __volatile__("pushfl; popl %0" :"=m"(regs->eflags)); + regs->eip = (unsigned long)current_text_addr(); +} + +static int panic_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct pt_regs regs; + get_current_regs(®s); + + return notify_die(DIE_PANIC, (const char *)ptr, ®s, event); +} + +extern struct notifier_block *panic_notifier_list; +static int panic_event(struct notifier_block *, unsigned long, void *); +static struct notifier_block panic_block = { + .notifier_call = panic_event, +}; + #define _set_gate(gate_addr,type,dpl,addr,seg) \ do { \ int __d0, __d1; \ @@ -839,9 +955,9 @@ void __init trap_init(void) #endif set_trap_gate(0,÷_error); - set_intr_gate(1,&debug); + _set_gate(idt_table+1,14,3,&debug,__KERNEL_CS); /* debug trap for kprobes */ set_intr_gate(2,&nmi); - set_system_gate(3,&int3); /* int3-5 can be called from all */ + _set_gate(idt_table+3,14,3,&int3,__KERNEL_CS); /* int3-5 can be called from all */ set_system_gate(4,&overflow); set_system_gate(5,&bounds); set_trap_gate(6,&invalid_op); @@ -870,10 +986,14 @@ void __init trap_init(void) set_call_gate(&default_ldt[0],lcall7); set_call_gate(&default_ldt[4],lcall27); + notify_die(DIE_TRAPINIT, "traps initialized", 0, 0); /* * Should be a barrier for any external CPU state. 
*/ cpu_init(); trap_init_hook(); + + notifier_chain_register(&panic_notifier_list, &panic_block); } + diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/kernel/vm86.c 900-mjb5/arch/i386/kernel/vm86.c --- 001-bk10/arch/i386/kernel/vm86.c Sun Mar 16 13:38:20 2003 +++ 900-mjb5/arch/i386/kernel/vm86.c Sun Mar 16 13:39:06 2003 @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include #include @@ -125,6 +127,7 @@ struct pt_regs * save_v86_state(struct k static void mark_screen_rdonly(struct task_struct * tsk) { + struct ptpage *ptepage; pgd_t *pgd; pmd_t *pmd; pte_t *pte, *mapped; @@ -148,6 +151,8 @@ static void mark_screen_rdonly(struct ta pmd_clear(pmd); goto out; } + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); pte = mapped = pte_offset_map(pmd, 0xA0000); for (i = 0; i < 32; i++) { if (pte_present(*pte)) @@ -155,6 +160,7 @@ static void mark_screen_rdonly(struct ta pte++; } pte_unmap(mapped); + pte_page_unlock(ptepage); out: spin_unlock(&tsk->mm->page_table_lock); preempt_enable(); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/lib/dec_and_lock.c 900-mjb5/arch/i386/lib/dec_and_lock.c --- 001-bk10/arch/i386/lib/dec_and_lock.c Sun Nov 17 20:29:28 2002 +++ 900-mjb5/arch/i386/lib/dec_and_lock.c Sun Mar 16 13:39:06 2003 @@ -10,6 +10,7 @@ #include #include +#ifndef ATOMIC_DEC_AND_LOCK int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { int counter; @@ -38,3 +39,5 @@ slow_path: spin_unlock(lock); return 0; } +#endif + diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/mm/fault.c 900-mjb5/arch/i386/mm/fault.c --- 001-bk10/arch/i386/mm/fault.c Sun Mar 16 13:38:20 2003 +++ 900-mjb5/arch/i386/mm/fault.c Sun Mar 16 13:39:02 2003 @@ -2,6 +2,11 @@ * linux/arch/i386/mm/fault.c * * Copyright (C) 1995 Linus Torvalds + * + * Change History + * + * Tigran Aivazian Remote debugging support. 
+ * */ #include @@ -20,12 +25,17 @@ #include #include /* For unblank_screen() */ #include +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif +#include #include #include #include #include #include +#include extern void die(const char *,struct pt_regs *,long); @@ -161,6 +171,15 @@ asmlinkage void do_page_fault(struct pt_ /* get the address */ __asm__("movl %%cr2,%0":"=r" (address)); + /* for many cases list will be empty, so optimize for that case */ + if (unlikely(die_chain != NULL) + && notify_die(DIE_PAGEFAULT, "page fault", regs, error_code) + == NOTIFY_BAD) + return; + + if (kprobe_running() && kprobe_fault_handler(regs, 14)) + return; + /* It's safe to allow irq's after cr2 has been saved */ if (regs->eflags & X86_EFLAGS_IF) local_irq_enable(); @@ -193,6 +212,15 @@ asmlinkage void do_page_fault(struct pt_ if (in_atomic() || !mm) goto no_context; +#ifdef CONFIG_X86_REMOTE_DEBUG + if (kgdb_memerr_expected) { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs) ; + return; /* return w/modified regs */ + } + } +#endif + down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -291,6 +319,19 @@ bad_area: force_sig_info(SIGSEGV, &info, tsk); return; } + +#ifdef CONFIG_X86_REMOTE_DEBUG + if (kgdb_memerr_expected) { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs); + return; /* Return with modified registers */ + } + } else { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs); + } + } +#endif #ifdef CONFIG_X86_F00F_BUG /* diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/mm/init.c 900-mjb5/arch/i386/mm/init.c --- 001-bk10/arch/i386/mm/init.c Sun Mar 16 13:38:20 2003 +++ 900-mjb5/arch/i386/mm/init.c Sun Mar 16 13:39:02 2003 @@ -164,7 +164,7 @@ static inline int page_kills_ppro(unsign return 0; } -static inline int page_is_ram(unsigned long pagenr) +int page_is_ram(unsigned long pagenr) { int i; @@ -184,6 +184,12 @@ static inline int page_is_ram(unsigned l return 1; } return 0; +} + +/* To enable modules to check if a page is in RAM */ +int pfn_is_ram(unsigned long pfn) +{ + return (page_is_ram(pfn)); } #if CONFIG_HIGHMEM diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/mm/pgtable.c 900-mjb5/arch/i386/mm/pgtable.c --- 001-bk10/arch/i386/mm/pgtable.c Sun Mar 16 13:38:20 2003 +++ 900-mjb5/arch/i386/mm/pgtable.c Sun Mar 16 13:39:06 2003 @@ -146,24 +146,27 @@ pte_t *pte_alloc_one_kernel(struct mm_st return pte; } -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +struct ptpage *pte_alloc_one(struct mm_struct *mm, unsigned long address) { int count = 0; - struct page *pte; + struct ptpage *pte; do { #if CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, 0); + pte = (struct ptpage *)alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, 0); #else - pte = alloc_pages(GFP_KERNEL, 0); + pte = (struct ptpage *)alloc_pages(GFP_KERNEL, 0); #endif - if (pte) - clear_highpage(pte); - else { + if (pte) { + clear_highpage((struct page *)pte); + pte->mapcount = pte->swapcount= 0; + pte->pte.mmdirect = 0; + break; + } else { current->state = TASK_UNINTERRUPTIBLE; schedule_timeout(HZ); } - } while (!pte && (count++ < 10)); + } while (count++ < 10); return pte; } diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/i386/vmlinux.lds.S 900-mjb5/arch/i386/vmlinux.lds.S --- 001-bk10/arch/i386/vmlinux.lds.S Sun Mar 16 13:38:20 2003 +++ 900-mjb5/arch/i386/vmlinux.lds.S Sun Mar 16 13:38:50 2003 @@ -10,7 +10,7 @@ 
ENTRY(startup_32) jiffies = jiffies_64; SECTIONS { - . = 0xC0000000 + 0x100000; + . = __PAGE_OFFSET + 0x100000; /* read-only */ _text = .; /* Text and read-only data */ .text : { diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/s390/boot/Makefile 900-mjb5/arch/s390/boot/Makefile --- 001-bk10/arch/s390/boot/Makefile Mon Dec 16 21:50:39 2002 +++ 900-mjb5/arch/s390/boot/Makefile Sun Mar 16 13:39:02 2003 @@ -17,4 +17,4 @@ $(obj)/listing: vmlinux FORCE install: $(CONFIGURE) $(obj)/image sh -x $(obj)/install.sh $(KERNELRELEASE) $(obj)/image \ - System.map Kerntypes "$(INSTALL_PATH)" + System.map init/kerntypes.o "$(INSTALL_PATH)" diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/s390/boot/install.sh 900-mjb5/arch/s390/boot/install.sh --- 001-bk10/arch/s390/boot/install.sh Sun Nov 17 20:29:48 2002 +++ 900-mjb5/arch/s390/boot/install.sh Sun Mar 16 13:39:02 2003 @@ -16,7 +16,8 @@ # $1 - kernel version # $2 - kernel image file # $3 - kernel map file -# $4 - default install path (blank if root directory) +# $4 - kernel type file +# $5 - default install path (blank if root directory) # # User may have a custom install script @@ -26,13 +27,22 @@ if [ -x /sbin/installkernel ]; then exec # Default install - same as make zlilo -if [ -f $4/vmlinuz ]; then - mv $4/vmlinuz $4/vmlinuz.old +if [ -f $5/vmlinuz ]; then + mv $5/vmlinuz $5/vmlinuz.old fi -if [ -f $4/System.map ]; then - mv $4/System.map $4/System.old +if [ -f $5/System.map ]; then + mv $5/System.map $5/System.old fi -cat $2 > $4/vmlinuz -cp $3 $4/System.map +if [ -f $5/Kerntypes ]; then + mv $5/Kerntypes $5/Kerntypes.old +fi + +cat $2 > $5/vmlinuz +cp $3 $5/System.map + +# copy the kernel type file if it exists +if [ -f $4 ]; then + cp $4 $5/Kerntypes +fi diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/s390x/boot/Makefile 900-mjb5/arch/s390x/boot/Makefile --- 001-bk10/arch/s390x/boot/Makefile Mon Dec 16 21:50:39 2002 +++ 900-mjb5/arch/s390x/boot/Makefile Sun Mar 16 13:39:02 2003 @@ -18,4 +18,4 @@ $(obj)/listing: vmlinux FORCE install: $(CONFIGURE) $(obj)/image sh -x $(obj)/install.sh $(KERNELRELEASE) $(obj)/image \ - System.map Kerntypes "$(INSTALL_PATH)" + System.map kerntypes.o "$(INSTALL_PATH)" diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/s390x/boot/install.sh 900-mjb5/arch/s390x/boot/install.sh --- 001-bk10/arch/s390x/boot/install.sh Sun Nov 17 20:29:57 2002 +++ 900-mjb5/arch/s390x/boot/install.sh Sun Mar 16 13:39:02 2003 @@ -16,7 +16,8 @@ # $1 - kernel version # $2 - kernel image file # $3 - kernel map file -# $4 - default install path (blank if root directory) +# $4 - kernel type file +# $5 - default install path (blank if root directory) # # User may have a custom install script @@ -26,13 +27,22 @@ if [ -x /sbin/installkernel ]; then exec # Default install - same as make zlilo -if [ -f $4/vmlinuz ]; then - mv $4/vmlinuz $4/vmlinuz.old +if [ -f $5/vmlinuz ]; then + mv $5/vmlinuz $5/vmlinuz.old fi -if [ -f $4/System.map ]; then - mv $4/System.map $4/System.old +if [ -f $5/System.map ]; then + mv $5/System.map $5/System.old fi -cat $2 > $4/vmlinuz -cp $3 $4/System.map +if [ -f $5/Kerntypes ]; then + mv $5/Kerntypes $5/Kerntypes.old +fi + +cat $2 > $5/vmlinuz +cp $3 $5/System.map + +# copy the kernel type file if it exists +if [ -f $4 ]; then + cp $4 $5/Kerntypes +fi diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/sparc64/kernel/rtrap.S 900-mjb5/arch/sparc64/kernel/rtrap.S --- 001-bk10/arch/sparc64/kernel/rtrap.S Sun Nov 17 20:29:45 2002 +++ 900-mjb5/arch/sparc64/kernel/rtrap.S Sun Mar 16 18:34:49 2003 @@ 
-15,6 +15,10 @@ #include #include +#ifndef CONFIG_KGDB_THREAD +#define user_schedule schedule +#endif + #define RTRAP_PSTATE (PSTATE_RMO|PSTATE_PEF|PSTATE_PRIV|PSTATE_IE) #define RTRAP_PSTATE_IRQOFF (PSTATE_RMO|PSTATE_PEF|PSTATE_PRIV) #define RTRAP_PSTATE_AG_IRQOFF (PSTATE_RMO|PSTATE_PEF|PSTATE_PRIV|PSTATE_AG) @@ -33,7 +37,7 @@ __handle_softirq: ba,a,pt %xcc, __handle_softirq_continue nop __handle_preemption: - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate ba,pt %xcc, __handle_preemption_continue wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate @@ -48,7 +52,7 @@ __handle_user_windows: be,pt %xcc, 1f nop - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate ldx [%g6 + TI_FLAGS], %l0 @@ -92,7 +96,7 @@ __handle_perfctrs: be,pt %xcc, 1f nop - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate ldx [%g6 + TI_FLAGS], %l0 @@ -271,7 +275,7 @@ to_kernel: brnz %l5, kern_fpucheck sethi %hi(PREEMPT_ACTIVE), %l6 stw %l6, [%g6 + TI_PRE_COUNT] - call schedule + call user_schedule nop ba,pt %xcc, rtrap stw %g0, [%g6 + TI_PRE_COUNT] diff -urpN -X /home/fletch/.diff.exclude 001-bk10/arch/x86_64/kernel/entry.S 900-mjb5/arch/x86_64/kernel/entry.S --- 001-bk10/arch/x86_64/kernel/entry.S Tue Feb 25 23:03:45 2003 +++ 900-mjb5/arch/x86_64/kernel/entry.S Sun Mar 16 18:34:49 2003 @@ -46,6 +46,10 @@ #define PDAREF(field) %gs:field +#ifndef CONFIG_KGDB_THREAD +#define user_schedule schedule +#endif + #ifdef CONFIG_PREEMPT #define preempt_stop cli #else @@ -187,7 +191,7 @@ sysret_careful: jnc sysret_signal sti pushq %rdi - call schedule + call user_schedule popq %rdi jmp sysret_check @@ -256,7 +260,7 @@ int_careful: jnc int_very_careful sti pushq %rdi - call schedule + call user_schedule popq %rdi jmp int_with_check @@ -420,7 +424,7 @@ retint_careful: jnc retint_signal sti pushq %rdi - call schedule + call user_schedule popq %rdi GET_THREAD_INFO(%rcx) cli @@ -454,7 +458,7 @@ retint_kernel: jc retint_restore_args movl $PREEMPT_ACTIVE,threadinfo_preempt_count(%rcx) sti - call schedule + call user_schedule cli GET_THREAD_INFO(%rcx) movl $0,threadinfo_preempt_count(%rcx) diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/Makefile 900-mjb5/drivers/Makefile --- 001-bk10/drivers/Makefile Sun Mar 16 13:38:20 2003 +++ 900-mjb5/drivers/Makefile Sun Mar 16 13:39:02 2003 @@ -50,3 +50,4 @@ obj-$(CONFIG_ISDN_BOOL) += isdn/ obj-$(CONFIG_MCA) += mca/ obj-$(CONFIG_EISA) += eisa/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_CRASH_DUMP) += dump/ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/char/Makefile 900-mjb5/drivers/char/Makefile --- 001-bk10/drivers/char/Makefile Sun Mar 16 13:38:20 2003 +++ 900-mjb5/drivers/char/Makefile Sun Mar 16 13:38:57 2003 @@ -25,6 +25,7 @@ obj-$(CONFIG_COMPUTONE) += ip2.o ip2main obj-$(CONFIG_RISCOM8) += riscom8.o obj-$(CONFIG_ISI) += isicom.o obj-$(CONFIG_ESPSERIAL) += esp.o +obj-$(CONFIG_X86_REMOTE_DEBUG) += gdbserial.o obj-$(CONFIG_SYNCLINK) += synclink.o obj-$(CONFIG_SYNCLINKMP) += synclinkmp.o obj-$(CONFIG_N_HDLC) += n_hdlc.o diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/char/gdbserial.c 900-mjb5/drivers/char/gdbserial.c --- 001-bk10/drivers/char/gdbserial.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/drivers/char/gdbserial.c Sun Mar 16 13:38:57 2003 @@ -0,0 +1,274 @@ +/* + * Serial interface GDB stub + * + * Written (hacked together) by David Grothe (dave@gcom.com) + * + * Modified by Scott Foehner (sfoehner@engr.sgi.com) to allow connect + * on boot-up + 
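+ *
+ * Receive-path overview (a sketch of what the code below implements):
+ * gdb_interrupt() queues incoming bytes into gdb_buf[], a ring buffer
+ * whose size must be a power of two so the indices can wrap with a
+ * simple mask instead of a modulo:
+ *
+ *        gdb_buf[gdb_buf_in_inx++] = chr;
+ *        gdb_buf_in_inx &= (GDB_BUF_SIZE - 1);   -- wrap, needs 2^n size
+ *
+ * getDebugChar() drains this buffer first and only then polls the UART
+ * data register directly.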
* + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#undef PRNT /* define for debug printing */ + +#define GDB_BUF_SIZE 512 /* power of 2, please */ + +static char gdb_buf[GDB_BUF_SIZE]; +static int gdb_buf_in_inx; +static atomic_t gdb_buf_in_cnt; +static int gdb_buf_out_inx; + +extern void set_debug_traps(void); /* GDB routine */ +extern int gdb_serial_setup(int ttyS, int baud, int *port, int *irq); +extern void shutdown_for_gdb(struct async_struct *info); + /* in serial.c */ + +int gdb_irq; +int gdb_port; +int gdb_ttyS = 1; /* Default: ttyS1 */ +int gdb_baud = 38400; +int gdb_enter = 0; /* Default: do not do gdb_hook on boot */ +int gdb_initialized = 0; + +static int initialized = -1; + +/* + * Get a byte from the hardware data buffer and return it + */ +static int +read_data_bfr(void) +{ + if (inb(gdb_port + UART_LSR) & UART_LSR_DR) + return (inb(gdb_port + UART_RX)); + + return (-1); + +} /* read_data_bfr */ + +/* + * Get a char if available, return -1 if nothing available. + * Empty the receive buffer first, then look at the interface hardware. + */ +static int +read_char(void) +{ + if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ + int chr; + + chr = gdb_buf[gdb_buf_out_inx++]; + gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); + atomic_dec(&gdb_buf_in_cnt); + return (chr); + } + + return (read_data_bfr()); /* read from hardware */ + +} /* read_char */ + +/* + * Wait until the interface can accept a char, then write it. + */ +static void +write_char(int chr) +{ + while (!(inb(gdb_port + UART_LSR) & UART_LSR_THRE)) ; + + outb(chr, gdb_port + UART_TX); + +} /* write_char */ + +/* + * This is the receiver interrupt routine for the GDB stub. + * It will receive a limited number of characters of input + * from the gdb host machine and save them up in a buffer. + * + * When the gdb stub routine getDebugChar() is called it + * draws characters out of the buffer until it is empty and + * then reads directly from the serial port. + * + * We do not attempt to write chars from the interrupt routine + * since the stubs do all of that via putDebugChar() which + * writes one byte after waiting for the interface to become + * ready. + * + * The debug stubs like to run with interrupts disabled since, + * after all, they run as a consequence of a breakpoint in + * the kernel. + * + * Perhaps someone who knows more about the tty driver than I + * care to learn can make this work for any low level serial + * driver. + */ +static void +gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + int chr; + int iir; + + do { + chr = read_data_bfr(); + iir = inb(gdb_port + UART_IIR); +#ifdef PRNT + printk("gdb_interrupt: chr=%02x '%c' after read iir=%02x\n", + chr, chr > ' ' && chr < 0x7F ? chr : ' ', iir); +#endif + if (chr < 0) + continue; + + if (chr == 3) { /* Ctrl-C means remote interrupt */ + breakpoint(); + continue; + } + + if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { /* buffer overflow, clear it */ + gdb_buf_in_inx = 0; + atomic_set(&gdb_buf_in_cnt, 0); + gdb_buf_out_inx = 0; + break; + } + + gdb_buf[gdb_buf_in_inx++] = chr; + gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); + atomic_inc(&gdb_buf_in_cnt); + } + while (iir & UART_IIR_RDI); + +} /* gdb_interrupt */ + +/* + * Just a NULL routine for testing. 
+ */
+void
+gdb_null(void)
+{
+} /* gdb_null */
+
+extern int serial8250_init(void);
+
+int
+gdb_hook(void)
+{
+        int retval;
+
+#ifdef CONFIG_SMP
+        if (NR_CPUS > KGDB_MAX_NO_CPUS) {
+                printk
+                    ("kgdb: too many cpus. Cannot enable debugger with more than 8 cpus\n");
+                return (-1);
+        }
+#endif
+
+        /*
+         * Call first time just to get the ser ptr
+         */
+
+        serial8250_init();
+
+        if (gdb_serial_setup(gdb_ttyS, gdb_baud, &gdb_port, &gdb_irq)) {
+                printk("gdb_serial_setup() error");
+                return (-1);
+        }
+
+        retval = request_irq(gdb_irq,
+                             gdb_interrupt, SA_INTERRUPT, "GDB-stub", NULL);
+        if (retval == 0)
+                initialized = 1;
+        else {
+                initialized = 0;
+                printk("gdb_hook: request_irq(irq=%d) failed: %d\n", gdb_irq,
+                       retval);
+        }
+
+        /*
+         * Call GDB routine to setup the exception vectors for the debugger
+         */
+        set_debug_traps();
+
+        /*
+         * Call the breakpoint() routine in GDB to start the debugging
+         * session.
+         */
+        printk("Waiting for connection from remote gdb... ");
+        breakpoint();
+        gdb_null();
+
+        printk("Connected.\n");
+
+        gdb_initialized = 1;
+        return (0);
+
+} /* gdb_hook */
+
+/*
+ * getDebugChar
+ *
+ * This is a GDB stub routine. It waits for a character from the
+ * serial interface and then returns it. If there is no serial
+ * interface connection then it returns a bogus value which will
+ * almost certainly cause the system to hang.
+ */
+int
+getDebugChar(void)
+{
+        volatile int chr;
+
+#ifdef PRNT
+        printk("getDebugChar: ");
+#endif
+
+        while ((chr = read_char()) < 0)
+                touch_nmi_watchdog();
+
+#ifdef PRNT
+        printk("%c\n", chr > ' ' && chr < 0x7F ? chr : ' ');
+#endif
+        return (chr);
+
+} /* getDebugChar */
+
+/*
+ * putDebugChar
+ *
+ * This is a GDB stub routine. It waits until the interface is ready
+ * to transmit a char and then sends it. If there is no serial
+ * interface connection then it simply returns to its caller, having
+ * pretended to send the char.
+ */
+void
+putDebugChar(int chr)
+{
+#ifdef PRNT
+        printk("putDebugChar: chr=%02x '%c'\n", chr,
+               chr > ' ' && chr < 0x7F ?
chr : ' '); +#endif + + write_char(chr); /* this routine will wait */ + +} /* putDebugChar */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/char/sysrq.c 900-mjb5/drivers/char/sysrq.c --- 001-bk10/drivers/char/sysrq.c Tue Feb 25 23:03:46 2003 +++ 900-mjb5/drivers/char/sysrq.c Sun Mar 16 13:38:57 2003 @@ -107,6 +107,18 @@ static struct sysrq_key_op sysrq_reboot_ .action_msg = "Resetting", }; +#ifdef CONFIG_X86_REMOTE_DEBUG +static void sysrq_handle_gdb(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) { + int gdb_hook(void); + gdb_hook(); +} +static struct sysrq_key_op sysrq_gdb_op = { + handler: sysrq_handle_gdb, + help_msg: "Gdb", + action_msg: "Entering debugger", +}; +#endif /* SYNC SYSRQ HANDLERS BLOCK */ @@ -352,7 +364,11 @@ static struct sysrq_key_op *sysrq_key_ta /* d */ NULL, /* e */ &sysrq_term_op, /* f */ NULL, +#ifdef CONFIG_X86_REMOTE_DEBUG +/* g */ &sysrq_gdb_op, +#else /* CONFIG_X86_REMOTE_DEBUG */ /* g */ NULL, +#endif /* CONFIG_X86_REMOTE_DEBUG */ /* h */ NULL, /* i */ &sysrq_kill_op, /* j */ NULL, diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/char/tty_io.c 900-mjb5/drivers/char/tty_io.c --- 001-bk10/drivers/char/tty_io.c Sun Mar 16 13:38:20 2003 +++ 900-mjb5/drivers/char/tty_io.c Sun Mar 16 13:39:04 2003 @@ -91,6 +91,9 @@ #include #include #include +#ifdef CONFIG_GDB_CONSOLE +#include +#endif #include #include @@ -1034,7 +1037,9 @@ static void release_mem(struct tty_struc } o_tty->magic = 0; (*o_tty->driver.refcount)--; + file_list_lock(); list_del(&o_tty->tty_files); + file_list_unlock(); free_tty_struct(o_tty); } @@ -1046,7 +1051,9 @@ static void release_mem(struct tty_struc } tty->magic = 0; (*tty->driver.refcount)--; + file_list_lock(); list_del(&tty->tty_files); + file_list_unlock(); module_put(tty->driver.owner); free_tty_struct(tty); } @@ -2219,6 +2226,9 @@ void __init console_init(void) (*call)(); call++; } +#ifdef CONFIG_GDB_CONSOLE + gdb_console_init(); +#endif } static struct tty_driver dev_tty_driver, dev_syscons_driver; diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/dump/Makefile 900-mjb5/drivers/dump/Makefile --- 001-bk10/drivers/dump/Makefile Wed Dec 31 16:00:00 1969 +++ 900-mjb5/drivers/dump/Makefile Sun Mar 16 13:39:02 2003 @@ -0,0 +1,14 @@ +# +# Makefile for the dump device drivers. +# +export-objs := dump_setup.o + +dump-y := dump_setup.o dump_fmt.o dump_filters.o dump_scheme.o dump_execute.o +dump-$(CONFIG_X86) += dump_i386.o +dump-objs += $(dump-y) + +obj-$(CONFIG_CRASH_DUMP) += dump.o +obj-$(CONFIG_CRASH_DUMP_BLOCKDEV) += dump_blockdev.o +obj-$(CONFIG_CRASH_DUMP_NETDEV) += dump_netdev.o +obj-$(CONFIG_CRASH_DUMP_COMPRESS_RLE) += dump_rle.o +obj-$(CONFIG_CRASH_DUMP_COMPRESS_GZIP) += dump_gzip.o diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/dump/dump_blockdev.c 900-mjb5/drivers/dump/dump_blockdev.c --- 001-bk10/drivers/dump/dump_blockdev.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/drivers/dump/dump_blockdev.c Sun Mar 16 13:39:02 2003 @@ -0,0 +1,447 @@ +/* + * Implements the dump driver interface for saving a dump to + * a block device through the kernel's generic low level block i/o + * routines. + * + * Started: June 2002 - Mohamed Abbas + * Moved original lkcd kiobuf dump i/o code from dump_base.c + * to use generic dump device interfaces + * + * Sept 2002 - Bharata B. 
Rao + * Convert dump i/o to directly use bio instead of kiobuf for 2.5 + * + * Oct 2002 - Suparna Bhattacharya + * Rework to new dumpdev.h structures, implement open/close/ + * silence, misc fixes (blocknr removal, bio_add_page usage) + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "dump_methods.h" + +extern void *dump_page_buf; + +/* The end_io callback for dump i/o completion */ +static int +dump_bio_end_io(struct bio *bio, unsigned int bytes_done, int error) +{ + struct dump_blockdev *dump_bdev; + + if (bio->bi_size) { + /* some bytes still left to transfer */ + return 1; /* not complete */ + } + + dump_bdev = (struct dump_blockdev *)bio->bi_private; + if (error) { + printk("IO error while writing the dump, aborting\n"); + } + + dump_bdev->err = error; + + /* no wakeup needed, since caller polls for completion */ + return 0; +} + +/* Check if the dump bio is already mapped to the specified buffer */ +static int +dump_block_map_valid(struct dump_blockdev *dev, struct page *page, + int len) +{ + struct bio *bio = dev->bio; + + if (!bio->bi_vcnt) + return 0; /* first time, not mapped */ + + + if ((bio_page(bio) != page) || (len != bio->bi_vcnt << PAGE_SHIFT)) + return 0; /* buffer not mapped */ + + /* quick check to decide if we need to redo bio_add_page */ + if (bdev_get_queue(bio->bi_bdev)->merge_bvec_fn) + return 0; /* device may have other restrictions */ + + return 1; /* already mapped */ +} + +/* + * Set up the dump bio for i/o from the specified buffer + * Return value indicates whether the full buffer could be mapped or not + */ +static int +dump_block_map(struct dump_blockdev *dev, void *buf, int len) +{ + struct page *page = virt_to_page(buf); + struct bio *bio = dev->bio; + unsigned long bsize = 0; + + bio->bi_bdev = dev->bdev; + bio->bi_sector = (dev->start_offset + dev->ddev.curr_offset) >> 9; + bio->bi_idx = 0; /* reset index to the beginning */ + + if (dump_block_map_valid(dev, page, len)) { + /* already mapped and usable rightaway */ + bio->bi_size = len; /* reset size to the whole bio */ + } else { + /* need to map the bio */ + bio->bi_size = 0; + bio->bi_vcnt = 0; + bsize = bdev_hardsect_size(bio->bi_bdev); + + /* first a few sanity checks */ + if (len < bsize) { + printk("map: len less than hardsect size \n"); + return -EINVAL; + } + + if ((unsigned long)buf & bsize) { + printk("map: not aligned \n"); + return -EINVAL; + } + + /* assume contig. page aligned low mem buffer( no vmalloc) */ + if ((page_address(page) != buf) || (len & (PAGE_SIZE - 1))) { + printk("map: invalid buffer alignment!\n"); + return -EINVAL; + } + /* finally we can go ahead and map it */ + while (bio->bi_size < len) + if (bio_add_page(bio, page++, PAGE_SIZE, 0) == 0) { + break; + } + + bio->bi_end_io = dump_bio_end_io; + bio->bi_private = dev; + } + + if (bio->bi_size != len) { + printk("map: bio size = %d not enough for len = %d!\n", + bio->bi_size, len); + return -E2BIG; + } + return 0; +} + +static void +dump_free_bio(struct bio *bio) +{ + if (bio) + kfree(bio->bi_io_vec); + kfree(bio); +} + +/* + * Prepares the dump device so we can take a dump later. + * The caller is expected to have filled up the kdev_id field in the + * block dump dev structure. 
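+ *
+ * The 'arg' parameter is the raw dev_t of the target block device,
+ * presumably handed down from the dump configuration path (which is not
+ * part of this file). Roughly, the open sequence below is:
+ *
+ *        bdev = bdget((dev_t)arg);                -- take a reference
+ *        blkdev_get(bdev, O_RDWR | O_LARGEFILE, 0, BDEV_RAW);
+ *        dump_block_map(...);                     -- pre-map the dump buffer
+ *
+ * with the error paths unwinding those steps in reverse order.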
+ * + * At dump time when dump_block_write() is invoked it will be too + * late to recover, so as far as possible make sure obvious errors + * get caught right here and reported back to the caller. + */ +static int +dump_block_open(struct dump_dev *dev, unsigned long arg) +{ + struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); + struct block_device *bdev; + int retval = 0; + struct bio_vec *bvec; + + /* make sure this is a valid block device */ + if (!arg) { + retval = -EINVAL; + goto err; + } + + /* get a corresponding block_dev struct for this */ + bdev = bdget((dev_t)arg); + if (!bdev) { + retval = -ENODEV; + goto err; + } + + /* get the block device opened */ + if ((retval = blkdev_get(bdev, O_RDWR | O_LARGEFILE, 0, BDEV_RAW))) { + goto err1; + } + + if ((dump_bdev->bio = kmalloc(sizeof(struct bio), GFP_KERNEL)) + == NULL) { + printk("Cannot allocate bio\n"); + retval = -ENOMEM; + goto err2; + } + + bio_init(dump_bdev->bio); + + if ((bvec = kmalloc(sizeof(struct bio_vec) * + (DUMP_BUFFER_SIZE >> PAGE_SHIFT), GFP_KERNEL)) == NULL) { + retval = -ENOMEM; + goto err3; + } + + /* assign the new dump dev structure */ + dump_bdev->kdev_id = to_kdev_t((dev_t)arg); + dump_bdev->bdev = bdev; + + /* make a note of the limit */ + dump_bdev->limit = bdev->bd_inode->i_size; + + /* now make sure we can map the dump buffer */ + dump_bdev->bio->bi_io_vec = bvec; + dump_bdev->bio->bi_max_vecs = DUMP_BUFFER_SIZE >> PAGE_SHIFT; + + retval = dump_block_map(dump_bdev, dump_config.dumper->dump_buf, + DUMP_BUFFER_SIZE); + + if (retval) { + printk("open: dump_block_map failed, ret %d\n", retval); + goto err3; + } + + printk("Block device (%d,%d) successfully configured for dumping\n", + major(dump_bdev->kdev_id), + minor(dump_bdev->kdev_id)); + + + /* after opening the block device, return */ + return retval; + +err3: dump_free_bio(dump_bdev->bio); + dump_bdev->bio = NULL; +err2: if (bdev) blkdev_put(bdev, BDEV_RAW); + goto err; +err1: if (bdev) bdput(bdev); + dump_bdev->bdev = NULL; +err: return retval; +} + +/* + * Close the dump device and release associated resources + * Invoked when unconfiguring the dump device. + */ +static int +dump_block_release(struct dump_dev *dev) +{ + struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); + + /* release earlier bdev if present */ + if (dump_bdev->bdev) { + blkdev_put(dump_bdev->bdev, BDEV_RAW); + dump_bdev->bdev = NULL; + } + + dump_free_bio(dump_bdev->bio); + dump_bdev->bio = NULL; + + return 0; +} + + +/* + * Prepare the dump device for use (silence any ongoing activity + * and quiesce state) when the system crashes. + */ +static int +dump_block_silence(struct dump_dev *dev) +{ + struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); + + /* For now we assume we have the device to ourselves */ + /* Just a quick sanity check */ + if (!blk_queue_empty(bdev_get_queue(dump_bdev->bdev))) { + /* i/o in flight - safer to quit */ + return -EBUSY; + } + + /* + * Move to a softer level of silencing where no spin_lock_irqs + * are held on other cpus + */ + dump_silence_level = DUMP_SOFT_SPIN_CPUS; + + __dump_irq_enable(); + + printk("Dumping to block device (%d,%d) on CPU %d ...\n", + major(dump_bdev->kdev_id), minor(dump_bdev->kdev_id), + smp_processor_id()); + + return 0; +} + +/* + * Invoked when dumping is done. This is the time to put things back + * (i.e. undo the effects of dump_block_silence) so the device is + * available for normal use. 
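+ *
+ * In other words, resume is the inverse of dump_block_silence(): silence
+ * raises dump_silence_level and calls __dump_irq_enable() so block layer
+ * softirqs can still make progress, and resume undoes that through
+ * __dump_irq_restore(). Roughly:
+ *
+ *        dump_block_silence(dev);       -- before writing the dump
+ *        ... dump_block_write() / dump_block_ready() loop ...
+ *        dump_block_resume(dev);        -- after the dump completes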
+ */ +static int +dump_block_resume(struct dump_dev *dev) +{ + __dump_irq_restore(); + return 0; +} + + +/* + * Seek to the specified offset in the dump device. + * Makes sure this is a valid offset, otherwise returns an error. + */ +static int +dump_block_seek(struct dump_dev *dev, loff_t off) +{ + struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); + loff_t offset = off + dump_bdev->start_offset; + + if (offset & ( PAGE_SIZE - 1)) { + printk("seek: non-page aligned\n"); + return -EINVAL; + } + + if (offset & (bdev_hardsect_size(dump_bdev->bdev) - 1)) { + printk("seek: not sector aligned \n"); + return -EINVAL; + } + + if (offset > dump_bdev->limit) { + printk("seek: not enough space left on device!\n"); + return -ENOSPC; + } + dev->curr_offset = off; + return 0; +} + +/* + * Write out a buffer after checking the device limitations, + * sector sizes, etc. Assumes the buffer is in directly mapped + * kernel address space (not vmalloc'ed). + * + * Returns: number of bytes written or -ERRNO. + */ +static int +dump_block_write(struct dump_dev *dev, void *buf, + unsigned long len) +{ + struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); + loff_t offset = dev->curr_offset + dump_bdev->start_offset; + int retval = -ENOSPC; + + if (offset >= dump_bdev->limit) { + printk("write: not enough space left on device!\n"); + goto out; + } + + /* don't write more blocks than our max limit */ + if (offset + len > dump_bdev->limit) + len = dump_bdev->limit - offset; + + + retval = dump_block_map(dump_bdev, buf, len); + if (retval){ + printk("write: dump_block_map failed! err %d\n", retval); + goto out; + } + + /* + * Write out the data to disk. + * Assumes the entire buffer mapped to a single bio, which we can + * submit and wait for io completion. In the future, may consider + * increasing the dump buffer size and submitting multiple bio s + * for better throughput. 
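+ *
+ * Completion protocol (as implemented here and in dump_bio_end_io()):
+ *
+ *        dump_bdev->err = -EAGAIN;      -- mark the i/o as outstanding
+ *        submit_bio(WRITE, dump_bdev->bio);
+ *        -- later, dump_block_ready() polls dump_bdev->err, unplugging
+ *        -- the queue until the end_io callback has overwritten it with
+ *        -- the final status (0 or a negative error).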
+ */ + dump_bdev->err = -EAGAIN; + submit_bio(WRITE, dump_bdev->bio); + + dump_bdev->ddev.curr_offset += len; + retval = len; + out: + return retval; +} + +/* + * Name: dump_block_ready() + * Func: check if the last dump i/o is over and ready for next request + */ +static int +dump_block_ready(struct dump_dev *dev, void *buf) +{ + struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); + request_queue_t *q = bdev_get_queue(dump_bdev->bio->bi_bdev); + + /* check for io completion */ + if (dump_bdev->err == -EAGAIN) { + q->unplug_fn(q); + return -EAGAIN; + } + + if (dump_bdev->err) { + printk("dump i/o err\n"); + return dump_bdev->err; + } + + return 0; +} + + +struct dump_dev_ops dump_blockdev_ops = { + .open = dump_block_open, + .release = dump_block_release, + .silence = dump_block_silence, + .resume = dump_block_resume, + .seek = dump_block_seek, + .write = dump_block_write, + /* .read not implemented */ + .ready = dump_block_ready +}; + +static struct dump_blockdev default_dump_blockdev = { + .ddev = {.type_name = "blockdev", .ops = &dump_blockdev_ops, + .curr_offset = 0}, + /* + * leave enough room for the longest swap header possibly written + * written by mkswap (likely the largest page size supported by + * the arch + */ + .start_offset = DUMP_HEADER_OFFSET, + .err = 0 + /* assume the rest of the fields are zeroed by default */ +}; + +struct dump_blockdev *dump_blockdev = &default_dump_blockdev; + +static int __init +dump_blockdev_init(void) +{ + if (dump_register_device(&dump_blockdev->ddev) < 0) { + printk("block device driver registration failed\n"); + return -1; + } + + printk("block device driver for LKCD registered\n"); + return 0; +} + +static void __exit +dump_blockdev_cleanup(void) +{ + dump_unregister_device(&dump_blockdev->ddev); + printk("block device driver for LKCD unregistered\n"); +} + +MODULE_AUTHOR("LKCD Development Team "); +MODULE_DESCRIPTION("Block Dump Driver for Linux Kernel Crash Dump (LKCD)"); +MODULE_LICENSE("GPL"); + +module_init(dump_blockdev_init); +module_exit(dump_blockdev_cleanup); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/dump/dump_execute.c 900-mjb5/drivers/dump/dump_execute.c --- 001-bk10/drivers/dump/dump_execute.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/drivers/dump/dump_execute.c Sun Mar 16 13:39:02 2003 @@ -0,0 +1,124 @@ +/* + * The file has the common/generic dump execution code + * + * Started: Oct 2002 - Suparna Bhattacharya + * Split and rewrote high level dump execute code to make use + * of dump method interfaces. + * + * Derived from original code in dump_base.c created by + * Matt Robinson ) + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * Assumes dumper and dump config settings are in place + * (invokes corresponding dumper specific routines as applicable) + * + * This code is released under version 2 of the GNU GPL. 
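+ *
+ * Overall sequence implemented below (each step is a thin wrapper around
+ * the currently configured dumper's methods):
+ *
+ *        dump_generic_execute()
+ *            dump_configure_header()         -- snapshot regs and tasks
+ *            notifier_call_chain(DUMP_BEGIN)
+ *            dump_begin()                    -- silence device, write header
+ *            dump_sequencer()                -- iterate over and save data
+ *            dump_complete()                 -- end marker, header, resume
+ *            notifier_call_chain(DUMP_END)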
+ */ +#include +#include +#include +#include "dump_methods.h" + +extern int dump_device; + +struct notifier_block *dump_notifier_list; /* dump started/ended callback */ + +/* Dump progress indicator */ +void +dump_speedo(int i) +{ + static const char twiddle[4] = { '|', '\\', '-', '/' }; + printk("%c\b", twiddle[i&3]); +} + +/* Make the device ready and write out the header */ +int dump_begin(void) +{ + int err = 0; + + /* dump_dev = dump_config.dumper->dev; */ + dumper_reset(); + if ((err = dump_dev_silence())) { + /* quiesce failed, can't risk continuing */ + /* Todo/Future: switch to alternate dump scheme if possible */ + printk("dump silence dev failed ! error %d\n", err); + return err; + } + + pr_debug("Writing dump header\n"); + if ((err = dump_update_header())) { + printk("dump update header failed ! error %d\n", err); + dump_dev_resume(); + return err; + } + + dump_config.dumper->curr_offset = DUMP_BUFFER_SIZE; + + return 0; +} + +/* + * Write the dump terminator, a final header update and let go of + * exclusive use of the device for dump. + */ +int dump_complete(void) +{ + int ret = 0; + + if (dump_config.level != DUMP_LEVEL_HEADER) { + if ((ret = dump_update_end_marker())) { + printk("dump update end marker error %d\n", ret); + } + if ((ret = dump_update_header())) { + printk("dump update header error %d\n", ret); + } + } + ret = dump_dev_resume(); + + return ret; +} + +/* Saves all dump data */ +int dump_execute_savedump(void) +{ + int ret = 0; + + if ((ret = dump_begin())) { + return ret; + } + + if (dump_config.level != DUMP_LEVEL_HEADER) { + ret = dump_sequencer(); + } + dump_complete(); + + return ret; +} + +/* Does all the real work: Capture and save state */ +int dump_generic_execute(const char *panic_str, const struct pt_regs *regs) +{ + int ret = 0; + + if ((ret = dump_configure_header(panic_str, regs))) { + printk("dump config header failed ! error %d\n", ret); + return ret; + } + + /* tell interested parties that a dump is about to start */ + notifier_call_chain(&dump_notifier_list, DUMP_BEGIN, &dump_device); + + if (dump_config.level != DUMP_LEVEL_NONE) + ret = dump_execute_savedump(); + + pr_debug("dumped %ld blocks of %d bytes each\n", + dump_config.dumper->count, DUMP_BUFFER_SIZE); + + /* tell interested parties that a dump has completed */ + notifier_call_chain(&dump_notifier_list, DUMP_END, &dump_device); + + return ret; +} diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/dump/dump_filters.c 900-mjb5/drivers/dump/dump_filters.c --- 001-bk10/drivers/dump/dump_filters.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/drivers/dump/dump_filters.c Sun Mar 16 13:39:02 2003 @@ -0,0 +1,145 @@ +/* + * Default filters to select data to dump for various passes. + * + * Started: Oct 2002 - Suparna Bhattacharya + * Split and rewrote default dump selection logic to generic dump + * method interfaces + * Derived from a portion of dump_base.c created by + * Matt Robinson ) + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * Used during single-stage dumping and during stage 1 of the 2-stage scheme + * (Stage 2 of the 2-stage scheme uses the fully transparent filters + * i.e. passthru filters in dump_overlay.c) + * + * Future: Custom selective dump may involve a different set of filters. + * + * This code is released under version 2 of the GNU GPL. 
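+ *
+ * Each filter supplies a selector with the signature
+ *        int (*selector)(int pass, unsigned long loc, unsigned long sz)
+ * where loc is a (struct page *) cast to unsigned long and sz is the byte
+ * length of the range. A sketch of how the iterator drives it:
+ *
+ *        if (filter->selector(pass, (unsigned long)page, PAGE_SIZE))
+ *                dump_save_data((unsigned long)page, PAGE_SIZE);
+ *        else
+ *                dump_skip_data((unsigned long)page, PAGE_SIZE);
+ *
+ * (the real loop lives in dump_page_iterator(), declared in
+ * dump_methods.h and not shown in this file)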
+ */ + +#include +#include +#include +#include +#include +#include "dump_methods.h" + + +/* Copied from mm/bootmem.c - FIXME */ +/* return the number of _pages_ that will be allocated for the boot bitmap */ +unsigned long dump_calc_bootmap_pages (void) +{ + unsigned long mapsize; + unsigned long pages = num_physpages; + + mapsize = (pages+7)/8; + mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; + mapsize >>= PAGE_SHIFT; + + return mapsize; +} + + +#define DUMP_PFN_SAFETY_MARGIN 1024 /* 4 MB */ +/* temporary */ +extern unsigned long min_low_pfn; + +/* to track all used (compound + zero order) pages */ +#define PageInuse(p) (PageCompound(p) || page_count(p)) + +int dump_low_page(struct page *p) +{ + return page_to_pfn(p) < min_low_pfn + dump_calc_bootmap_pages() + + 1 + DUMP_PFN_SAFETY_MARGIN; +} + +static inline int kernel_page(struct page *p) +{ + /* FIXME: Need to exclude hugetlb pages. Clue: reserved but inuse */ + return PageReserved(p) || (!PageLRU(p) && PageInuse(p)); +} + +static inline int user_page(struct page *p) +{ + return PageInuse(p) && (!PageReserved(p) && PageLRU(p)); +} + +static inline int unreferenced_page(struct page *p) +{ + return !PageInuse(p) && !PageReserved(p); +} + + +/* loc marks the beginning of a range of pages */ +int dump_filter_kernpages(int pass, unsigned long loc, unsigned long sz) +{ + struct page *page = (struct page *)loc; + /* if any of the pages is a kernel page, select this set */ + while (sz) { + if (dump_low_page(page) || kernel_page(page)) + return 1; + sz -= PAGE_SIZE; + page++; + } + return 0; +} + + +/* loc marks the beginning of a range of pages */ +int dump_filter_userpages(int pass, unsigned long loc, unsigned long sz) +{ + struct page *page = (struct page *)loc; + int ret = 0; + /* select if the set has any user page, and no kernel pages */ + while (sz) { + if (user_page(page) && !dump_low_page(page)) { + ret = 1; + } else if (kernel_page(page) || dump_low_page(page)) { + return 0; + } + page++; + sz -= PAGE_SIZE; + } + return ret; +} + + + +/* loc marks the beginning of a range of pages */ +int dump_filter_unusedpages(int pass, unsigned long loc, unsigned long sz) +{ + struct page *page = (struct page *)loc; + + /* select if the set does not have any used pages */ + while (sz) { + if (!unreferenced_page(page) || dump_low_page(page)) { + return 0; + } + page++; + sz -= PAGE_SIZE; + } + return 1; +} + +/* dummy: last (non-existent) pass */ +int dump_filter_none(int pass, unsigned long loc, unsigned long sz) +{ + return 0; +} + +/* TBD: resolve level bitmask ? */ +struct dump_data_filter dump_filter_table[] = { + { .name = "kern", .selector = dump_filter_kernpages, + .level_mask = DUMP_MASK_KERN}, + { .name = "user", .selector = dump_filter_userpages, + .level_mask = DUMP_MASK_USED}, + { .name = "unused", .selector = dump_filter_unusedpages, + .level_mask = DUMP_MASK_UNUSED}, + { .name = "none", .selector = dump_filter_none, + .level_mask = DUMP_MASK_REST}, + { .name = "", .selector = NULL, .level_mask = 0} +}; + diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/dump/dump_fmt.c 900-mjb5/drivers/dump/dump_fmt.c --- 001-bk10/drivers/dump/dump_fmt.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/drivers/dump/dump_fmt.c Sun Mar 16 13:39:02 2003 @@ -0,0 +1,395 @@ +/* + * Implements the routines which handle the format specific + * aspects of dump for the default dump format. 
+ * + * Used in single stage dumping and stage 1 of soft-boot based dumping + * Saves data in LKCD (lcrash) format + * + * Previously a part of dump_base.c + * + * Started: Oct 2002 - Suparna Bhattacharya + * Split off and reshuffled LKCD dump format code around generic + * dump method interfaces. + * + * Derived from original code created by + * Matt Robinson ) + * + * Contributions from SGI, IBM, HP, MCL, and others. + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2000 - 2002 TurboLinux, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "dump_methods.h" + +/* + * SYSTEM DUMP LAYOUT + * + * System dumps are currently the combination of a dump header and a set + * of data pages which contain the system memory. The layout of the dump + * (for full dumps) is as follows: + * + * +-----------------------------+ + * | generic dump header | + * +-----------------------------+ + * | architecture dump header | + * +-----------------------------+ + * | page header | + * +-----------------------------+ + * | page data | + * +-----------------------------+ + * | page header | + * +-----------------------------+ + * | page data | + * +-----------------------------+ + * | | | + * | | | + * | | | + * | | | + * | V | + * +-----------------------------+ + * | PAGE_END header | + * +-----------------------------+ + * + * There are two dump headers, the first which is architecture + * independent, and the other which is architecture dependent. This + * allows different architectures to dump different data structures + * which are specific to their chipset, CPU, etc. + * + * After the dump headers come a succession of dump page headers along + * with dump pages. The page header contains information about the page + * size, any flags associated with the page (whether it's compressed or + * not), and the address of the page. After the page header is the page + * data, which is either compressed (or not). Each page of data is + * dumped in succession, until the final dump header (PAGE_END) is + * placed at the end of the dump, assuming the dump device isn't out + * of space. + * + * This mechanism allows for multiple compression types, different + * types of data structures, different page ordering, etc., etc., etc. + * It's a very straightforward mechanism for dumping system memory. 
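+ *
+ * A reader (e.g. lcrash) can therefore walk the dump roughly as follows,
+ * using the __dump_page headers emitted by dump_lcrash_add_data() below:
+ *
+ *        read struct __dump_page dp;
+ *        while (!(dp.dp_flags & DUMP_DH_END)) {
+ *                read dp.dp_size bytes of page data
+ *                    (DUMP_DH_COMPRESSED: inflate, DUMP_DH_RAW: copy as is);
+ *                read the next struct __dump_page;
+ *        }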
+ */ + +struct __dump_header dump_header; /* the primary dump header */ +struct __dump_header_asm dump_header_asm; /* the arch-specific dump header */ + +/* + * Set up common header fields (mainly the arch indep section) + * Per-cpu state is handled by lcrash_save_context + */ +static int lcrash_init_dump_header(const char *panic_str) +{ + struct timeval dh_time; + /* make sure the dump header isn't TOO big */ + if ((sizeof(struct __dump_header) + + sizeof(struct __dump_header_asm)) > DUMP_BUFFER_SIZE) { + printk("lcrash_init_header(): combined " + "headers larger than DUMP_BUFFER_SIZE!\n"); + return -E2BIG; + } + + /* initialize the dump headers to zero */ + memset(&dump_header, 0, sizeof(dump_header)); + memset(&dump_header_asm, 0, sizeof(dump_header_asm)); + + /* configure dump header values */ + dump_header.dh_magic_number = DUMP_MAGIC_NUMBER; + dump_header.dh_version = DUMP_VERSION_NUMBER; + dump_header.dh_memory_start = PAGE_OFFSET; + dump_header.dh_memory_end = DUMP_MAGIC_NUMBER; + dump_header.dh_header_size = sizeof(struct __dump_header); + dump_header.dh_page_size = PAGE_SIZE; + dump_header.dh_dump_level = dump_config.level; + dump_header.dh_current_task = (unsigned long) current; + dump_header.dh_dump_compress = dump_config.dumper->compress-> + compress_type; + dump_header.dh_dump_flags = dump_config.flags; + dump_header.dh_dump_device = dump_config.dumper->dev->device_id; + +#if DUMP_DEBUG >= 6 + dump_header.dh_num_bytes = 0; +#endif + dump_header.dh_num_dump_pages = 0; + do_gettimeofday(&dh_time); + dump_header.dh_time.tv_sec = dh_time.tv_sec; + dump_header.dh_time.tv_usec = dh_time.tv_usec; + + memcpy((void *)&(dump_header.dh_utsname_sysname), + (const void *)&(system_utsname.sysname), __NEW_UTS_LEN + 1); + memcpy((void *)&(dump_header.dh_utsname_nodename), + (const void *)&(system_utsname.nodename), __NEW_UTS_LEN + 1); + memcpy((void *)&(dump_header.dh_utsname_release), + (const void *)&(system_utsname.release), __NEW_UTS_LEN + 1); + memcpy((void *)&(dump_header.dh_utsname_version), + (const void *)&(system_utsname.version), __NEW_UTS_LEN + 1); + memcpy((void *)&(dump_header.dh_utsname_machine), + (const void *)&(system_utsname.machine), __NEW_UTS_LEN + 1); + memcpy((void *)&(dump_header.dh_utsname_domainname), + (const void *)&(system_utsname.domainname), __NEW_UTS_LEN + 1); + + if (panic_str) { + memcpy((void *)&(dump_header.dh_panic_string), + (const void *)panic_str, DUMP_PANIC_LEN); + } + + dump_header_asm.dha_magic_number = DUMP_ASM_MAGIC_NUMBER; + dump_header_asm.dha_version = DUMP_ASM_VERSION_NUMBER; + dump_header_asm.dha_header_size = sizeof(dump_header_asm); + + dump_header_asm.dha_smp_num_cpus = num_online_cpus(); + pr_debug("smp_num_cpus in header %d\n", + dump_header_asm.dha_smp_num_cpus); + + dump_header_asm.dha_dumping_cpu = smp_processor_id(); + + return 0; +} + + +int dump_lcrash_configure_header(const char *panic_str, + const struct pt_regs *regs) +{ + int retval = 0; + + if ((retval = lcrash_init_dump_header(panic_str))) + return retval; + + /* capture register states for all processors */ + dump_save_this_cpu(regs); + __dump_save_other_cpus(); /* side effect:silence cpus */ + + /* configure architecture-specific dump header values */ + if ((retval = __dump_configure_header(regs))) + return retval; + + dump_config.dumper->header_dirty++; + return 0; +} + +/* save register and task context */ +void dump_lcrash_save_context(int cpu, const struct pt_regs *regs, + struct task_struct *tsk) +{ + dump_header_asm.dha_smp_current_task[cpu] = (uint32_t) tsk; + + 
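+        /*
+         * Note: the uint32_t casts here and for dha_stack_ptr below assume
+         * 32-bit kernel pointers, i.e. the i386-only scope of this dump code.
+         */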
+        __dump_save_regs(&dump_header_asm.dha_smp_regs[cpu], regs);
+
+        /* take a snapshot of the stack */
+        /* doing this enables us to tolerate slight drifts on this cpu */
+        if (dump_header_asm.dha_stack[cpu]) {
+                memcpy((void *)dump_header_asm.dha_stack[cpu],
+                        tsk->thread_info, THREAD_SIZE);
+        }
+        dump_header_asm.dha_stack_ptr[cpu] = (uint32_t)(tsk->thread_info);
+}
+
+/* write out the header */
+int dump_write_header(void)
+{
+        int retval = 0, size;
+        void *buf = dump_config.dumper->dump_buf;
+
+        /* accounts for DUMP_HEADER_OFFSET if applicable */
+        if ((retval = dump_dev_seek(0))) {
+                printk("Unable to seek to dump header offset: %d\n",
+                        retval);
+                return retval;
+        }
+
+        memcpy(buf, (void *)&dump_header, sizeof(dump_header));
+        size = sizeof(dump_header);
+        memcpy(buf + size, (void *)&dump_header_asm, sizeof(dump_header_asm));
+        size += sizeof(dump_header_asm);
+        /* assuming header is dump buffer size always ? */
+        retval = dump_ll_write(buf, DUMP_BUFFER_SIZE);
+
+        if (retval < DUMP_BUFFER_SIZE)
+                return (retval >= 0) ? -ENOSPC : retval;
+
+        return 0;
+}
+
+int dump_generic_update_header(void)
+{
+        int err = 0;
+
+        if (dump_config.dumper->header_dirty) {
+                if ((err = dump_write_header())) {
+                        printk("dump write header failed! err %d\n", err);
+                } else {
+                        dump_config.dumper->header_dirty = 0;
+                }
+        }
+
+        return err;
+}
+
+static inline int is_curr_stack_page(struct page *page, unsigned long size)
+{
+        unsigned long thread_addr = (unsigned long)current_thread_info();
+        unsigned long addr = (unsigned long)page_address(page);
+
+        return !PageHighMem(page) && (addr < thread_addr + THREAD_SIZE)
+                && (addr + size > thread_addr);
+}
+
+static inline int is_dump_page(struct page *page, unsigned long size)
+{
+        unsigned long addr = (unsigned long)page_address(page);
+        unsigned long dump_buf = (unsigned long)dump_config.dumper->dump_buf;
+
+        return !PageHighMem(page) && (addr < dump_buf + DUMP_BUFFER_SIZE)
+                && (addr + size > dump_buf);
+}
+
+int dump_allow_compress(struct page *page, unsigned long size)
+{
+        /*
+         * Don't compress the page if any part of it overlaps
+         * with the current stack or dump buffer (since the contents
+         * in these could be changing while compression is going on)
+         */
+        return !is_curr_stack_page(page, size) && !is_dump_page(page, size);
+}
+
+void lcrash_init_pageheader(struct __dump_page *dp, struct page *page,
+        unsigned long sz)
+{
+        memset(dp, 0, sizeof(struct __dump_page));
+        dp->dp_flags = 0;
+        dp->dp_size = 0;
+        if (sz > 0)
+                dp->dp_address = page_to_pfn(page) << PAGE_SHIFT;
+
+#if DUMP_DEBUG > 6
+        dp->dp_page_index = dump_header.dh_num_dump_pages;
+        dp->dp_byte_offset = dump_header.dh_num_bytes + DUMP_BUFFER_SIZE
+                + DUMP_HEADER_OFFSET; /* ?? */
+#endif /* DUMP_DEBUG */
+}
+
+int dump_lcrash_add_data(unsigned long loc, unsigned long len)
+{
+        struct page *page = (struct page *)loc;
+        void *addr, *buf = dump_config.dumper->curr_buf;
+        struct __dump_page *dp = (struct __dump_page *)buf;
+        int bytes, size;
+
+        if (buf > dump_config.dumper->dump_buf + DUMP_BUFFER_SIZE)
+                return -ENOMEM;
+
+        lcrash_init_pageheader(dp, page, len);
+        buf += sizeof(struct __dump_page);
+
+        while (len) {
+                addr = kmap_atomic(page, KM_DUMP);
+                size = bytes = (len > PAGE_SIZE) ?
PAGE_SIZE : len; + /* check for compression */ + if (dump_allow_compress(page, bytes)) { + size = dump_compress_data((char *)addr, bytes, (char *)buf); + } + /* set the compressed flag if the page did compress */ + if (size && (size < bytes)) { + dp->dp_flags |= DUMP_DH_COMPRESSED; + } else { + /* compression failed -- default to raw mode */ + dp->dp_flags |= DUMP_DH_RAW; + memcpy(buf, addr, bytes); + size = bytes; + } + /* memset(buf, 'A', size); temporary: testing only !! */ + kunmap_atomic(addr, KM_DUMP); + dp->dp_size += size; + buf += size; + len -= bytes; + page++; + } + + /* now update the header */ +#if DUMP_DEBUG > 6 + dump_header.dh_num_bytes += dp->dp_size + sizeof(*dp); +#endif + dump_header.dh_num_dump_pages++; + dump_config.dumper->header_dirty++; + + dump_config.dumper->curr_buf = buf; + + return len; +} + +int dump_lcrash_update_end_marker(void) +{ + struct __dump_page *dp = + (struct __dump_page *)dump_config.dumper->curr_buf; + unsigned long left; + int ret = 0; + + lcrash_init_pageheader(dp, NULL, 0); + dp->dp_flags |= DUMP_DH_END; /* tbd: truncation test ? */ + + /* now update the header */ +#if DUMP_DEBUG > 6 + dump_header.dh_num_bytes += sizeof(*dp); +#endif + dump_config.dumper->curr_buf += sizeof(*dp); + left = dump_config.dumper->curr_buf - dump_config.dumper->dump_buf; + + printk("\n"); + + while (left) { + if ((ret = dump_dev_seek(dump_config.dumper->curr_offset))) { + printk("Seek failed at offset 0x%llx\n", + dump_config.dumper->curr_offset); + return ret; + } + + if (DUMP_BUFFER_SIZE > left) + memset(dump_config.dumper->curr_buf, 'm', + DUMP_BUFFER_SIZE - left); + + if ((ret = dump_ll_write(dump_config.dumper->dump_buf, + DUMP_BUFFER_SIZE)) < DUMP_BUFFER_SIZE) { + return (ret < 0) ? ret : -ENOSPC; + } + + dump_config.dumper->curr_offset += DUMP_BUFFER_SIZE; + + if (left > DUMP_BUFFER_SIZE) { + left -= DUMP_BUFFER_SIZE; + memcpy(dump_config.dumper->dump_buf, + dump_config.dumper->dump_buf + DUMP_BUFFER_SIZE, left); + dump_config.dumper->curr_buf -= DUMP_BUFFER_SIZE; + } else { + left = 0; + } + } + return 0; +} + + +/* Default Formatter (lcrash) */ +struct dump_fmt_ops dump_fmt_lcrash_ops = { + .configure_header = dump_lcrash_configure_header, + .update_header = dump_generic_update_header, + .save_context = dump_lcrash_save_context, + .add_data = dump_lcrash_add_data, + .update_end_marker = dump_lcrash_update_end_marker +}; + +struct dump_fmt dump_fmt_lcrash = { + .name = "lcrash", + .ops = &dump_fmt_lcrash_ops +}; + diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/dump/dump_gzip.c 900-mjb5/drivers/dump/dump_gzip.c --- 001-bk10/drivers/dump/dump_gzip.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/drivers/dump/dump_gzip.c Sun Mar 16 13:39:02 2003 @@ -0,0 +1,118 @@ +/* + * GZIP Compression functions for kernel crash dumps. + * + * Created by: Matt Robinson (yakker@sourceforge.net) + * Copyright 2001 Matt D. Robinson. All rights reserved. + * + * This code is released under version 2 of the GNU GPL. + */ + +/* header files */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void *deflate_workspace; + +/* + * Name: dump_compress_gzip() + * Func: Compress a DUMP_PAGE_SIZE page using gzip-style algorithms (the. + * deflate functions similar to what's used in PPP). 
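+ *
+ * Return convention (as consumed by dump_lcrash_add_data() through
+ * dump_compress_data()): return the compressed length on success, or 0
+ * (or a value >= oldsize) to make the caller fall back to storing the
+ * raw page:
+ *
+ *        size = compress_func(src, PAGE_SIZE, dst, DUMP_DPC_PAGE_SIZE);
+ *        if (size && size < PAGE_SIZE)
+ *                -- page header gets DUMP_DH_COMPRESSED
+ *        else
+ *                -- page header gets DUMP_DH_RAW, raw bytes are copied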
+ */ +static u16 +dump_compress_gzip(const u8 *old, u16 oldsize, u8 *new, u16 newsize) +{ + /* error code and dump stream */ + int err; + z_stream dump_stream; + + dump_stream.workspace = deflate_workspace; + + if ((err = zlib_deflateInit(&dump_stream, Z_BEST_COMPRESSION)) != Z_OK) { + /* fall back to RLE compression */ + printk("dump_compress_gzip(): zlib_deflateInit() " + "failed (%d)!\n", err); + return 0; + } + + /* use old (page of memory) and size (DUMP_PAGE_SIZE) as in-streams */ + dump_stream.next_in = (u8 *) old; + dump_stream.avail_in = oldsize; + + /* out streams are new (dpcpage) and new size (DUMP_DPC_PAGE_SIZE) */ + dump_stream.next_out = new; + dump_stream.avail_out = newsize; + + /* deflate the page -- check for error */ + err = zlib_deflate(&dump_stream, Z_FINISH); + if (err != Z_STREAM_END) { + /* zero is return code here */ + (void)zlib_deflateEnd(&dump_stream); + printk("dump_compress_gzip(): zlib_deflate() failed (%d)!\n", + err); + return 0; + } + + /* let's end the deflated compression stream */ + if ((err = zlib_deflateEnd(&dump_stream)) != Z_OK) { + printk("dump_compress_gzip(): zlib_deflateEnd() " + "failed (%d)!\n", err); + } + + /* return the compressed byte total (if it's smaller) */ + if (dump_stream.total_out >= oldsize) { + return oldsize; + } + return dump_stream.total_out; +} + +/* setup the gzip compression functionality */ +static struct __dump_compress dump_gzip_compression = { + .compress_type = DUMP_COMPRESS_GZIP, + .compress_func = dump_compress_gzip, + .compress_name = "GZIP", +}; + +/* + * Name: dump_compress_gzip_init() + * Func: Initialize gzip as a compression mechanism. + */ +static int __init +dump_compress_gzip_init(void) +{ + deflate_workspace = vmalloc(zlib_deflate_workspacesize()); + if (!deflate_workspace) { + printk("dump_compress_gzip_init(): Failed to " + "alloc %d bytes for deflate workspace\n", + zlib_deflate_workspacesize()); + return -ENOMEM; + } + dump_register_compression(&dump_gzip_compression); + return 0; +} + +/* + * Name: dump_compress_gzip_cleanup() + * Func: Remove gzip as a compression mechanism. + */ +static void __exit +dump_compress_gzip_cleanup(void) +{ + vfree(deflate_workspace); + dump_unregister_compression(DUMP_COMPRESS_GZIP); +} + +/* module initialization */ +module_init(dump_compress_gzip_init); +module_exit(dump_compress_gzip_cleanup); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("LKCD Development Team "); +MODULE_DESCRIPTION("Gzip compression module for crash dump driver"); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/dump/dump_i386.c 900-mjb5/drivers/dump/dump_i386.c --- 001-bk10/drivers/dump/dump_i386.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/drivers/dump/dump_i386.c Sun Mar 16 13:39:02 2003 @@ -0,0 +1,358 @@ +/* + * Architecture specific (i386) functions for Linux crash dumps. + * + * Created by: Matt Robinson (yakker@sgi.com) + * + * Copyright 1999 Silicon Graphics, Inc. All rights reserved. + * + * 2.3 kernel modifications by: Matt D. Robinson (yakker@turbolinux.com) + * Copyright 2000 TurboLinux, Inc. All rights reserved. + * + * This code is released under version 2 of the GNU GPL. + */ + +/* + * The hooks for dumping the kernel virtual memory to disk are in this + * file. Any time a modification is made to the virtual memory mechanism, + * these routines must be changed to use the new mechanisms. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "dump_methods.h" +#include + +#include +#include +#include +#include +#include + +static __s32 saved_irq_count; /* saved preempt_count() flags */ + +static int +alloc_dha_stack(void) +{ + int i; + void *ptr; + + if (dump_header_asm.dha_stack[0]) + return 0; + + ptr = vmalloc(THREAD_SIZE * num_online_cpus()); + if (!ptr) { + printk("vmalloc for dha_stacks failed\n"); + return -ENOMEM; + } + + for (i = 0; i < num_online_cpus(); i++) { + dump_header_asm.dha_stack[i] = (u32)((unsigned long)ptr + + (i * THREAD_SIZE)); + } + return 0; +} + +static int +free_dha_stack(void) +{ + if (dump_header_asm.dha_stack[0]) { + vfree((void *)dump_header_asm.dha_stack[0]); + dump_header_asm.dha_stack[0] = 0; + } + return 0; +} + + +void +__dump_save_regs(struct pt_regs *dest_regs, const struct pt_regs *regs) +{ + *dest_regs = *regs; + + /* In case of panic dumps, we collects regs on entry to panic. + * so, we shouldn't 'fix' ssesp here again. But it is hard to + * tell just looking at regs whether ssesp need fixing. We make + * this decision by looking at xss in regs. If we have better + * means to determine that ssesp are valid (by some flag which + * tells that we are here due to panic dump), then we can use + * that instead of this kludge. + */ + if (!user_mode(regs)) { + if ((0xffff & regs->xss) == __KERNEL_DS) + /* already fixed up */ + return; + dest_regs->esp = (unsigned long)&(regs->esp); + __asm__ __volatile__ ("movw %%ss, %%ax;" + :"=a"(dest_regs->xss)); + } +} + + +#ifdef CONFIG_SMP +extern unsigned long irq_affinity[]; +extern irq_desc_t irq_desc[]; +extern void dump_send_ipi(void); + +static int dump_expect_ipi[NR_CPUS]; +static atomic_t waiting_for_dump_ipi; +static unsigned long saved_affinity[NR_IRQS]; + +extern void stop_this_cpu(void *); /* exported by i386 kernel */ + +static int +dump_nmi_callback(struct pt_regs *regs, int cpu) +{ + if (!dump_expect_ipi[cpu]) + return 0; + + dump_expect_ipi[cpu] = 0; + + dump_save_this_cpu(regs); + atomic_dec(&waiting_for_dump_ipi); + + level_changed: + switch (dump_silence_level) { + case DUMP_HARD_SPIN_CPUS: /* Spin until dump is complete */ + while (dump_oncpu) { + barrier(); /* paranoia */ + if (dump_silence_level != DUMP_HARD_SPIN_CPUS) + goto level_changed; + + cpu_relax(); /* kill time nicely */ + } + break; + + case DUMP_HALT_CPUS: /* Execute halt */ + stop_this_cpu(NULL); + break; + + case DUMP_SOFT_SPIN_CPUS: + /* Mark the task so it spins in schedule */ + set_tsk_thread_flag(current, TIF_NEED_RESCHED); + break; + } + + return 1; +} + +/* save registers on other processors */ +void +__dump_save_other_cpus(void) +{ + int i, cpu = smp_processor_id(); + int other_cpus = num_online_cpus()-1; + + if (other_cpus > 0) { + atomic_set(&waiting_for_dump_ipi, other_cpus); + + for (i = 0; i < NR_CPUS; i++) { + dump_expect_ipi[i] = (i != cpu && cpu_online(i)); + } + + /* short circuit normal NMI handling temporarily */ + set_nmi_callback(dump_nmi_callback); + wmb(); + + dump_send_ipi(); + /* may be we dont need to wait for NMI to be processed. + just write out the header at the end of dumping, if + this IPI is not processed until then, there probably + is a problem and we just fail to capture state of + other cpus. */ + while(atomic_read(&waiting_for_dump_ipi) > 0) { + cpu_relax(); + } + + unset_nmi_callback(); + } +} + +/* + * Routine to save the old irq affinities and change affinities of all irqs to + * the dumping cpu. 
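+ *
+ * Paired with reset_irq_affinity() below. Note that this relies on
+ * irq_affinity[] being an array of plain unsigned long bitmasks, so
+ * "1UL << cpu" can only address the first BITS_PER_LONG cpus.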
+ */ +static void +set_irq_affinity(void) +{ + int i; + int cpu = smp_processor_id(); + + memcpy(saved_affinity, irq_affinity, NR_IRQS * sizeof(unsigned long)); + for (i = 0; i < NR_IRQS; i++) { + if (irq_desc[i].handler == NULL) + continue; + irq_affinity[i] = 1UL << cpu; + if (irq_desc[i].handler->set_affinity != NULL) + irq_desc[i].handler->set_affinity(i, irq_affinity[i]); + } +} + +/* + * Restore old irq affinities. + */ +static void +reset_irq_affinity(void) +{ + int i; + + memcpy(irq_affinity, saved_affinity, NR_IRQS * sizeof(unsigned long)); + for (i = 0; i < NR_IRQS; i++) { + if (irq_desc[i].handler == NULL) + continue; + if (irq_desc[i].handler->set_affinity != NULL) + irq_desc[i].handler->set_affinity(i, saved_affinity[i]); + } +} + +#else /* !CONFIG_SMP */ +#define set_irq_affinity() do { } while (0) +#define reset_irq_affinity() do { } while (0) +#define save_other_cpu_states() do { } while (0) +#endif /* !CONFIG_SMP */ + +/* + * Kludge - dump from interrupt context is unreliable (Fixme) + * + * We do this so that softirqs initiated for dump i/o + * get processed and we don't hang while waiting for i/o + * to complete or in any irq synchronization attempt. + * + * This is not quite legal of course, as it has the side + * effect of making all interrupts & softirqs triggered + * while dump is in progress complete before currently + * pending softirqs and the currently executing interrupt + * code. + */ +static inline void +irq_bh_save(void) +{ + saved_irq_count = irq_count(); + preempt_count() &= ~(HARDIRQ_MASK|SOFTIRQ_MASK); +} + +static inline void +irq_bh_restore(void) +{ + preempt_count() |= saved_irq_count; +} + +/* + * Name: __dump_irq_enable + * Func: Reset system so interrupts are enabled. + * This is used for dump methods that require interrupts + * Eventually, all methods will have interrupts disabled + * and this code can be removed. + * + * Change irq affinities + * Re-enable interrupts + */ +void +__dump_irq_enable(void) +{ + set_irq_affinity(); + irq_bh_save(); + local_irq_enable(); +} + +/* + * Name: __dump_irq_restore + * Func: Resume the system state in an architecture-specific way. + + */ +void +__dump_irq_restore(void) +{ + local_irq_disable(); + reset_irq_affinity(); + irq_bh_restore(); +} + +/* + * Name: __dump_configure_header() + * Func: Meant to fill in arch specific header fields except per-cpu state + * already captured via __dump_save_context for all CPUs. + */ +int +__dump_configure_header(const struct pt_regs *regs) +{ + return (0); +} + +/* + * Name: dump_nmi_handler + * Func: Called from notify_die + */ +static int dump_die_event(struct notifier_block *this, + unsigned long event, + void *arg) +{ + const struct die_args *args = (const struct die_args *) arg; + + switch (event) { + case DIE_PANIC: + case DIE_OOPS: + case DIE_WATCHDOG: + dump_execute(args->str, args->regs); + break; + } + return NOTIFY_DONE; + +} + +static struct notifier_block dump_die_block = { + .notifier_call = dump_die_event, +}; + +/* + * Name: __dump_init() + * Func: Initialize the dumping routine process. + */ +void +__dump_init(uint64_t local_memory_start) +{ + /* hook into NMI, Panic, and OOPS */ + register_die_notifier(&dump_die_block); +} + +/* + * Name: __dump_open() + * Func: Open the dump device (architecture specific). + */ +void +__dump_open(void) +{ + alloc_dha_stack(); +} + +/* + * Name: __dump_cleanup() + * Func: Free any architecture specific data structures. This is called + * when the dump module is being removed. 
+ */ +void +__dump_cleanup(void) +{ + free_dha_stack(); + + unregister_die_notifier(&dump_die_block); +} + +extern int pfn_is_ram(unsigned long); + +/* + * Name: __dump_page_valid() + * Func: Check if page is valid to dump. + */ +int +__dump_page_valid(unsigned long index) +{ + if (!pfn_valid(index)) + return 0; + + return pfn_is_ram(index); +} + diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/dump/dump_methods.h 900-mjb5/drivers/dump/dump_methods.h --- 001-bk10/drivers/dump/dump_methods.h Wed Dec 31 16:00:00 1969 +++ 900-mjb5/drivers/dump/dump_methods.h Sun Mar 16 13:39:02 2003 @@ -0,0 +1,314 @@ +/* + * Generic interfaces for flexible system dump + * + * Started: Oct 2002 - Suparna Bhattacharya (suparna@in.ibm.com) + * + * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. + */ + +#ifndef _LINUX_DUMP_METHODS_H +#define _LINUX_DUMP_METHODS_H + +/* + * Inspired by Matt Robinson's suggestion of introducing dump + * methods as a way to enable different crash dump facilities to + * coexist where each employs its own scheme or dumping policy. + * + * The code here creates a framework for flexible dump by defining + * a set of methods and providing associated helpers that differentiate + * between the underlying mechanism (how to dump), overall scheme + * (sequencing of stages and data dumped and associated quiescing), + * output format (what the dump output looks like), target type + * (where to save the dump; see dumpdev.h), and selection policy + * (state/data to dump). + * + * These sets of interfaces can be mixed and matched to build a + * dumper suitable for a given situation, allowing for + * flexibility as well appropriate degree of code reuse. + * For example all features and options of lkcd (including + * granular selective dumping in the near future) should be + * available even when say, the 2 stage soft-boot based mechanism + * is used for taking disruptive dumps. + * + * Todo: Additionally modules or drivers may supply their own + * custom dumpers which extend dump with module specific + * information or hardware state, and can even tweak the + * mechanism when it comes to saving state relevant to + * them. 
+ */ + +#include +#include +#include +#include + +#define MAX_PASSES 6 +#define MAX_DEVS 4 + + +/* To customise selection of pages to be dumped in a given pass/group */ +struct dump_data_filter{ + char name[32]; + int (*selector)(int, unsigned long, unsigned long); + ulong level_mask; /* dump level(s) for which this filter applies */ + loff_t start, end; /* location range applicable */ +}; + + +/* + * Determined by the kind of dump mechanism and appropriate + * overall scheme + */ +struct dump_scheme_ops { + /* sets aside memory, inits data structures etc */ + int (*configure)(unsigned long devid); + /* releases resources */ + int (*unconfigure)(void); + + /* ordering of passes, invoking iterator */ + int (*sequencer)(void); + /* iterates over system data, selects and acts on data to dump */ + int (*iterator)(int, int (*)(unsigned long, unsigned long), + struct dump_data_filter *); + /* action when data is selected for dump */ + int (*save_data)(unsigned long, unsigned long); + /* action when data is to be excluded from dump */ + int (*skip_data)(unsigned long, unsigned long); + /* policies for space, multiple dump devices etc */ + int (*write_buffer)(void *, unsigned long); +}; + +struct dump_scheme { + /* the name serves as an anchor to locate the scheme after reboot */ + char name[32]; + struct dump_scheme_ops *ops; + struct list_head list; +}; + +/* Quiescing/Silence levels (controls IPI callback behaviour) */ +extern enum dump_silence_levels { + DUMP_SOFT_SPIN_CPUS = 1, + DUMP_HARD_SPIN_CPUS = 2, + DUMP_HALT_CPUS = 3, +} dump_silence_level; + +/* determined by the dump (file) format */ +struct dump_fmt_ops { + /* build header */ + int (*configure_header)(const char *, const struct pt_regs *); + int (*update_header)(void); /* update header and write it out */ + /* save curr context */ + void (*save_context)(int, const struct pt_regs *, + struct task_struct *); + /* typically called by the save_data action */ + /* add formatted data to the dump buffer */ + int (*add_data)(unsigned long, unsigned long); + int (*update_end_marker)(void); +}; + +struct dump_fmt { + unsigned long magic; + char name[32]; /* lcrash, crash, elf-core etc */ + struct dump_fmt_ops *ops; + struct list_head list; +}; + +/* + * Modules will be able add their own data capture schemes by + * registering their own dumpers. Typically they would use the + * primary dumper as a template and tune it with their routines. + * Still Todo. 
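+ *
+ * For reference, the built-in single-stage dumper is wired together from
+ * the default pieces declared at the bottom of this header, roughly:
+ *
+ *        struct dumper dumper_singlestage = {
+ *                .name     = "singlestage",
+ *                .scheme   = &dump_scheme_singlestage,
+ *                .fmt      = &dump_fmt_lcrash,
+ *                .compress = &dump_none_compression,
+ *                .filter   = dump_filter_table,
+ *        };
+ *
+ * (a sketch only; the actual initializer lives elsewhere in the patch,
+ * not in this header)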
+ */ + +/* The combined dumper profile (mechanism, scheme, dev, fmt) */ +struct dumper { + char name[32]; /* singlestage, overlay (stg1), passthru(stg2), pull */ + struct dump_scheme *scheme; + struct dump_fmt *fmt; + struct __dump_compress *compress; + struct dump_data_filter *filter; + struct dump_dev *dev; + /* state valid only for active dumper(s) - per instance */ + /* run time state/context */ + int curr_pass; + unsigned long count; + loff_t curr_offset; /* current logical offset into dump device */ + loff_t curr_loc; /* current memory location */ + void *curr_buf; /* current position in the dump buffer */ + void *dump_buf; /* starting addr of dump buffer */ + int header_dirty; /* whether the header needs to be written out */ + struct list_head dumper_list; /* links to other dumpers */ +}; + +/* Starting point to get to the current configured state */ +struct dump_config { + ulong level; + ulong flags; + struct dumper *dumper; + struct list_head dump_dev_list; +}; + +extern struct dump_config dump_config; + + +/* Wrappers that invoke the methods for the current (active) dumper */ + +/* Scheme operations */ + +static inline int dump_sequencer(void) +{ + return dump_config.dumper->scheme->ops->sequencer(); +} + +static inline int dump_iterator(int pass, int (*action)(unsigned long, + unsigned long), struct dump_data_filter *filter) +{ + return dump_config.dumper->scheme->ops->iterator(pass, action, filter); +} + +#define dump_save_data dump_config.dumper->scheme->ops->save_data +#define dump_skip_data dump_config.dumper->scheme->ops->skip_data + +static inline int dump_write_buffer(void *buf, unsigned long len) +{ + return dump_config.dumper->scheme->ops->write_buffer(buf, len); +} + +static inline int dump_configure(unsigned long devid) +{ + return dump_config.dumper->scheme->ops->configure(devid); +} + +static inline int dump_unconfigure(void) +{ + return dump_config.dumper->scheme->ops->unconfigure(); +} + +/* Format operations */ + +static inline int dump_configure_header(const char *panic_str, + const struct pt_regs *regs) +{ + return dump_config.dumper->fmt->ops->configure_header(panic_str, regs); +} + +static inline void dump_save_context(int cpu, const struct pt_regs *regs, + struct task_struct *tsk) +{ + dump_config.dumper->fmt->ops->save_context(cpu, regs, tsk); +} + +static inline int dump_save_this_cpu(const struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + + dump_save_context(cpu, regs, current); + return 1; +} + +static inline int dump_update_header(void) +{ + return dump_config.dumper->fmt->ops->update_header(); +} + +static inline int dump_update_end_marker(void) +{ + return dump_config.dumper->fmt->ops->update_end_marker(); +} + +static inline int dump_add_data(unsigned long loc, unsigned long sz) +{ + return dump_config.dumper->fmt->ops->add_data(loc, sz); +} + +/* Compression operation */ +static inline int dump_compress_data(char *src, int slen, char *dst) +{ + return dump_config.dumper->compress->compress_func(src, slen, + dst, DUMP_DPC_PAGE_SIZE); +} + + +/* Prototypes of some default implementations of dump methods */ + +extern struct __dump_compress dump_none_compression; + +/* Default scheme methods (dump_scheme.c) */ + +extern int dump_generic_sequencer(void); +extern int dump_page_iterator(int pass, int (*action)(unsigned long, unsigned + long), struct dump_data_filter *filter); +extern int dump_generic_save_data(unsigned long loc, unsigned long sz); +extern int dump_generic_skip_data(unsigned long loc, unsigned long sz); +extern int 
dump_generic_write_buffer(void *buf, unsigned long len); +extern int dump_generic_configure(unsigned long); +extern int dump_generic_unconfigure(void); + +/* Default scheme template */ +extern struct dump_scheme dump_scheme_singlestage; + +/* Default dump format methods */ + +extern int dump_lcrash_configure_header(const char *panic_str, + const struct pt_regs *regs); +extern void dump_lcrash_save_context(int cpu, const struct pt_regs *regs, + struct task_struct *tsk); +extern int dump_generic_update_header(void); +extern int dump_lcrash_add_data(unsigned long loc, unsigned long sz); +extern int dump_lcrash_update_end_marker(void); + +/* Default format (lcrash) template */ +extern struct dump_fmt dump_fmt_lcrash; + +/* Default dump selection filter table */ + +/* + * Entries listed in order of importance and correspond to passes + * The last entry (with a level_mask of zero) typically reflects data that + * won't be dumped -- this may for example be used to identify data + * that will be skipped for certain so the corresponding memory areas can be + * utilized as scratch space. + */ +extern struct dump_data_filter dump_filter_table[]; + +/* Some pre-defined dumpers */ +extern struct dumper dumper_singlestage; + +/* These are temporary */ +#define DUMP_MASK_HEADER DUMP_LEVEL_HEADER +#define DUMP_MASK_KERN DUMP_LEVEL_KERN +#define DUMP_MASK_USED DUMP_LEVEL_USED +#define DUMP_MASK_UNUSED DUMP_LEVEL_ALL_RAM +#define DUMP_MASK_REST 0 /* dummy for now */ + +/* Helpers - move these to dump.h later ? */ + +int dump_generic_execute(const char *panic_str, const struct pt_regs *regs); +extern int dump_ll_write(void *buf, unsigned long len); + +static inline void dumper_reset(void) +{ + dump_config.dumper->curr_buf = dump_config.dumper->dump_buf; + dump_config.dumper->curr_loc = 0; + dump_config.dumper->curr_offset = 0; + dump_config.dumper->count = 0; + dump_config.dumper->curr_pass = 0; +} + +/* + * May later be moulded to perform boot-time allocations so we can dump + * earlier during bootup + */ +static inline void *dump_alloc_mem(unsigned long size) +{ + return kmalloc(size, GFP_KERNEL); +} + +static inline void dump_free_mem(void *buf) +{ + kfree(buf); +} + +#endif /* _LINUX_DUMP_METHODS_H */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/dump/dump_netdev.c 900-mjb5/drivers/dump/dump_netdev.c --- 001-bk10/drivers/dump/dump_netdev.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/drivers/dump/dump_netdev.c Sun Mar 16 13:39:02 2003 @@ -0,0 +1,858 @@ +/* + * Implements the dump driver interface for saving a dump via network + * interface. + * + * Some of this code has been taken/adapted from Ingo Molnar's netconsole + * code. LKCD team expresses its thanks to Ingo. + * + * Started: June 2002 - Mohamed Abbas + * Adapted netconsole code to implement LKCD dump over the network. + * + * Nov 2002 - Bharata B. Rao + * Innumerable code cleanups, simplification and some fixes. + * Netdump configuration done by ioctl instead of using module parameters. + * + * Copyright (C) 2001 Ingo Molnar + * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static int startup_handshake; +static int page_counter; +static struct net_device *dump_ndev; +static struct in_device *dump_in_dev; +static u16 source_port, target_port; +static u32 source_ip, target_ip; +static unsigned char daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ; +static spinlock_t dump_skb_lock = SPIN_LOCK_UNLOCKED; +static int dump_nr_skbs; +static struct sk_buff *dump_skb; +static unsigned long flags_global; +static int netdump_in_progress; +static char device_name[IFNAMSIZ]; + +/* + * security depends on the trusted path between the netconsole + * server and netconsole client, since none of the packets are + * encrypted. The random magic number protects the protocol + * against spoofing. + */ +static u64 dump_magic; + +#define MAX_UDP_CHUNK 1460 +#define MAX_PRINT_CHUNK (MAX_UDP_CHUNK-HEADER_LEN) + +/* + * We maintain a small pool of fully-sized skbs, + * to make sure the message gets out even in + * extreme OOM situations. + */ +#define DUMP_MAX_SKBS 32 + +#define MAX_SKB_SIZE \ + (MAX_UDP_CHUNK + sizeof(struct udphdr) + \ + sizeof(struct iphdr) + sizeof(struct ethhdr)) + +static void +dump_refill_skbs(void) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&dump_skb_lock, flags); + while (dump_nr_skbs < DUMP_MAX_SKBS) { + skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); + if (!skb) + break; + if (dump_skb) + skb->next = dump_skb; + else + skb->next = NULL; + dump_skb = skb; + dump_nr_skbs++; + } + spin_unlock_irqrestore(&dump_skb_lock, flags); +} + +static struct +sk_buff * dump_get_skb(void) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&dump_skb_lock, flags); + skb = dump_skb; + if (skb) { + dump_skb = skb->next; + skb->next = NULL; + dump_nr_skbs--; + } + spin_unlock_irqrestore(&dump_skb_lock, flags); + + return skb; +} + +/* + * Zap completed output skbs. + */ +static void +zap_completion_queue(void) +{ + int count; + unsigned long flags; + int cpu = smp_processor_id(); + + count=0; + if (softnet_data[cpu].completion_queue) { + struct sk_buff *clist; + + local_irq_save(flags); + clist = softnet_data[cpu].completion_queue; + softnet_data[cpu].completion_queue = NULL; + local_irq_restore(flags); + + while (clist != NULL) { + struct sk_buff *skb = clist; + clist = clist->next; + __kfree_skb(skb); + count++; + if (count > 10000) + printk("Error in sk list\n"); + } + } +} + +static void +dump_send_skb(struct net_device *dev, const char *msg, unsigned int msg_len, + reply_t *reply) +{ + int once = 1; + int total_len, eth_len, ip_len, udp_len, count = 0; + struct sk_buff *skb; + struct udphdr *udph; + struct iphdr *iph; + struct ethhdr *eth; + + udp_len = msg_len + HEADER_LEN + sizeof(*udph); + ip_len = eth_len = udp_len + sizeof(*iph); + total_len = eth_len + ETH_HLEN; + +repeat_loop: + zap_completion_queue(); + if (dump_nr_skbs < DUMP_MAX_SKBS) + dump_refill_skbs(); + + skb = alloc_skb(total_len, GFP_ATOMIC); + if (!skb) { + skb = dump_get_skb(); + if (!skb) { + count++; + if (once && (count == 1000000)) { + printk("possibly FATAL: out of netconsole " + "skbs!!! 
will keep retrying.\n"); + once = 0; + } + dev->poll_controller(dev); + goto repeat_loop; + } + } + + atomic_set(&skb->users, 1); + skb_reserve(skb, total_len - msg_len - HEADER_LEN); + skb->data[0] = NETCONSOLE_VERSION; + + put_unaligned(htonl(reply->nr), (u32 *) (skb->data + 1)); + put_unaligned(htonl(reply->code), (u32 *) (skb->data + 5)); + put_unaligned(htonl(reply->info), (u32 *) (skb->data + 9)); + + memcpy(skb->data + HEADER_LEN, msg, msg_len); + skb->len += msg_len + HEADER_LEN; + + udph = (struct udphdr *) skb_push(skb, sizeof(*udph)); + udph->source = source_port; + udph->dest = target_port; + udph->len = htons(udp_len); + udph->check = 0; + + iph = (struct iphdr *)skb_push(skb, sizeof(*iph)); + + iph->version = 4; + iph->ihl = 5; + iph->tos = 0; + iph->tot_len = htons(ip_len); + iph->id = 0; + iph->frag_off = 0; + iph->ttl = 64; + iph->protocol = IPPROTO_UDP; + iph->check = 0; + iph->saddr = source_ip; + iph->daddr = target_ip; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + + eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); + + eth->h_proto = htons(ETH_P_IP); + memcpy(eth->h_source, dev->dev_addr, dev->addr_len); + memcpy(eth->h_dest, daddr, dev->addr_len); + + count=0; +repeat_poll: + spin_lock(&dev->xmit_lock); + dev->xmit_lock_owner = smp_processor_id(); + + count++; + + + if (netif_queue_stopped(dev)) { + dev->xmit_lock_owner = -1; + spin_unlock(&dev->xmit_lock); + + dev->poll_controller(dev); + zap_completion_queue(); + + + goto repeat_poll; + } + + dev->hard_start_xmit(skb, dev); + + dev->xmit_lock_owner = -1; + spin_unlock(&dev->xmit_lock); +} + +static unsigned short +udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, + unsigned long base) +{ + return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base); +} + +static int +udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, + unsigned short ulen, u32 saddr, u32 daddr) +{ + if (uh->check == 0) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed == CHECKSUM_HW) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) + return 0; + skb->ip_summed = CHECKSUM_NONE; + } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, + IPPROTO_UDP, 0); + /* Probably, we should checksum udp header (it should be in cache + * in any case) and data in tiny packets (< rx copybreak). + */ + return 0; +} + +static __inline__ int +__udp_checksum_complete(struct sk_buff *skb) +{ + return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, + skb->csum)); +} + +static __inline__ +int udp_checksum_complete(struct sk_buff *skb) +{ + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __udp_checksum_complete(skb); +} + +int new_req = 0; +static req_t req; + +static int +dump_rx_hook(struct sk_buff *skb) +{ + int proto; + struct iphdr *iph; + struct udphdr *uh; + __u32 len, saddr, daddr, ulen; + req_t *__req; + + /* + * First check if were are dumping or doing startup handshake, if + * not quickly return. 
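+ *
+ * For reference, the only packets that make it past the checks below
+ * are UDP/IPv4 frames sent from target_ip:target_port back to
+ * source_ip:source_port whose payload is a req_t carrying either one
+ * of the handshake commands (COMM_GET_MAGIC, COMM_HELLO,
+ * COMM_START_NETDUMP_ACK, COMM_START_WRITE_NETDUMP_ACK) or the random
+ * magic previously handed out via COMM_GET_MAGIC.  The
+ * magic/command/from/to/nr words arrive in network byte order and are
+ * copied into the static 'req' for do_netdump() to act on.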
+ */ + if (!netdump_in_progress) + return NET_RX_SUCCESS; + + if (skb->dev->type != ARPHRD_ETHER) + goto out; + + proto = ntohs(skb->mac.ethernet->h_proto); + if (proto != ETH_P_IP) + goto out; + + if (skb->pkt_type == PACKET_OTHERHOST) + goto out; + + if (skb_shared(skb)) + goto out; + + /* IP header correctness testing: */ + iph = (struct iphdr *)skb->data; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + + if (iph->ihl < 5 || iph->version != 4) + goto out; + + if (!pskb_may_pull(skb, iph->ihl*4)) + goto out; + + if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) + goto out; + + len = ntohs(iph->tot_len); + if (skb->len < len || len < iph->ihl*4) + goto out; + + saddr = iph->saddr; + daddr = iph->daddr; + if (iph->protocol != IPPROTO_UDP) + goto out; + + if (source_ip != daddr) + goto out; + + if (target_ip != saddr) + goto out; + + len -= iph->ihl*4; + uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); + ulen = ntohs(uh->len); + + if (ulen != len || ulen < (sizeof(*uh) + sizeof(*__req))) + goto out; + + if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) + goto out; + + if (udp_checksum_complete(skb)) + goto out; + + if (source_port != uh->dest) + goto out; + + if (target_port != uh->source) + goto out; + + __req = (req_t *)(uh + 1); + if ((ntohl(__req->command) != COMM_GET_MAGIC) && + (ntohl(__req->command) != COMM_HELLO) && + (ntohl(__req->command) != COMM_START_WRITE_NETDUMP_ACK) && + (ntohl(__req->command) != COMM_START_NETDUMP_ACK) && + (memcmp(&__req->magic, &dump_magic, sizeof(dump_magic)) != 0)) + goto out; + + req.magic = ntohl(__req->magic); + req.command = ntohl(__req->command); + req.from = ntohl(__req->from); + req.to = ntohl(__req->to); + req.nr = ntohl(__req->nr); + new_req = 1; +out: + return NET_RX_DROP; +} + +static void +dump_send_mem(struct net_device *dev, req_t *req, const char* buff, size_t len) +{ + int i; + + int nr_chunks = len/1024; + reply_t reply; + + reply.nr = req->nr; + reply.info = 0; + + if ( nr_chunks <= 0) + nr_chunks = 1; + for (i = 0; i < nr_chunks; i++) { + unsigned int offset = i*1024; + reply.code = REPLY_MEM; + reply.info = offset; + dump_send_skb(dev, buff + offset, 1024, &reply); + } +} + +/* + * This function waits for the client to acknowledge the receipt + * of the netdump startup reply, with the possibility of packets + * getting lost. We resend the startup packet if no ACK is received, + * after a 1 second delay. + * + * (The client can test the success of the handshake via the HELLO + * command, and send ACKs until we enter netdump mode.) 
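+ *
+ * Schematically (as implemented below), the initial handshake is:
+ *
+ *	this kernel                          netdump server
+ *	-----------                          --------------
+ *	REPLY_START_NETDUMP  ------------->
+ *	                     <-------------  COMM_HELLO or
+ *	                                     COMM_START_NETDUMP_ACK
+ *
+ * For each subsequent write, REPLY_START_WRITE_NETDUMP is sent and a
+ * COMM_SEND_MEM request is awaited instead.  Each attempt polls the
+ * NIC for roughly a second; after 300 unanswered attempts the
+ * handshake is declared failed.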
+ */ +static int +dump_handshake(struct dump_dev *net_dev) +{ + char tmp[200]; + reply_t reply; + int i, j; + + if (startup_handshake) { + sprintf(tmp, "NETDUMP start, waiting for start-ACK.\n"); + reply.code = REPLY_START_NETDUMP; + reply.nr = 0; + reply.info = 0; + } else { + sprintf(tmp, "NETDUMP start, waiting for start-ACK.\n"); + reply.code = REPLY_START_WRITE_NETDUMP; + reply.nr = net_dev->curr_offset; + reply.info = net_dev->curr_offset; + } + + /* send 300 handshake packets before declaring failure */ + for (i = 0; i < 300; i++) { + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + + /* wait 1 sec */ + for (j = 0; j < 10000; j++) { + udelay(100); + dump_ndev->poll_controller(dump_ndev); + zap_completion_queue(); + if (new_req) + break; + } + + /* + * if there is no new request, try sending the handshaking + * packet again + */ + if (!new_req) + continue; + + /* + * check if the new request is of the expected type, + * if so, return, else try sending the handshaking + * packet again + */ + if (startup_handshake) { + if (req.command == COMM_HELLO || req.command == + COMM_START_NETDUMP_ACK) { + return 0; + } else { + new_req = 0; + continue; + } + } else { + if (req.command == COMM_SEND_MEM) { + return 0; + } else { + new_req = 0; + continue; + } + } + } + return -1; +} + +static ssize_t +do_netdump(struct dump_dev *net_dev, const char* buff, size_t len) +{ + reply_t reply; + char tmp[200]; + ssize_t ret = 0; + int repeatCounter, counter, total_loop; + + netdump_in_progress = 1; + + if (dump_handshake(net_dev) < 0) { + printk("network dump failed due to handshake failure\n"); + goto out; + } + + /* + * Ideally startup handshake should be done during dump configuration, + * i.e., in dump_net_open(). This will be done when I figure out + * the dependency between startup handshake, subsequent write and + * various commands wrt to net-server. 
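+ *
+ * For orientation, the request loop below services these commands:
+ *
+ *	COMM_SEND_MEM			send the current page in 1K chunks
+ *	COMM_EXIT,
+ *	COMM_START_WRITE_NETDUMP_ACK	finish this write and return
+ *	COMM_HELLO			report the netdump version
+ *	COMM_GET_PAGE_SIZE		report PAGE_SIZE
+ *	COMM_GET_NR_PAGES		report the page count
+ *	COMM_GET_MAGIC			hand out the random dump magic
+ *
+ * Anything else draws a REPLY_ERROR.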
+ */ + if (startup_handshake) + startup_handshake = 0; + + counter = 0; + repeatCounter = 0; + total_loop = 0; + while (1) { + if (!new_req) { + dump_ndev->poll_controller(dump_ndev); + zap_completion_queue(); + } + if (!new_req) { + repeatCounter++; + + if (repeatCounter > 5) { + counter++; + if (counter > 10000) { + if (total_loop >= 100000) { + printk("Time OUT LEAVE NOW\n"); + goto out; + } else { + total_loop++; + printk("Try number %d out of " + "10 before Time Out\n", + total_loop); + } + } + mdelay(1); + repeatCounter = 0; + } + continue; + } + repeatCounter = 0; + counter = 0; + total_loop = 0; + new_req = 0; + switch (req.command) { + case COMM_NONE: + break; + + case COMM_SEND_MEM: + dump_send_mem(dump_ndev, &req, buff, len); + break; + + case COMM_EXIT: + case COMM_START_WRITE_NETDUMP_ACK: + ret = len; + goto out; + + case COMM_HELLO: + sprintf(tmp, "Hello, this is netdump version " + "0.%02d\n", NETCONSOLE_VERSION); + reply.code = REPLY_HELLO; + reply.nr = req.nr; + reply.info = net_dev->curr_offset; + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + break; + + case COMM_GET_PAGE_SIZE: + sprintf(tmp, "PAGE_SIZE: %ld\n", PAGE_SIZE); + reply.code = REPLY_PAGE_SIZE; + reply.nr = req.nr; + reply.info = PAGE_SIZE; + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + break; + + case COMM_GET_NR_PAGES: + reply.code = REPLY_NR_PAGES; + reply.nr = req.nr; + reply.info = max_mapnr; + reply.info = page_counter; + sprintf(tmp, "Number of pages: %ld\n", max_mapnr); + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + break; + + case COMM_GET_MAGIC: + reply.code = REPLY_MAGIC; + reply.nr = req.nr; + reply.info = NETCONSOLE_VERSION; + dump_send_skb(dump_ndev, (char *)&dump_magic, + sizeof(dump_magic), &reply); + break; + + default: + reply.code = REPLY_ERROR; + reply.nr = req.nr; + reply.info = req.command; + sprintf(tmp, "Got unknown command code %d!\n", + req.command); + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + break; + } + } +out: + netdump_in_progress = 0; + return ret; +} + +static int +dump_validate_config(void) +{ + source_ip = dump_in_dev->ifa_list->ifa_local; + if (!source_ip) { + printk("network device %s has no local address, " + "aborting.\n", device_name); + return -1; + } + +#define IP(x) ((unsigned char *)&source_ip)[x] + printk("Source %d.%d.%d.%d", IP(0), IP(1), IP(2), IP(3)); +#undef IP + + if (!source_port) { + printk("source_port parameter not specified, aborting.\n"); + return -1; + } + printk(":%i\n", source_port); + source_port = htons(source_port); + + if (!target_ip) { + printk("target_ip parameter not specified, aborting.\n"); + return -1; + } + +#define IP(x) ((unsigned char *)&target_ip)[x] + printk("Target %d.%d.%d.%d", IP(0), IP(1), IP(2), IP(3)); +#undef IP + + if (!target_port) { + printk("target_port parameter not specified, aborting.\n"); + return -1; + } + printk(":%i\n", target_port); + target_port = htons(target_port); + + printk("Target Ethernet Address %02x:%02x:%02x:%02x:%02x:%02x", + daddr[0], daddr[1], daddr[2], daddr[3], daddr[4], daddr[5]); + + if ((daddr[0] & daddr[1] & daddr[2] & daddr[3] & daddr[4] & + daddr[5]) == 255) + printk("(Broadcast)"); + printk("\n"); + return 0; +} + +/* + * Prepares the dump device so we can take a dump later. + * Validates the netdump configuration parameters. + * + * TODO: Network connectivity check should be done here. 
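+ *
+ * A sketch of how user space is expected to get here, assuming the
+ * dump character device is visible as /dev/dump (the node name is an
+ * assumption) and that dump_flags has already been pointed at the
+ * network target via DIOSDUMPFLAGS:
+ *
+ *	fd = open("/dev/dump", O_RDWR);
+ *	ioctl(fd, DIOSTARGETIP, target_ip);
+ *	ioctl(fd, DIOSTARGETPORT, target_port);
+ *	ioctl(fd, DIOSSOURCEPORT, source_port);
+ *	ioctl(fd, DIOSETHADDR, (unsigned long)ethaddr);
+ *	ioctl(fd, DIOSDUMPDEV, (unsigned long)"eth0");
+ *
+ * The first four are serviced by dump_net_ioctl() below; DIOSDUMPDEV
+ * lands in this function with 'arg' pointing at the interface name.
+ * Exact value encodings (byte order of the IP, MAC buffer) follow
+ * whatever the netdump tools pass in.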
+ */ +static int +dump_net_open(struct dump_dev *net_dev, unsigned long arg) +{ + int retval = 0; + + /* get the interface name */ + if (copy_from_user(device_name, (void *)arg, IFNAMSIZ)) + return -EFAULT; + + if (!(dump_ndev = dev_get_by_name(device_name))) { + printk("network device %s does not exist, aborting.\n", + device_name); + return -ENODEV; + } + + if (!dump_ndev->poll_controller) { + printk("network device %s does not implement polling yet, " + "aborting.\n", device_name); + retval = -1; /* return proper error */ + goto err1; + } + + if (!(dump_in_dev = in_dev_get(dump_ndev))) { + printk("network device %s is not an IP protocol device, " + "aborting.\n", device_name); + retval = -EINVAL; + goto err1; + } + + if ((retval = dump_validate_config()) < 0) + goto err2; + + net_dev->curr_offset = 0; + printk("Network device %s successfully configured for dumping\n", + device_name); + return retval; +err2: + in_dev_put(dump_in_dev); +err1: + dev_put(dump_ndev); + return retval; +} + +/* + * Close the dump device and release associated resources + * Invoked when unconfiguring the dump device. + */ +static int +dump_net_release(struct dump_dev *net_dev) +{ + if (dump_in_dev) + in_dev_put(dump_in_dev); + if (dump_ndev) + dev_put(dump_ndev); + return 0; +} + +/* + * Prepare the dump device for use (silence any ongoing activity + * and quiesce state) when the system crashes. + */ +static int +dump_net_silence(struct dump_dev *net_dev) +{ + local_irq_save(flags_global); + dump_ndev->rx_hook = dump_rx_hook; + startup_handshake = 1; + net_dev->curr_offset = 0; + printk("Dumping to network device %s on CPU %d ...\n", device_name, + smp_processor_id()); + return 0; +} + +/* + * Invoked when dumping is done. This is the time to put things back + * (i.e. undo the effects of dump_block_silence) so the device is + * available for normal use. + */ +static int +dump_net_resume(struct dump_dev *net_dev) +{ + int indx; + reply_t reply; + char tmp[200]; + + if (!dump_ndev) + return (0); + + sprintf(tmp, "NETDUMP end.\n"); + for( indx = 0; indx < 6; indx++) { + reply.code = REPLY_END_NETDUMP; + reply.nr = 0; + reply.info = 0; + dump_send_skb(dump_ndev, tmp, strlen(tmp), &reply); + } + printk("NETDUMP END!\n"); + local_irq_restore(flags_global); + dump_ndev->rx_hook = NULL; + startup_handshake = 0; + return 0; +} + +/* + * Seek to the specified offset in the dump device. + * Makes sure this is a valid offset, otherwise returns an error. 
+ */ +static int +dump_net_seek(struct dump_dev *net_dev, loff_t off) +{ + net_dev->curr_offset = off; + return 0; +} + +/* + * + */ +static int +dump_net_write(struct dump_dev *net_dev, void *buf, unsigned long len) +{ + int cnt, i, off; + ssize_t ret; + + cnt = len/ PAGE_SIZE; + + for (i = 0; i < cnt; i++) { + off = i* PAGE_SIZE; + ret = do_netdump(net_dev, buf+off, PAGE_SIZE); + if (ret <= 0) + return -1; + net_dev->curr_offset = net_dev->curr_offset + PAGE_SIZE; + } + return len; +} + +/* + * check if the last dump i/o is over and ready for next request + */ +static int +dump_net_ready(struct dump_dev *net_dev, void *buf) +{ + return 0; +} + +/* + * ioctl function used for configuring network dump + */ +static int +dump_net_ioctl(struct dump_dev *net_dev, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case DIOSTARGETIP: + target_ip = arg; + break; + case DIOSTARGETPORT: + target_port = (u16)arg; + break; + case DIOSSOURCEPORT: + source_port = (u16)arg; + break; + case DIOSETHADDR: + return copy_from_user(daddr, (void *)arg, 6); + break; + case DIOGTARGETIP: + case DIOGTARGETPORT: + case DIOGSOURCEPORT: + case DIOGETHADDR: + break; + default: + return -EINVAL; + } + return 0; +} + +struct dump_dev_ops dump_netdev_ops = { + .open = dump_net_open, + .release = dump_net_release, + .silence = dump_net_silence, + .resume = dump_net_resume, + .seek = dump_net_seek, + .write = dump_net_write, + /* .read not implemented */ + .ready = dump_net_ready, + .ioctl = dump_net_ioctl +}; + +static struct dump_dev default_dump_netdev = { + .type_name = "networkdev", + .ops = &dump_netdev_ops, + .curr_offset = 0 +}; + +static int __init +dump_netdev_init(void) +{ + default_dump_netdev.curr_offset = 0; + + if (dump_register_device(&default_dump_netdev) < 0) { + printk("network dump device driver registration failed\n"); + return -1; + } + printk("network device driver for LKCD registered\n"); + + get_random_bytes(&dump_magic, sizeof(dump_magic)); + return 0; +} + +static void __exit +dump_netdev_cleanup(void) +{ + dump_unregister_device(&default_dump_netdev); +} + +MODULE_AUTHOR("LKCD Development Team "); +MODULE_DESCRIPTION("Network Dump Driver for Linux Kernel Crash Dump (LKCD)"); +MODULE_LICENSE("GPL"); + +module_init(dump_netdev_init); +module_exit(dump_netdev_cleanup); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/dump/dump_rle.c 900-mjb5/drivers/dump/dump_rle.c --- 001-bk10/drivers/dump/dump_rle.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/drivers/dump/dump_rle.c Sun Mar 16 13:39:02 2003 @@ -0,0 +1,175 @@ +/* + * RLE Compression functions for kernel crash dumps. + * + * Created by: Matt Robinson (yakker@sourceforge.net) + * Copyright 2001 Matt D. Robinson. All rights reserved. + * + * This code is released under version 2 of the GNU GPL. + */ + +/* header files */ +#include +#include +#include +#include +#include +#include +#include + +/* + * Name: dump_compress_rle() + * Func: Compress a DUMP_PAGE_SIZE (hardware) page down to something more + * reasonable, if possible. This is the same routine we use in IRIX. + */ +static u16 +dump_compress_rle(const u8 *old, u16 oldsize, u8 *new, u16 newsize) +{ + u16 ri, wi, count = 0; + u_char value = 0, cur_byte; + + /* + * If the block should happen to "compress" to larger than the + * buffer size, allocate a larger one and change cur_buf_size. 
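+ *
+ * A short worked example of the encoding produced below: runs are
+ * written as an escape triple { 0x00, count, value } standing for
+ * count+1 copies of value, a lone zero byte becomes { 0x00, 0x00 },
+ * and other bytes pass through literally.  So
+ *
+ *	input:	41 41 41 41 42 00 43
+ *	output:	00 03 41 42 00 00 43
+ *
+ * and a 4096 byte page of zeroes shrinks to sixteen
+ * { 0x00, 0xff, 0x00 } triples, 48 bytes in all.  If the "compressed"
+ * form would not fit within the original size, the function returns
+ * oldsize and the page is stored uncompressed.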
+ */ + + wi = ri = 0; + + while (ri < oldsize) { + if (!ri) { + cur_byte = value = old[ri]; + count = 0; + } else { + if (count == 255) { + if (wi + 3 > oldsize) { + return oldsize; + } + new[wi++] = 0; + new[wi++] = count; + new[wi++] = value; + value = cur_byte = old[ri]; + count = 0; + } else { + if ((cur_byte = old[ri]) == value) { + count++; + } else { + if (count > 1) { + if (wi + 3 > oldsize) { + return oldsize; + } + new[wi++] = 0; + new[wi++] = count; + new[wi++] = value; + } else if (count == 1) { + if (value == 0) { + if (wi + 3 > oldsize) { + return oldsize; + } + new[wi++] = 0; + new[wi++] = 1; + new[wi++] = 0; + } else { + if (wi + 2 > oldsize) { + return oldsize; + } + new[wi++] = value; + new[wi++] = value; + } + } else { /* count == 0 */ + if (value == 0) { + if (wi + 2 > oldsize) { + return oldsize; + } + new[wi++] = value; + new[wi++] = value; + } else { + if (wi + 1 > oldsize) { + return oldsize; + } + new[wi++] = value; + } + } /* if count > 1 */ + + value = cur_byte; + count = 0; + + } /* if byte == value */ + + } /* if count == 255 */ + + } /* if ri == 0 */ + ri++; + + } + if (count > 1) { + if (wi + 3 > oldsize) { + return oldsize; + } + new[wi++] = 0; + new[wi++] = count; + new[wi++] = value; + } else if (count == 1) { + if (value == 0) { + if (wi + 3 > oldsize) + return oldsize; + new[wi++] = 0; + new[wi++] = 1; + new[wi++] = 0; + } else { + if (wi + 2 > oldsize) + return oldsize; + new[wi++] = value; + new[wi++] = value; + } + } else { /* count == 0 */ + if (value == 0) { + if (wi + 2 > oldsize) + return oldsize; + new[wi++] = value; + new[wi++] = value; + } else { + if (wi + 1 > oldsize) + return oldsize; + new[wi++] = value; + } + } /* if count > 1 */ + + value = cur_byte; + count = 0; + return wi; +} + +/* setup the rle compression functionality */ +static struct __dump_compress dump_rle_compression = { + .compress_type = DUMP_COMPRESS_RLE, + .compress_func = dump_compress_rle, + .compress_name = "RLE", +}; + +/* + * Name: dump_compress_rle_init() + * Func: Initialize rle compression for dumping. + */ +static int __init +dump_compress_rle_init(void) +{ + dump_register_compression(&dump_rle_compression); + return 0; +} + +/* + * Name: dump_compress_rle_cleanup() + * Func: Remove rle compression for dumping. + */ +static void __exit +dump_compress_rle_cleanup(void) +{ + dump_unregister_compression(DUMP_COMPRESS_RLE); +} + +/* module initialization */ +module_init(dump_compress_rle_init); +module_exit(dump_compress_rle_cleanup); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("LKCD Development Team "); +MODULE_DESCRIPTION("RLE compression module for crash dump driver"); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/dump/dump_scheme.c 900-mjb5/drivers/dump/dump_scheme.c --- 001-bk10/drivers/dump/dump_scheme.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/drivers/dump/dump_scheme.c Sun Mar 16 13:39:02 2003 @@ -0,0 +1,346 @@ +/* + * Default single stage dump scheme methods + * + * Previously a part of dump_base.c + * + * Started: Oct 2002 - Suparna Bhattacharya + * Split and rewrote LKCD dump scheme to generic dump method + * interfaces + * Derived from original code created by + * Matt Robinson ) + * + * Contributions from SGI, IBM, HP, MCL, and others. + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. 
+ */ + +/* + * Implements the default dump scheme, i.e. single-stage gathering and + * saving of dump data directly to the target device, which operates in + * a push mode, where the dumping system decides what data it saves + * taking into account pre-specified dump config options. + * + * Aside: The 2-stage dump scheme, where there is a soft-reset between + * the gathering and saving phases, also reuses some of these + * default routines (see dump_overlay.c) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "dump_methods.h" + +extern int panic_timeout; /* time before reboot */ + +extern void dump_speedo(int); + +/* Default sequencer used during single stage dumping */ +/* Also invoked during stage 2 of soft-boot based dumping */ +int dump_generic_sequencer(void) +{ + struct dump_data_filter *filter = dump_config.dumper->filter; + int pass = 0, err = 0, save = 0; + int (*action)(unsigned long, unsigned long); + + /* + * We want to save the more critical data areas first in + * case we run out of space, encounter i/o failures, or get + * interrupted otherwise and have to give up midway + * So, run through the passes in increasing order + */ + for (;filter->selector; filter++, pass++) + { + /* Assumes passes are exclusive (even across dumpers) */ + /* Requires care when coding the selection functions */ + if ((save = filter->level_mask & dump_config.level)) + action = dump_save_data; + else + action = dump_skip_data; + + if ((err = dump_iterator(pass, action, filter)) < 0) + break; + + printk("\n %d dump pages %s of %d each in pass %d\n", + err, save ? "saved" : "skipped", DUMP_PAGE_SIZE, pass); + + } + + return (err < 0) ? err : 0; +} + +static inline struct page *dump_get_page(loff_t loc) +{ + unsigned long page_index = loc >> PAGE_SHIFT; + + /* todo: complete this to account for ia64/discontig mem */ + /* todo: and to check for validity, ram page, no i/o mem etc */ + /* need to use pfn/physaddr equiv of kern_addr_valid */ + if (__dump_page_valid(page_index)) + return pfn_to_page(page_index); + else + return NULL; + +} + +/* Default iterator: for singlestage and stage 1 of soft-boot dumping */ +/* Iterates over range of physical memory pages in DUMP_PAGE_SIZE increments */ +int dump_page_iterator(int pass, int (*action)(unsigned long, unsigned long), + struct dump_data_filter *filter) +{ + /* Todo : fix unit, type */ + loff_t loc; + int count = 0, err = 0; + struct page *page; + + /* Todo: Add membanks code */ + /* TBD: Check if we need to address DUMP_PAGE_SIZE < PAGE_SIZE */ + + for (loc = filter->start; loc < filter->end; loc += DUMP_PAGE_SIZE) { + dump_config.dumper->curr_loc = loc; + page = dump_get_page(loc); + if (page && filter->selector(pass, (unsigned long) page, + DUMP_PAGE_SIZE)) { + if ((err = action((unsigned long)page, DUMP_PAGE_SIZE))) + { + printk("dump_page_iterator: err %d for loc " + "0x%llx, in pass %d\n", err, loc, pass); + break; + } else + count++; + } + } + + return err ? 
err : count; +} + +/* + * Base function that saves the selected block of data in the dump + * Action taken when iterator decides that data needs to be saved + */ +int dump_generic_save_data(unsigned long loc, unsigned long sz) +{ + void *buf; + void *dump_buf = dump_config.dumper->dump_buf; + int left, bytes, ret; + + if ((ret = dump_add_data(loc, sz))) { + return ret; + } + buf = dump_config.dumper->curr_buf; + + /* If we've filled up the buffer write it out */ + if ((left = buf - dump_buf) >= DUMP_BUFFER_SIZE) { + bytes = dump_write_buffer(dump_buf, DUMP_BUFFER_SIZE); + if (bytes < DUMP_BUFFER_SIZE) { + printk("dump_write_buffer failed %d\n", bytes); + return bytes ? -ENOSPC : bytes; + } + + left -= bytes; + + /* -- A few chores to do from time to time -- */ + dump_config.dumper->count++; + + if (!(dump_config.dumper->count & 0x3f)) { + /* Update the header every one in a while */ + /* memset((void *)dump_buf, 'b', DUMP_BUFFER_SIZE);*/ + if ((ret = dump_update_header()) < 0) { + /* issue warning */ + return ret; + } + printk("."); + + touch_nmi_watchdog(); + } else if (!(dump_config.dumper->count & 0x7)) { + /* Show progress so the user knows we aren't hung */ + dump_speedo(dump_config.dumper->count >> 3); + } + /* Todo: Touch/Refresh watchdog */ + + /* --- Done with periodic chores -- */ + + + /* now adjust the leftover bits back to the top of the page */ + /* this case would not arise during stage 2 (passthru) */ + memset(dump_buf, 'z', DUMP_BUFFER_SIZE); + if (left) { + memcpy(dump_buf, dump_buf + DUMP_BUFFER_SIZE, left); + } + buf -= DUMP_BUFFER_SIZE; + dump_config.dumper->curr_buf = buf; + } + + return 0; +} + +int dump_generic_skip_data(unsigned long loc, unsigned long sz) +{ + /* dummy by default */ + return 0; +} + +/* + * Common low level routine to write a buffer to current dump device + * Expects checks for space etc to have been taken care of by the caller + * Operates serially at the moment for simplicity. + * TBD/Todo: Consider batching for improved throughput + */ +int dump_ll_write(void *buf, unsigned long len) +{ + long transferred = 0, last_transfer = 0; + int ret = 0; + + /* make sure device is ready */ + while ((ret = dump_dev_ready(NULL)) == -EAGAIN); + if (ret < 0) { + printk("dump_dev_ready failed !err %d\n", ret); + return ret; + } + + while (len) { + if ((last_transfer = dump_dev_write(buf, len)) <= 0) { + ret = last_transfer; + printk("dump_dev_write failed !err %d\n", + ret); + break; + } + /* wait till complete */ + while ((ret = dump_dev_ready(buf)) == -EAGAIN) + cpu_relax(); + + if (ret < 0) { + printk("i/o failed !err %d\n", ret); + break; + } + + len -= last_transfer; + buf += last_transfer; + transferred += last_transfer; + } + return (ret < 0) ? ret : transferred; +} + +/* default writeout routine for single dump device */ +/* writes out the dump data ensuring enough space is left for the end marker */ +int dump_generic_write_buffer(void *buf, unsigned long len) +{ + long written = 0; + int err = 0; + + /* check for space */ + if ((err = dump_dev_seek(dump_config.dumper->curr_offset + len + + 2*DUMP_BUFFER_SIZE)) < 0) { + printk("dump_write_buffer: insuff space after offset 0x%llx\n", + dump_config.dumper->curr_offset); + return err; + } + /* alignment check would happen as a side effect of this */ + if ((err = dump_dev_seek(dump_config.dumper->curr_offset)) < 0) + return err; + + written = dump_ll_write(buf, len); + + /* all or none */ + + if (written < len) + written = written ? 
-ENOSPC : written; + else + dump_config.dumper->curr_offset += len; + + return written; +} + +int dump_generic_configure(unsigned long devid) +{ + struct dump_dev *dev = dump_config.dumper->dev; + struct dump_data_filter *filter; + void *buf; + int ret = 0; + + /* Allocate the dump buffer and initialize dumper state */ + /* Assume that we get aligned addresses */ + if (!(buf = dump_alloc_mem(DUMP_BUFFER_SIZE + 2 * DUMP_PAGE_SIZE))) + return -ENOMEM; + + if ((unsigned long)buf & (PAGE_SIZE - 1)) { + /* sanity check for page aligned address */ + dump_free_mem(buf); + return -ENOMEM; /* fixme: better error code */ + } + + /* Initialize the rest of the fields */ + dump_config.dumper->dump_buf = buf; + dumper_reset(); + + /* Open the dump device */ + if (!dev) + return -ENODEV; + + if ((ret = dev->ops->open(dev, devid))) { + return ret; + } + + /* Initialise the memory ranges in the dump filter */ + for (filter = dump_config.dumper->filter ;filter->selector; filter++) { + if (!filter->start && !filter->end) { + filter->start = 0; + filter->end = max_mapnr << PAGE_SHIFT; + } + } + + return 0; +} + +int dump_generic_unconfigure(void) +{ + struct dump_dev *dev = dump_config.dumper->dev; + void *buf = dump_config.dumper->dump_buf; + int ret = 0; + + /* Close the dump device */ + if (dev && (ret = dev->ops->release(dev))) + return ret; + + if (buf) + dump_free_mem(buf); + + dump_config.dumper->curr_buf = dump_config.dumper->dump_buf = NULL; + return 0; +} + + +/* Set up the default dump scheme */ + +struct dump_scheme_ops dump_scheme_singlestage_ops = { + .configure = dump_generic_configure, + .unconfigure = dump_generic_unconfigure, + .sequencer = dump_generic_sequencer, + .iterator = dump_page_iterator, + .save_data = dump_generic_save_data, + .skip_data = dump_generic_skip_data, + .write_buffer = dump_generic_write_buffer, +}; + +struct dump_scheme dump_scheme_singlestage = { + .name = "single-stage", + .ops = &dump_scheme_singlestage_ops +}; + +/* The single stage dumper comprising all these */ +struct dumper dumper_singlestage = { + .name = "single-stage", + .scheme = &dump_scheme_singlestage, + .fmt = &dump_fmt_lcrash, + .compress = &dump_none_compression, + .filter = dump_filter_table, + .dev = NULL, +}; + diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/dump/dump_setup.c 900-mjb5/drivers/dump/dump_setup.c --- 001-bk10/drivers/dump/dump_setup.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/drivers/dump/dump_setup.c Sun Mar 16 13:39:02 2003 @@ -0,0 +1,729 @@ +/* + * Standard kernel function entry points for Linux crash dumps. + * + * Created by: Matt Robinson (yakker@sourceforge.net) + * Contributions from SGI, IBM, HP, MCL, and others. + * + * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2000 - 2002 TurboLinux, Inc. All rights reserved. + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 Free Software Foundation, Inc. All rights reserved. + * + * This code is released under version 2 of the GNU GPL. + */ + +/* + * ----------------------------------------------------------------------- + * + * DUMP HISTORY + * + * This dump code goes back to SGI's first attempts at dumping system + * memory on SGI systems running IRIX. A few developers at SGI needed + * a way to take this system dump and analyze it, and created 'icrash', + * or IRIX Crash. The mechanism (the dumps and 'icrash') were used + * by support people to generate crash reports when a system failure + * occurred. 
This was vital for large system configurations that + * couldn't apply patch after patch after fix just to hope that the + * problems would go away. So the system memory, along with the crash + * dump analyzer, allowed support people to quickly figure out what the + * problem was on the system with the crash dump. + * + * In comes Linux. SGI started moving towards the open source community, + * and upon doing so, SGI wanted to take its support utilities into Linux + * with the hopes that they would end up the in kernel and user space to + * be used by SGI's customers buying SGI Linux systems. One of the first + * few products to be open sourced by SGI was LKCD, or Linux Kernel Crash + * Dumps. LKCD comprises of a patch to the kernel to enable system + * dumping, along with 'lcrash', or Linux Crash, to analyze the system + * memory dump. A few additional system scripts and kernel modifications + * are also included to make the dump mechanism and dump data easier to + * process and use. + * + * As soon as LKCD was released into the open source community, a number + * of larger companies started to take advantage of it. Today, there are + * many community members that contribute to LKCD, and it continues to + * flourish and grow as an open source project. + */ + +/* + * DUMP TUNABLES + * + * This is the list of system tunables (via /proc) that are available + * for Linux systems. All the read, write, etc., functions are listed + * here. Currently, there are a few different tunables for dumps: + * + * dump_device (used to be dumpdev): + * The device for dumping the memory pages out to. This + * may be set to the primary swap partition for disruptive dumps, + * and must be an unused partition for non-disruptive dumps. + * Todo: In the case of network dumps, this may be interpreted + * as the IP address of the netdump server to connect to. + * + * dump_compress (used to be dump_compress_pages): + * This is the flag which indicates which compression mechanism + * to use. This is a BITMASK, not an index (0,1,2,4,8,16,etc.). + * This is the current set of values: + * + * 0: DUMP_COMPRESS_NONE -- Don't compress any pages. + * 1: DUMP_COMPRESS_RLE -- This uses RLE compression. + * 2: DUMP_COMPRESS_GZIP -- This uses GZIP compression. + * + * dump_level: + * The amount of effort the dump module should make to save + * information for post crash analysis. This value is now + * a BITMASK value, not an index: + * + * 0: Do nothing, no dumping. (DUMP_LEVEL_NONE) + * + * 1: Print out the dump information to the dump header, and + * write it out to the dump_device. (DUMP_LEVEL_HEADER) + * + * 2: Write out the dump header and all kernel memory pages. + * (DUMP_LEVEL_KERN) + * + * 4: Write out the dump header and all kernel and user + * memory pages. (DUMP_LEVEL_USED) + * + * 8: Write out the dump header and all conventional/cached + * memory (RAM) pages in the system (kernel, user, free). + * (DUMP_LEVEL_ALL_RAM) + * + * 16: Write out everything, including non-conventional memory + * like firmware, proms, I/O registers, uncached memory. + * (DUMP_LEVEL_ALL) + * + * The dump_level will default to 1. + * + * dump_flags: + * These are the flags to use when talking about dumps. There + * are lots of possibilities. This is a BITMASK value, not an index. 
+ * + * ----------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include +#include "dump_methods.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * ----------------------------------------------------------------------- + * V A R I A B L E S + * ----------------------------------------------------------------------- + */ + +/* Dump tunables */ +struct dump_config dump_config = { + .level = 0, + .flags = 0, + .dumper = NULL +}; + + +/* Global variables used in dump.h */ +/* degree of system freeze when dumping */ +enum dump_silence_levels dump_silence_level = DUMP_HARD_SPIN_CPUS; + +/* Other global fields */ +extern struct __dump_header dump_header; +struct dump_dev *dump_dev = NULL; /* Active dump device */ +int dump_device = 0; +static int dump_compress = 0; + +static u16 dump_compress_none(const u8 *old, u16 oldsize, u8 *new, u16 newsize); +struct __dump_compress dump_none_compression = { + .compress_type = DUMP_COMPRESS_NONE, + .compress_func = dump_compress_none, + .compress_name = "none", +}; + +/* our device operations and functions */ +static int dump_ioctl(struct inode *i, struct file *f, + unsigned int cmd, unsigned long arg); + +static struct file_operations dump_fops = { + .ioctl = dump_ioctl, +}; + +/* static variables */ +static int dump_okay = 0; /* can we dump out to disk? */ +static spinlock_t dump_lock = SPIN_LOCK_UNLOCKED; + +/* used for dump compressors */ +static struct list_head dump_compress_list = LIST_HEAD_INIT(dump_compress_list); + +/* list of registered dump targets */ +static struct list_head dump_target_list = LIST_HEAD_INIT(dump_target_list); + +/* lkcd info structure -- this is used by lcrash for basic system data */ +struct __lkcdinfo lkcdinfo = { + .ptrsz = (sizeof(void *) * 8), +#if defined(__LITTLE_ENDIAN) + .byte_order = __LITTLE_ENDIAN, +#else + .byte_order = __BIG_ENDIAN, +#endif + .page_shift = PAGE_SHIFT, + .page_size = PAGE_SIZE, + .page_mask = PAGE_MASK, + .page_offset = PAGE_OFFSET, +}; + +/* + * ----------------------------------------------------------------------- + * / P R O C T U N A B L E F U N C T I O N S + * ----------------------------------------------------------------------- + */ + +static int proc_dump_device(ctl_table *ctl, int write, struct file *f, + void *buffer, size_t *lenp); + +/* + * sysctl-tuning infrastructure. 
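+ *
+ * The tables below surface the tunables both under /proc/sys (as
+ * kernel/dump/<name>, with the file names coming from the
+ * DUMP_*_NAME macros, which are not spelled out in this file) and via
+ * the binary sysctl(2) interface, roughly along these lines:
+ *
+ *	int name[] = { CTL_KERN, KERN_DUMP, CTL_DUMP_LEVEL };
+ *	int level = DUMP_LEVEL_KERN;
+ *
+ *	sysctl(name, 3, NULL, NULL, &level, sizeof(level));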
+ */ +static ctl_table dump_table[] = { + { .ctl_name = CTL_DUMP_LEVEL, + .procname = DUMP_LEVEL_NAME, + .data = &dump_config.level, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, + + { .ctl_name = CTL_DUMP_FLAGS, + .procname = DUMP_FLAGS_NAME, + .data = &dump_config.flags, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, + + { .ctl_name = CTL_DUMP_COMPRESS, + .procname = DUMP_COMPRESS_NAME, + .data = &dump_compress, /* FIXME */ + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, + + { .ctl_name = CTL_DUMP_DEVICE, + .procname = DUMP_DEVICE_NAME, + .mode = 0644, + .data = &dump_device, /* FIXME */ + .maxlen = sizeof(int), + .proc_handler = proc_dump_device }, + + { 0, } +}; + +static ctl_table dump_root[] = { + { .ctl_name = KERN_DUMP, + .procname = "dump", + .mode = 0555, + .child = dump_table }, + { 0, } +}; + +static ctl_table kernel_root[] = { + { .ctl_name = CTL_KERN, + .procname = "kernel", + .mode = 0555, + .child = dump_root, }, + { 0, } +}; + +static struct ctl_table_header *sysctl_header; + +/* + * ----------------------------------------------------------------------- + * C O M P R E S S I O N F U N C T I O N S + * ----------------------------------------------------------------------- + */ + +/* + * Name: dump_compress_none() + * Func: Don't do any compression, period. + */ +static u16 +dump_compress_none(const u8 *old, u16 oldsize, u8 *new, u16 newsize) +{ + /* just return the old size */ + return oldsize; +} + + +/* + * Name: dump_execute() + * Func: Execute the dumping process. This makes sure all the appropriate + * fields are updated correctly, and calls dump_execute_memdump(), + * which does the real work. + */ +void +dump_execute(const char *panic_str, const struct pt_regs *regs) +{ + int state = -1; + unsigned long flags; + + /* make sure we can dump */ + if (!dump_okay) { + pr_info("LKCD not yet configured, can't take dump now\n"); + return; + } + + /* Exclude multiple dumps at the same time, + * and disable interrupts, some drivers may re-enable + * interrupts in with silence() + * + * Try and acquire spin lock. If successful, leave preempt + * and interrupts disabled. See spin_lock_irqsave in spinlock.h + */ + local_irq_save(flags); + if (!spin_trylock(&dump_lock)) { + local_irq_restore(flags); + pr_info("LKCD dump already in progress\n"); + return; + } + + /* Bring system into the strictest level of quiescing for min drift + * dump drivers can soften this as required in dev->ops->silence() + */ + dump_oncpu = smp_processor_id() + 1; + dump_silence_level = DUMP_HARD_SPIN_CPUS; + + state = dump_generic_execute(panic_str, regs); + + dump_oncpu = 0; + spin_unlock_irqrestore(&dump_lock, flags); + + if (state < 0) { + printk("Dump Incomplete or failed!\n"); + } else { + printk("Dump Complete; %d dump pages saved.\n", + dump_header.dh_num_dump_pages); + } +} + +/* + * Name: dump_register_compression() + * Func: Register a dump compression mechanism. + */ +void +dump_register_compression(struct __dump_compress *item) +{ + if (item) + list_add(&(item->list), &dump_compress_list); +} + +/* + * Name: dump_unregister_compression() + * Func: Remove a dump compression mechanism, and re-assign the dump + * compression pointer if necessary. 
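+ *
+ * For reference, a compressor is added the same way dump_rle.c does
+ * it elsewhere in this patch (the gzip names below are made up for
+ * the sake of the example):
+ *
+ *	static u16 my_compress(const u8 *old, u16 oldsize,
+ *			       u8 *new, u16 newsize);
+ *
+ *	static struct __dump_compress my_compression = {
+ *		.compress_type	= DUMP_COMPRESS_GZIP,
+ *		.compress_func	= my_compress,
+ *		.compress_name	= "gzip",
+ *	};
+ *
+ *	dump_register_compression(&my_compression);
+ *
+ * and removed again with
+ * dump_unregister_compression(DUMP_COMPRESS_GZIP).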
+ */ +void +dump_unregister_compression(int compression_type) +{ + struct list_head *tmp; + struct __dump_compress *dc; + + /* let's make sure our list is valid */ + if (compression_type != DUMP_COMPRESS_NONE) { + list_for_each(tmp, &dump_compress_list) { + dc = list_entry(tmp, struct __dump_compress, list); + if (dc->compress_type == compression_type) { + list_del(&(dc->list)); + break; + } + } + } +} + +/* + * Name: dump_compress_init() + * Func: Initialize (or re-initialize) compression scheme. + */ +static int +dump_compress_init(int compression_type) +{ + struct list_head *tmp; + struct __dump_compress *dc; + + /* try to remove the compression item */ + list_for_each(tmp, &dump_compress_list) { + dc = list_entry(tmp, struct __dump_compress, list); + if (dc->compress_type == compression_type) { + dump_config.dumper->compress = dc; + dump_compress = compression_type; + pr_debug("Dump Compress %s\n", dc->compress_name); + return 0; + } + } + + /* + * nothing on the list -- return ENODATA to indicate an error + * + * NB: + * EAGAIN: reports "Resource temporarily unavailable" which + * isn't very enlightening. + */ + printk("compression_type:%d not found\n", compression_type); + + return -ENODATA; +} + +static int +dumper_setup(unsigned long flags, unsigned long devid) +{ + int ret = 0; + + /* unconfigure old dumper if it exists */ + dump_okay = 0; + if (dump_config.dumper) { + pr_debug("Unconfiguring current dumper\n"); + dump_unconfigure(); + } + /* set up new dumper */ + dump_config.dumper = &dumper_singlestage; + dump_config.dumper->dev = dump_dev; + + ret = dump_configure(devid); + if (!ret) { + dump_okay = 1; + pr_debug("%s dumper set up for dev 0x%lx\n", + dump_config.dumper->name, devid); + dump_device = devid; + } else { + printk("%s dumper set up failed for dev 0x%lx\n", + dump_config.dumper->name, devid); + } + return ret; +} + +static int +dump_target_init(int target) +{ + char type[20]; + struct list_head *tmp; + struct dump_dev *dev; + + switch (target) { + case DUMP_FLAGS_DISKDUMP: + strcpy(type, "blockdev"); break; + case DUMP_FLAGS_NETDUMP: + strcpy(type, "networkdev"); break; + default: + return -1; + } + + /* + * This is a bit stupid, generating strings from flag + * and doing strcmp. This is done because 'struct dump_dev' + * has string 'type_name' and not interger 'type'. + */ + list_for_each(tmp, &dump_target_list) { + dev = list_entry(tmp, struct dump_dev, list); + if (strcmp(type, dev->type_name) == 0) { + dump_dev = dev; + return 0; + } + } + return -1; +} + +/* + * Name: dump_ioctl() + * Func: Allow all dump tunables through a standard ioctl() mechanism. + * This is far better than before, where we'd go through /proc, + * because now this will work for multiple OS and architectures. + */ +static int +dump_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg) +{ + /* check capabilities */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!dump_config.dumper && cmd == DIOSDUMPCOMPRESS) + /* dump device must be configured first */ + return -ENODEV; + + /* + * This is the main mechanism for controlling get/set data + * for various dump device parameters. The real trick here + * is setting the dump device (DIOSDUMPDEV). That's what + * triggers everything else. 
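+ *
+ * A plausible configuration sequence from user space (assuming the
+ * character device registered in dump_init() shows up as /dev/dump)
+ * would be:
+ *
+ *	fd = open("/dev/dump", O_RDWR);
+ *	ioctl(fd, DIOSDUMPFLAGS, DUMP_FLAGS_DISKDUMP);
+ *	ioctl(fd, DIOSDUMPDEV, dev_id);
+ *	ioctl(fd, DIOSDUMPLEVEL, DUMP_LEVEL_USED);
+ *	ioctl(fd, DIOSDUMPCOMPRESS, DUMP_COMPRESS_RLE);
+ *
+ * Flags must be set before the device so dump_target_init() can pick
+ * the right target type, and compression can only be chosen once a
+ * dumper is configured (see the check above).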
+ */ + switch (cmd) { + case DIOSDUMPDEV: /* set dump_device */ + pr_debug("Configuring dump device\n"); + if (!(f->f_flags & O_RDWR)) + return -EPERM; + + __dump_open(); + return dumper_setup(dump_config.flags, arg); + + + case DIOGDUMPDEV: /* get dump_device */ + return put_user((long)dump_device, (long *)arg); + + case DIOSDUMPLEVEL: /* set dump_level */ + if (!(f->f_flags & O_RDWR)) + return -EPERM; + + /* make sure we have a positive value */ + if (arg < 0) + return -EINVAL; + + /* Fixme: clean this up */ + dump_config.level = 0; + switch ((int)arg) { + case DUMP_LEVEL_ALL: + case DUMP_LEVEL_ALL_RAM: + dump_config.level |= DUMP_MASK_UNUSED; + case DUMP_LEVEL_USED: + dump_config.level |= DUMP_MASK_USED; + case DUMP_LEVEL_KERN: + dump_config.level |= DUMP_MASK_KERN; + case DUMP_LEVEL_HEADER: + dump_config.level |= DUMP_MASK_HEADER; + case DUMP_LEVEL_NONE: + break; + default: + return (-EINVAL); + } + pr_debug("Dump Level 0x%lx\n", dump_config.level); + break; + + case DIOGDUMPLEVEL: /* get dump_level */ + /* fixme: handle conversion */ + return put_user((long)dump_config.level, (long *)arg); + + + case DIOSDUMPFLAGS: /* set dump_flags */ + /* check flags */ + if (!(f->f_flags & O_RDWR)) + return -EPERM; + + /* make sure we have a positive value */ + if (arg < 0) + return -EINVAL; + + if (dump_target_init(arg & DUMP_FLAGS_TARGETMASK) < 0) + return -EINVAL; /* return proper error */ + + dump_config.flags = arg; + + pr_debug("Dump Flags 0x%lx\n", dump_config.flags); + break; + + case DIOGDUMPFLAGS: /* get dump_flags */ + return put_user((long)dump_config.flags, (long *)arg); + + case DIOSDUMPCOMPRESS: /* set the dump_compress status */ + if (!(f->f_flags & O_RDWR)) + return -EPERM; + + return dump_compress_init((int)arg); + + case DIOGDUMPCOMPRESS: /* get the dump_compress status */ + return put_user((long)(dump_config.dumper ? + dump_config.dumper->compress->compress_type : 0), + (long *)arg); + + default: + /* + * these are network dump specific ioctls, let the + * module handle them. 
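+ *
+ * Specifically DIOSTARGETIP, DIOSTARGETPORT, DIOSSOURCEPORT,
+ * DIOSETHADDR and their DIOG counterparts, which end up in the
+ * active device's ioctl method -- dump_net_ioctl() for the
+ * network target.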
+ */ + return dump_dev_ioctl(cmd, arg); + } + return 0; +} + +/* + * Handle special cases for dump_device + * changing dump device requires doing an opening the device + */ +static int +proc_dump_device(ctl_table *ctl, int write, struct file *f, + void *buffer, size_t *lenp) +{ + int *valp = ctl->data; + int oval = *valp; + int ret = -EPERM; + + /* same permission checks as ioctl */ + if (capable(CAP_SYS_ADMIN)) { + ret = proc_dointvec(ctl, write, f, buffer, lenp); + if (ret == 0 && write && *valp != oval) { + /* need to restore old value to close properly */ + dump_device = (dev_t) oval; + __dump_open(); + ret = dumper_setup(dump_config.flags, (dev_t) *valp); + } + } + + return ret; +} + +/* + * ----------------------------------------------------------------------- + * I N I T F U N C T I O N S + * ----------------------------------------------------------------------- + */ + +/* + * These register and unregister routines are exported for modules + * to register their dump drivers (like block, net etc) + */ +int +dump_register_device(struct dump_dev *ddev) +{ + struct list_head *tmp; + struct dump_dev *dev; + + list_for_each(tmp, &dump_target_list) { + dev = list_entry(tmp, struct dump_dev, list); + if (strcmp(ddev->type_name, dev->type_name) == 0) { + printk("Target type %s already registered\n", + dev->type_name); + return -1; /* return proper error */ + } + } + list_add(&(ddev->list), &dump_target_list); + + return 0; +} + +void +dump_unregister_device(struct dump_dev *ddev) +{ + list_del(&(ddev->list)); + if (ddev != dump_dev) + return; + + dump_okay = 0; + + if (dump_config.dumper) + dump_unconfigure(); + + dump_config.flags &= ~DUMP_FLAGS_TARGETMASK; + dump_okay = 0; + dump_dev = NULL; + dump_config.dumper = NULL; +} + + +#ifdef CONFIG_MAGIC_SYSRQ +/* Sysrq handler */ +static void sysrq_handle_crashdump(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) { + dump_execute("sysrq", pt_regs); +} + +static struct sysrq_key_op sysrq_crashdump_op = { + .handler = sysrq_handle_crashdump, + .help_msg = "Dump", + .action_msg = "Starting crash dump", +}; +#endif + +static inline void +dump_sysrq_register(void) +{ +#ifdef CONFIG_MAGIC_SYSRQ + __sysrq_lock_table(); + __sysrq_put_key_op(DUMP_SYSRQ_KEY, &sysrq_crashdump_op); + __sysrq_unlock_table(); +#endif +} + +static inline void +dump_sysrq_unregister(void) +{ +#ifdef CONFIG_MAGIC_SYSRQ + __sysrq_lock_table(); + if (__sysrq_get_key_op(DUMP_SYSRQ_KEY) == &sysrq_crashdump_op) + __sysrq_put_key_op(DUMP_SYSRQ_KEY, NULL); + __sysrq_unlock_table(); +#endif +} + +/* + * Name: dump_init() + * Func: Initialize the dump process. This will set up any architecture + * dependent code. The big key is we need the memory offsets before + * the page table is initialized, because the base memory offset + * is changed after paging_init() is called. 
+ */
+static int __init
+dump_init(void)
+{
+	struct sysinfo info;
+
+	/* try to create our dump device */
+	if (register_chrdev(CRASH_DUMP_MAJOR, "dump", &dump_fops)) {
+		printk("cannot register dump character device!\n");
+		return -EBUSY;
+	}
+
+	__dump_init((u64)PAGE_OFFSET);
+
+	/* set the dump_compression_list structure up */
+	dump_register_compression(&dump_none_compression);
+
+	/* grab the total memory size now (not if/when we crash) */
+	si_meminfo(&info);
+
+	/* set the memory size */
+	dump_header.dh_memory_size = (u64)info.totalram;
+
+	sysctl_header = register_sysctl_table(kernel_root, 0);
+	dump_sysrq_register();
+
+	pr_info("Crash dump driver initialized.\n");
+	return 0;
+}
+
+static void __exit
+dump_cleanup(void)
+{
+	dump_okay = 0;
+
+	if (dump_config.dumper)
+		dump_unconfigure();
+
+	/* arch-specific cleanup routine */
+	__dump_cleanup();
+
+	/* ignore errors while unregistering -- since can't do anything */
+	unregister_sysctl_table(sysctl_header);
+	unregister_chrdev(CRASH_DUMP_MAJOR, "dump");
+	dump_sysrq_unregister();
+}
+
+EXPORT_SYMBOL(dump_register_compression);
+EXPORT_SYMBOL(dump_unregister_compression);
+EXPORT_SYMBOL(dump_register_device);
+EXPORT_SYMBOL(dump_unregister_device);
+EXPORT_SYMBOL(dump_config);
+EXPORT_SYMBOL(dump_silence_level);
+
+EXPORT_SYMBOL(__dump_irq_enable);
+EXPORT_SYMBOL(__dump_irq_restore);
+
+MODULE_AUTHOR("Matt D. Robinson ");
+MODULE_DESCRIPTION("Linux Kernel Crash Dump (LKCD) driver");
+MODULE_LICENSE("GPL");
+
+module_init(dump_init);
+module_exit(dump_cleanup);
diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/net/3c59x.c 900-mjb5/drivers/net/3c59x.c
--- 001-bk10/drivers/net/3c59x.c	Fri Feb 21 23:40:47 2003
+++ 900-mjb5/drivers/net/3c59x.c	Sun Mar 16 13:39:02 2003
@@ -885,6 +885,7 @@ static void set_rx_mode(struct net_devic
 static int vortex_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
 static void vortex_tx_timeout(struct net_device *dev);
 static void acpi_set_WOL(struct net_device *dev);
+static void vorboom_poll(struct net_device *dev);
 
 /* This driver uses 'options' to pass the media type, full-duplex flag, etc. */
 /* Option count limit only -- unlimited interfaces are supported. */
@@ -1444,6 +1445,9 @@ static int __devinit vortex_probe1(struc
 	dev->set_multicast_list = set_rx_mode;
 	dev->tx_timeout = vortex_tx_timeout;
 	dev->watchdog_timeo = (watchdog * HZ) / 1000;
+#ifdef HAVE_POLL_CONTROLLER
+	dev->poll_controller = &vorboom_poll;
+#endif
 	if (pdev && vp->enable_wol) {
 		vp->pm_state_valid = 1;
 		pci_save_state(VORTEX_PCI(vp), vp->power_state);
@@ -2427,6 +2431,29 @@ static void boomerang_interrupt(int irq,
 handler_exit:
 	spin_unlock(&vp->lock);
 }
+
+#ifdef HAVE_POLL_CONTROLLER
+
+/*
+ * Polling 'interrupt' - used by things like netconsole to send skbs
+ * without having to re-enable interrupts. It's not called while
+ * the interrupt routine is executing.
+ */ + +static void vorboom_poll (struct net_device *dev) +{ + struct vortex_private *vp = (struct vortex_private *)dev->priv; + + disable_irq(dev->irq); + if (vp->full_bus_master_tx) + boomerang_interrupt(dev->irq, dev, 0); + else + vortex_interrupt(dev->irq, dev, 0); + enable_irq(dev->irq); +} + +#endif + static int vortex_rx(struct net_device *dev) { diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/net/e100/e100_main.c 900-mjb5/drivers/net/e100/e100_main.c --- 001-bk10/drivers/net/e100/e100_main.c Wed Mar 5 07:37:02 2003 +++ 900-mjb5/drivers/net/e100/e100_main.c Sun Mar 16 13:39:02 2003 @@ -551,6 +551,22 @@ e100_trigger_SWI(struct e100_private *bd readw(&(bdp->scb->scb_status)); /* flushes last write, read-safe */ } +#ifdef HAVE_POLL_CONTROLLER + +/* + * Polling 'interrupt' - used by things like netconsole to send skbs + * without having to re-enable interrupts. It's not called while + * the interrupt routine is executing. + */ +static void +e100_poll(struct net_device *dev) +{ + disable_irq(dev->irq); + e100intr(dev->irq, dev, NULL); + enable_irq(dev->irq); +} +#endif + static int __devinit e100_found1(struct pci_dev *pcid, const struct pci_device_id *ent) { @@ -569,6 +585,9 @@ e100_found1(struct pci_dev *pcid, const SET_MODULE_OWNER(dev); +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &e100_poll; +#endif if (first_time) { first_time = false; printk(KERN_NOTICE "%s - version %s\n", diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/net/e1000/e1000_main.c 900-mjb5/drivers/net/e1000/e1000_main.c --- 001-bk10/drivers/net/e1000/e1000_main.c Tue Feb 25 23:03:47 2003 +++ 900-mjb5/drivers/net/e1000/e1000_main.c Sun Mar 16 13:39:02 2003 @@ -162,6 +162,7 @@ static void e1000_leave_82542_rst(struct static inline void e1000_rx_checksum(struct e1000_adapter *adapter, struct e1000_rx_desc *rx_desc, struct sk_buff *skb); +static void e1000_Poll(struct net_device *dev); static void e1000_tx_timeout(struct net_device *dev); static void e1000_tx_timeout_task(struct net_device *dev); @@ -406,6 +407,9 @@ e1000_probe(struct pci_dev *pdev, adapter->bd_number = cards_found; adapter->id_string = e1000_strings[ent->driver_data]; +#ifdef HAVE_POLL_CONTROLLER + netdev->poll_controller = &e1000_Poll; +#endif /* setup the private structure */ if(e1000_sw_init(adapter)) @@ -1834,6 +1838,15 @@ e1000_intr(int irq, void *data, struct p } #endif } + +#ifdef HAVE_POLL_CONTROLLER +static void e1000_Poll(struct net_device *dev) +{ + disable_irq(dev->irq); + e1000_intr(dev->irq, dev, NULL); + enable_irq(dev->irq); +} +#endif #ifdef CONFIG_E1000_NAPI static int diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/net/eepro100.c 900-mjb5/drivers/net/eepro100.c --- 001-bk10/drivers/net/eepro100.c Sun Mar 16 13:38:21 2003 +++ 900-mjb5/drivers/net/eepro100.c Sun Mar 16 13:39:02 2003 @@ -542,6 +542,7 @@ static void speedo_refill_rx_buffers(str static int speedo_rx(struct net_device *dev); static void speedo_tx_buffer_gc(struct net_device *dev); static void speedo_interrupt(int irq, void *dev_instance, struct pt_regs *regs); +static void poll_speedo (struct net_device *dev); static int speedo_close(struct net_device *dev); static struct net_device_stats *speedo_get_stats(struct net_device *dev); static int speedo_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); @@ -878,6 +879,9 @@ static int __devinit speedo_found1(struc dev->get_stats = &speedo_get_stats; dev->set_multicast_list = &set_rx_mode; dev->do_ioctl = &speedo_ioctl; +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &poll_speedo; 
+#endif return 0; } @@ -1656,6 +1660,23 @@ static void speedo_interrupt(int irq, vo clear_bit(0, (void*)&sp->in_interrupt); return; } + +#ifdef HAVE_POLL_CONTROLLER + +/* + * Polling 'interrupt' - used by things like netconsole to send skbs + * without having to re-enable interrupts. It's not called while + * the interrupt routine is executing. + */ + +static void poll_speedo (struct net_device *dev) +{ + disable_irq(dev->irq); + speedo_interrupt (dev->irq, dev, NULL); + enable_irq(dev->irq); +} + +#endif static inline struct RxFD *speedo_rx_alloc(struct net_device *dev, int entry) { diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/net/smc-ultra.c 900-mjb5/drivers/net/smc-ultra.c --- 001-bk10/drivers/net/smc-ultra.c Wed Mar 5 07:37:02 2003 +++ 900-mjb5/drivers/net/smc-ultra.c Sun Mar 16 13:39:02 2003 @@ -122,6 +122,14 @@ MODULE_DEVICE_TABLE(isapnp, ultra_device #define ULTRA_IO_EXTENT 32 #define EN0_ERWCNT 0x08 /* Early receive warning count. */ + +static void ultra_poll(struct net_device *dev) +{ + disable_irq(dev->irq); + ei_interrupt(dev->irq, dev, NULL); + enable_irq(dev->irq); +} + /* Probe for the Ultra. This looks like a 8013 with the station address PROM at I/O ports +8 to +13, with a checksum following. @@ -134,6 +142,9 @@ int __init ultra_probe(struct net_device SET_MODULE_OWNER(dev); +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &ultra_poll; +#endif if (base_addr > 0x1ff) /* Check a single specified location. */ return ultra_probe1(dev, base_addr); else if (base_addr != 0) /* Don't probe at all. */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/net/tlan.c 900-mjb5/drivers/net/tlan.c --- 001-bk10/drivers/net/tlan.c Wed Mar 5 07:37:02 2003 +++ 900-mjb5/drivers/net/tlan.c Sun Mar 16 13:39:02 2003 @@ -345,6 +345,8 @@ static int TLan_EeSendByte( u16, u8, int static void TLan_EeReceiveByte( u16, u8 *, int ); static int TLan_EeReadByte( struct net_device *, u8, u8 * ); +static void TLan_Poll(struct net_device *); + static TLanIntVectorFunc *TLanIntVector[TLAN_INT_NUMBER_OF_INTS] = { TLan_HandleInvalid, @@ -854,6 +856,9 @@ static int TLan_Init( struct net_device dev->get_stats = &TLan_GetStats; dev->set_multicast_list = &TLan_SetMulticastList; dev->do_ioctl = &TLan_ioctl; +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &TLan_Poll; +#endif dev->tx_timeout = &TLan_tx_timeout; dev->watchdog_timeo = TX_TIMEOUT; @@ -1136,7 +1141,14 @@ static void TLan_HandleInterrupt(int irq } /* TLan_HandleInterrupts */ - +#ifdef HAVE_POLL_CONTROLLER +static void TLan_Poll(struct net_device *dev) +{ + disable_irq(dev->irq); + TLan_HandleInterrupt(dev->irq, dev, NULL); + enable_irq(dev->irq); +} +#endif /*************************************************************** diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/net/tulip/tulip_core.c 900-mjb5/drivers/net/tulip/tulip_core.c --- 001-bk10/drivers/net/tulip/tulip_core.c Wed Mar 5 07:37:02 2003 +++ 900-mjb5/drivers/net/tulip/tulip_core.c Sun Mar 16 13:39:02 2003 @@ -242,6 +242,7 @@ static void tulip_down(struct net_device static struct net_device_stats *tulip_get_stats(struct net_device *dev); static int private_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); static void set_rx_mode(struct net_device *dev); +static void poll_tulip(struct net_device *dev); @@ -1618,6 +1619,9 @@ static int __devinit tulip_init_one (str dev->get_stats = tulip_get_stats; dev->do_ioctl = private_ioctl; dev->set_multicast_list = set_rx_mode; +#ifdef HAVE_POLL_CONTROLLER + dev->poll_controller = &poll_tulip; +#endif if 
(register_netdev(dev)) goto err_out_free_ring; @@ -1773,6 +1777,24 @@ static void __devexit tulip_remove_one ( /* pci_power_off (pdev, -1); */ } + + +#ifdef HAVE_POLL_CONTROLLER + +/* + * Polling 'interrupt' - used by things like netconsole to send skbs + * without having to re-enable interrupts. It's not called while + * the interrupt routine is executing. + */ + +static void poll_tulip (struct net_device *dev) +{ + disable_irq(dev->irq); + tulip_interrupt (dev->irq, dev, NULL); + enable_irq(dev->irq); +} + +#endif static struct pci_driver tulip_driver = { diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/serial/8250.c 900-mjb5/drivers/serial/8250.c --- 001-bk10/drivers/serial/8250.c Sun Mar 16 13:38:21 2003 +++ 900-mjb5/drivers/serial/8250.c Sun Mar 16 13:38:57 2003 @@ -2095,9 +2095,116 @@ void serial8250_get_irq_map(unsigned int } } -static int __init serial8250_init(void) +#ifdef CONFIG_X86_REMOTE_DEBUG +/* + * Takes: + * ttyS - integer specifying which serial port to use for debugging + * baud - baud rate of specified serial port + * Returns: + * port for use by the gdb serial driver + */ +int gdb_serial_setup(int ttyS, int baud, int *port, int *irq) +{ + struct uart_8250_port *up; + unsigned cval; + int bits = 8; + int parity = 'n'; + int cflag = CREAD | HUPCL | CLOCAL; + int quot = 0; + + /* + * Now construct a cflag setting. + */ + switch(baud) { + case 1200: + cflag |= B1200; + break; + case 2400: + cflag |= B2400; + break; + case 4800: + cflag |= B4800; + break; + case 19200: + cflag |= B19200; + break; + case 38400: + cflag |= B38400; + break; + case 57600: + cflag |= B57600; + break; + case 115200: + cflag |= B115200; + break; + case 9600: + default: + cflag |= B9600; + break; + } + switch(bits) { + case 7: + cflag |= CS7; + break; + default: + case 8: + cflag |= CS8; + break; + } + switch(parity) { + case 'o': case 'O': + cflag |= PARODD; + break; + case 'e': case 'E': + cflag |= PARENB; + break; + } + + /* + * Divisor, bytesize and parity + */ + + up = &serial8250_ports[ttyS]; +// ser->flags &= ~ASYNC_BOOT_AUTOCONF; + quot = ( 1843200 / 16 ) / baud; + cval = cflag & (CSIZE | CSTOPB); + cval >>= 4; + if (cflag & PARENB) + cval |= UART_LCR_PARITY; + if (!(cflag & PARODD)) + cval |= UART_LCR_EPAR; + + /* + * Disable UART interrupts, set DTR and RTS high + * and set speed. + */ + cval = 0x3; + serial_outp(up, UART_LCR, cval | UART_LCR_DLAB); /* set DLAB */ + serial_outp(up, UART_DLL, quot & 0xff); /* LS of divisor */ + serial_outp(up, UART_DLM, quot >> 8); /* MS of divisor */ + serial_outp(up, UART_LCR, cval); /* reset DLAB */ + serial_outp(up, UART_IER, UART_IER_RDI); /* turn on interrupts*/ + serial_outp(up, UART_MCR, UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS); + + /* + * If we read 0xff from the LSR, there is no UART here. + */ + if (serial_inp(up, UART_LSR) == 0xff) + return 1; + *port = up->port.iobase; + *irq = up->port.irq; +// serial8250_shutdown(&up->port); + return 0; +} +#endif + +int serial8250_init(void) { int ret, i; + static int didit = 0; + + if (didit++) + return 0; printk(KERN_INFO "Serial: 8250/16550 driver $Revision: 1.90 $ " "IRQ sharing %sabled\n", share_irqs ? 
"en" : "dis"); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/drivers/serial/core.c 900-mjb5/drivers/serial/core.c --- 001-bk10/drivers/serial/core.c Sun Mar 16 13:38:21 2003 +++ 900-mjb5/drivers/serial/core.c Sun Mar 16 13:38:57 2003 @@ -36,6 +36,10 @@ #include #include /* for serial_state and serial_icounter_struct */ +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + #include #include @@ -1043,6 +1047,17 @@ uart_ioctl(struct tty_struct *tty, struc (unsigned int *)arg); break; +#ifdef CONFIG_X86_REMOTE_DEBUG + case TIOCGDB: + ret = -ENOTTY; + if (capable(CAP_SYS_ADMIN)) { + gdb_ttyS = minor(tty->device) & 0x03F; + gdb_baud = tty_get_baud_rate(tty); + ret = gdb_hook(); + } + break; +#endif + case TIOCMBIS: case TIOCMBIC: case TIOCMSET: @@ -1118,6 +1133,30 @@ uart_ioctl(struct tty_struct *tty, struc } return ret; } + + /* + * ------------------------------------------------------------ + * Serial GDB driver (most in gdbserial.c) + * ------------------------------------------------------------ + */ + +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_GDB_CONSOLE +static struct console gdbcons = { + name: "gdb", + write: gdb_console_write, + flags: CON_PRINTBUFFER | CON_ENABLED, + index: -1, +}; +#endif + +#ifdef CONFIG_GDB_CONSOLE +void __init gdb_console_init(void) +{ + register_console(&gdbcons); +} +#endif +#endif /* CONFIG_X86_REMOTE_DEBUG */ static void uart_set_termios(struct tty_struct *tty, struct termios *old_termios) { diff -urpN -X /home/fletch/.diff.exclude 001-bk10/fs/dcache.c 900-mjb5/fs/dcache.c --- 001-bk10/fs/dcache.c Wed Mar 5 07:37:05 2003 +++ 900-mjb5/fs/dcache.c Sun Mar 16 13:39:03 2003 @@ -24,6 +24,7 @@ #include #include #include +#include #include #define DCACHE_PARANOIA 1 @@ -1571,7 +1572,7 @@ void __init vfs_caches_init(unsigned lon filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, filp_ctor, filp_dtor); if(!filp_cachep) panic("Cannot create filp SLAB cache"); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/fs/exec.c 900-mjb5/fs/exec.c --- 001-bk10/fs/exec.c Tue Feb 25 23:03:49 2003 +++ 900-mjb5/fs/exec.c Sun Mar 16 13:39:06 2003 @@ -50,6 +50,7 @@ #include #include #include +#include #ifdef CONFIG_KMOD #include @@ -316,10 +317,11 @@ void put_dirty_page(struct task_struct * lru_cache_add_active(page); flush_dcache_page(page); flush_page_to_ram(page); + SetPageAnon(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); - tsk->mm->rss++; + increment_rss(pmd_ptpage(*pmd)); spin_unlock(&tsk->mm->page_table_lock); /* no need for flush_tlb */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/fs/ext2/balloc.c 900-mjb5/fs/ext2/balloc.c --- 001-bk10/fs/ext2/balloc.c Sat Feb 15 16:11:45 2003 +++ 900-mjb5/fs/ext2/balloc.c Sun Mar 16 13:50:09 2003 @@ -94,69 +94,62 @@ error_out: return bh; } -static inline int reserve_blocks(struct super_block *sb, int count) +static inline int group_reserve_blocks(struct ext2_sb_info *sbi, struct ext2_bg_info *bgi, + struct ext2_group_desc *desc, + struct buffer_head *bh, int count, int use_reserve) { - struct ext2_sb_info * sbi = EXT2_SB(sb); - struct ext2_super_block * es = sbi->s_es; - unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count); - unsigned root_blocks = le32_to_cpu(es->s_r_blocks_count); + unsigned free_blocks; + unsigned root_blocks; + spin_lock(&bgi->balloc_lock); + + free_blocks = le16_to_cpu(desc->bg_free_blocks_count); if (free_blocks < count) count = free_blocks; + 
root_blocks = bgi->reserved; - if (free_blocks < root_blocks + count && !capable(CAP_SYS_RESOURCE) && - sbi->s_resuid != current->fsuid && - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { - /* - * We are too close to reserve and we are not privileged. - * Can we allocate anything at all? - */ - if (free_blocks > root_blocks) - count = free_blocks - root_blocks; - else - return 0; + if (free_blocks < bgi->reserved && !use_reserve) { + /* don't use reserved blocks */ + spin_unlock(&bgi->balloc_lock); + return 0; } - - es->s_free_blocks_count = cpu_to_le32(free_blocks - count); - mark_buffer_dirty(sbi->s_sbh); - sb->s_dirt = 1; - return count; -} - -static inline void release_blocks(struct super_block *sb, int count) -{ - if (count) { - struct ext2_sb_info * sbi = EXT2_SB(sb); - struct ext2_super_block * es = sbi->s_es; - unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count); - es->s_free_blocks_count = cpu_to_le32(free_blocks + count); - mark_buffer_dirty(sbi->s_sbh); - sb->s_dirt = 1; + + if (free_blocks < bgi->reserved + count && !capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current->fsuid && + (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + /* + * We are too close to reserve and we are not privileged. + * Can we allocate anything at all? + */ + if (free_blocks > bgi->reserved) + count = free_blocks - bgi->reserved; + else { + spin_unlock(&bgi->balloc_lock); + return 0; + } } -} - -static inline int group_reserve_blocks(struct ext2_group_desc *desc, - struct buffer_head *bh, int count) -{ - unsigned free_blocks; - - if (!desc->bg_free_blocks_count) - return 0; - - free_blocks = le16_to_cpu(desc->bg_free_blocks_count); - if (free_blocks < count) - count = free_blocks; desc->bg_free_blocks_count = cpu_to_le16(free_blocks - count); + + spin_unlock(&bgi->balloc_lock); + mark_buffer_dirty(bh); return count; } -static inline void group_release_blocks(struct ext2_group_desc *desc, - struct buffer_head *bh, int count) +static inline void group_release_blocks(struct ext2_bg_info *bgi, + struct ext2_group_desc *desc, + struct buffer_head *bh, int count) { if (count) { - unsigned free_blocks = le16_to_cpu(desc->bg_free_blocks_count); + unsigned free_blocks; + + spin_lock(&bgi->balloc_lock); + + free_blocks = le16_to_cpu(desc->bg_free_blocks_count); desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count); + + spin_unlock(&bgi->balloc_lock); + mark_buffer_dirty(bh); } } @@ -172,12 +165,11 @@ void ext2_free_blocks (struct inode * in unsigned long i; unsigned long overflow; struct super_block * sb = inode->i_sb; + struct ext2_sb_info * sbi = EXT2_SB(sb); struct ext2_group_desc * desc; - struct ext2_super_block * es; + struct ext2_super_block * es = sbi->s_es; unsigned freed = 0, group_freed; - lock_super (sb); - es = EXT2_SB(sb)->s_es; if (block < le32_to_cpu(es->s_first_data_block) || block + count < block || block + count > le32_to_cpu(es->s_blocks_count)) { @@ -215,16 +207,17 @@ do_more: if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) || in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) || in_range (block, le32_to_cpu(desc->bg_inode_table), - EXT2_SB(sb)->s_itb_per_group) || + sbi->s_itb_per_group) || in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table), - EXT2_SB(sb)->s_itb_per_group)) + sbi->s_itb_per_group)) ext2_error (sb, "ext2_free_blocks", "Freeing blocks in system zones - " "Block = %lu, count = %lu", block, count); for (i = 0, group_freed = 0; i < count; i++) { - if (!ext2_clear_bit(bit + i, bitmap_bh->b_data)) + if 
(!ext2_clear_bit_atomic(&sbi->s_bgi[block_group].balloc_lock, + bit + i, (void *) bitmap_bh->b_data)) ext2_error (sb, "ext2_free_blocks", "bit already cleared for block %lu", block + i); @@ -236,7 +229,7 @@ do_more: if (sb->s_flags & MS_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); - group_release_blocks(desc, bh2, group_freed); + group_release_blocks(&sbi->s_bgi[block_group], desc, bh2, group_freed); freed += group_freed; if (overflow) { @@ -246,18 +239,18 @@ do_more: } error_return: brelse(bitmap_bh); - release_blocks(sb, freed); - unlock_super (sb); DQUOT_FREE_BLOCK(inode, freed); } -static int grab_block(char *map, unsigned size, int goal) +static int grab_block(spinlock_t *lock, char *map, unsigned size, int goal) { int k; char *p, *r; if (!ext2_test_bit(goal, map)) goto got_it; + +repeat: if (goal) { /* * The goal was occupied; search forward for a free @@ -297,7 +290,8 @@ static int grab_block(char *map, unsigne } return -1; got_it: - ext2_set_bit(goal, map); + if (ext2_set_bit_atomic(lock, goal, (void *) map)) + goto repeat; return goal; } @@ -319,7 +313,7 @@ int ext2_new_block (struct inode * inode int ret_block; /* j */ int bit; /* k */ int target_block; /* tmp */ - int block = 0; + int block = 0, use_reserve = 0; struct super_block *sb = inode->i_sb; struct ext2_sb_info *sbi = EXT2_SB(sb); struct ext2_super_block *es = sbi->s_es; @@ -341,14 +335,7 @@ int ext2_new_block (struct inode * inode prealloc_goal--; dq_alloc = prealloc_goal + 1; - - lock_super (sb); - - es_alloc = reserve_blocks(sb, dq_alloc); - if (!es_alloc) { - *err = -ENOSPC; - goto out_unlock; - } + es_alloc = dq_alloc; ext2_debug ("goal=%lu.\n", goal); @@ -360,7 +347,8 @@ int ext2_new_block (struct inode * inode if (!desc) goto io_error; - group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc); + group_alloc = group_reserve_blocks(sbi, &sbi->s_bgi[group_no], + desc, gdp_bh, es_alloc, 0); if (group_alloc) { ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) % group_size); @@ -371,11 +359,12 @@ int ext2_new_block (struct inode * inode ext2_debug("goal is at %d:%d.\n", group_no, ret_block); - ret_block = grab_block(bitmap_bh->b_data, + ret_block = grab_block(&sbi->s_bgi[group_no].balloc_lock, + bitmap_bh->b_data, group_size, ret_block); if (ret_block >= 0) goto got_block; - group_release_blocks(desc, gdp_bh, group_alloc); + group_release_blocks(&sbi->s_bgi[group_no], desc, gdp_bh, group_alloc); group_alloc = 0; } @@ -385,6 +374,7 @@ int ext2_new_block (struct inode * inode * Now search the rest of the groups. We assume that * i and desc correctly point to the last group visited. */ +repeat: for (bit = 0; !group_alloc && bit < sbi->s_groups_count; bit++) { group_no++; @@ -393,7 +383,16 @@ int ext2_new_block (struct inode * inode desc = ext2_get_group_desc(sb, group_no, &gdp_bh); if (!desc) goto io_error; - group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc); + group_alloc = group_reserve_blocks(sbi, &sbi->s_bgi[group_no], + desc, gdp_bh, es_alloc, use_reserve); + } + if (!use_reserve) { + /* first time we did not try to allocate + * reserved blocks. now it looks like + * no more non-reserved blocks left. 
we + * will try to allocate reserved blocks -bzzz */ + use_reserve = 1; + goto repeat; } if (bit >= sbi->s_groups_count) { *err = -ENOSPC; @@ -404,13 +403,11 @@ int ext2_new_block (struct inode * inode if (!bitmap_bh) goto io_error; - ret_block = grab_block(bitmap_bh->b_data, group_size, 0); + ret_block = grab_block(&sbi->s_bgi[group_no].balloc_lock, + bitmap_bh->b_data, group_size, 0); if (ret_block < 0) { - ext2_error (sb, "ext2_new_block", - "Free blocks count corrupted for block group %d", - group_no); group_alloc = 0; - goto io_error; + goto repeat; } got_block: @@ -452,7 +449,8 @@ got_block: unsigned n; for (n = 0; n < group_alloc && ++ret_block < group_size; n++) { - if (ext2_set_bit(ret_block, bitmap_bh->b_data)) + if (ext2_set_bit_atomic(&sbi->s_bgi[group_no].balloc_lock, + ret_block, (void*) bitmap_bh->b_data)) break; } *prealloc_block = block + 1; @@ -471,10 +469,7 @@ got_block: *err = 0; out_release: - group_release_blocks(desc, gdp_bh, group_alloc); - release_blocks(sb, es_alloc); -out_unlock: - unlock_super (sb); + group_release_blocks(&sbi->s_bgi[group_no], desc, gdp_bh, group_alloc); DQUOT_FREE_BLOCK(inode, dq_alloc); out: brelse(bitmap_bh); @@ -487,11 +482,11 @@ io_error: unsigned long ext2_count_free_blocks (struct super_block * sb) { -#ifdef EXT2FS_DEBUG - struct ext2_super_block * es; - unsigned long desc_count, bitmap_count, x; struct ext2_group_desc * desc; + unsigned long desc_count = 0; int i; +#ifdef EXT2FS_DEBUG + unsigned long bitmap_count, x; lock_super (sb); es = EXT2_SB(sb)->s_es; @@ -519,7 +514,13 @@ unsigned long ext2_count_free_blocks (st unlock_super (sb); return bitmap_count; #else - return le32_to_cpu(EXT2_SB(sb)->s_es->s_free_blocks_count); + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + desc = ext2_get_group_desc (sb, i, NULL); + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_blocks_count); + } + return desc_count; #endif } diff -urpN -X /home/fletch/.diff.exclude 001-bk10/fs/ext2/dir.c 900-mjb5/fs/ext2/dir.c --- 001-bk10/fs/ext2/dir.c Sun Mar 16 13:38:21 2003 +++ 900-mjb5/fs/ext2/dir.c Sun Mar 16 13:51:06 2003 @@ -259,8 +259,6 @@ ext2_readdir (struct file * filp, void * int need_revalidate = (filp->f_version != inode->i_version); int ret = 0; - lock_kernel(); - if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) goto done; @@ -313,7 +311,6 @@ done: filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset; filp->f_version = inode->i_version; UPDATE_ATIME(inode); - unlock_kernel(); return 0; } diff -urpN -X /home/fletch/.diff.exclude 001-bk10/fs/ext2/ialloc.c 900-mjb5/fs/ext2/ialloc.c --- 001-bk10/fs/ext2/ialloc.c Wed Mar 5 07:37:05 2003 +++ 900-mjb5/fs/ext2/ialloc.c Sun Mar 16 13:50:49 2003 @@ -63,6 +63,52 @@ error_out: return bh; } +void ext2_reserve_inode (struct super_block * sb, int group, int dir) +{ + struct ext2_group_desc * desc; + struct buffer_head *bh; + + desc = ext2_get_group_desc(sb, group, &bh); + if (!desc) { + ext2_error(sb, "ext2_reserve_inode", + "can't get descriptor for group %d", group); + return; + } + + spin_lock(&EXT2_SB(sb)->s_bgi[group].ialloc_lock); + desc->bg_free_inodes_count = + cpu_to_le16(le16_to_cpu(desc->bg_free_inodes_count) - 1); + if (dir) + desc->bg_used_dirs_count = + cpu_to_le16(le16_to_cpu(desc->bg_used_dirs_count) + 1); + spin_unlock(&EXT2_SB(sb)->s_bgi[group].ialloc_lock); + + mark_buffer_dirty(bh); +} + +void ext2_release_inode (struct super_block * sb, int group, int dir) +{ + struct ext2_group_desc * desc; + struct buffer_head *bh; + + desc = ext2_get_group_desc(sb, group, &bh); + if (!desc) { + 
ext2_error(sb, "ext2_release_inode", + "can't get descriptor for group %d", group); + return; + } + + spin_lock(&EXT2_SB(sb)->s_bgi[group].ialloc_lock); + desc->bg_free_inodes_count = + cpu_to_le16(le16_to_cpu(desc->bg_free_inodes_count) + 1); + if (dir) + desc->bg_used_dirs_count = + cpu_to_le16(le16_to_cpu(desc->bg_used_dirs_count) - 1); + spin_unlock(&EXT2_SB(sb)->s_bgi[group].ialloc_lock); + + mark_buffer_dirty(bh); +} + /* * NOTE! When we get the inode, we're the only people * that have access to it, and as such there are no @@ -85,10 +131,8 @@ void ext2_free_inode (struct inode * ino int is_directory; unsigned long ino; struct buffer_head *bitmap_bh = NULL; - struct buffer_head *bh2; unsigned long block_group; unsigned long bit; - struct ext2_group_desc * desc; struct ext2_super_block * es; ino = inode->i_ino; @@ -105,7 +149,6 @@ void ext2_free_inode (struct inode * ino DQUOT_DROP(inode); } - lock_super (sb); es = EXT2_SB(sb)->s_es; is_directory = S_ISDIR(inode->i_mode); @@ -126,32 +169,17 @@ void ext2_free_inode (struct inode * ino goto error_return; /* Ok, now we can actually update the inode bitmaps.. */ - if (!ext2_clear_bit(bit, bitmap_bh->b_data)) + if (!ext2_clear_bit_atomic(&EXT2_SB(sb)->s_bgi[block_group].ialloc_lock, + bit, (void *) bitmap_bh->b_data)) ext2_error (sb, "ext2_free_inode", "bit already cleared for inode %lu", ino); - else { - desc = ext2_get_group_desc (sb, block_group, &bh2); - if (desc) { - desc->bg_free_inodes_count = - cpu_to_le16(le16_to_cpu(desc->bg_free_inodes_count) + 1); - if (is_directory) { - desc->bg_used_dirs_count = - cpu_to_le16(le16_to_cpu(desc->bg_used_dirs_count) - 1); - EXT2_SB(sb)->s_dir_count--; - } - } - mark_buffer_dirty(bh2); - es->s_free_inodes_count = - cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1); - mark_buffer_dirty(EXT2_SB(sb)->s_sbh); - } + else + ext2_release_inode(sb, block_group, is_directory); mark_buffer_dirty(bitmap_bh); if (sb->s_flags & MS_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); - sb->s_dirt = 1; error_return: brelse(bitmap_bh); - unlock_super (sb); } /* @@ -211,9 +239,8 @@ static void ext2_preread_inode(struct in */ static int find_group_dir(struct super_block *sb, struct inode *parent) { - struct ext2_super_block * es = EXT2_SB(sb)->s_es; int ngroups = EXT2_SB(sb)->s_groups_count; - int avefreei = le32_to_cpu(es->s_free_inodes_count) / ngroups; + int avefreei = ext2_count_free_inodes(sb) / ngroups; struct ext2_group_desc *desc, *best_desc = NULL; struct buffer_head *bh, *best_bh = NULL; int group, best_group = -1; @@ -234,11 +261,9 @@ static int find_group_dir(struct super_b } if (!best_desc) return -1; - best_desc->bg_free_inodes_count = - cpu_to_le16(le16_to_cpu(best_desc->bg_free_inodes_count) - 1); - best_desc->bg_used_dirs_count = - cpu_to_le16(le16_to_cpu(best_desc->bg_used_dirs_count) + 1); - mark_buffer_dirty(best_bh); + + ext2_reserve_inode(sb, best_group, 1); + return best_group; } @@ -277,10 +302,12 @@ static int find_group_orlov(struct super struct ext2_super_block *es = sbi->s_es; int ngroups = sbi->s_groups_count; int inodes_per_group = EXT2_INODES_PER_GROUP(sb); - int avefreei = le32_to_cpu(es->s_free_inodes_count) / ngroups; - int avefreeb = le32_to_cpu(es->s_free_blocks_count) / ngroups; + int freei = ext2_count_free_inodes(sb); + int avefreei = freei / ngroups; + int free_blocks = ext2_count_free_blocks(sb); + int avefreeb = free_blocks / ngroups; int blocks_per_dir; - int ndirs = sbi->s_dir_count; + int ndirs = ext2_count_dirs(sb); int max_debt, max_dirs, min_blocks, min_inodes; int group = 
-1, i; struct ext2_group_desc *desc; @@ -320,8 +347,7 @@ static int find_group_orlov(struct super goto fallback; } - blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - - le32_to_cpu(es->s_free_blocks_count)) / ndirs; + blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - free_blocks) / ndirs; max_dirs = ndirs / ngroups + inodes_per_group / 16; min_inodes = avefreei - inodes_per_group / 4; @@ -340,7 +366,7 @@ static int find_group_orlov(struct super desc = ext2_get_group_desc (sb, group, &bh); if (!desc || !desc->bg_free_inodes_count) continue; - if (sbi->s_debts[group] >= max_debt) + if (sbi->s_bgi[group].debts >= max_debt) continue; if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) continue; @@ -364,12 +390,8 @@ fallback: return -1; found: - desc->bg_free_inodes_count = - cpu_to_le16(le16_to_cpu(desc->bg_free_inodes_count) - 1); - desc->bg_used_dirs_count = - cpu_to_le16(le16_to_cpu(desc->bg_used_dirs_count) + 1); - sbi->s_dir_count++; - mark_buffer_dirty(bh); + ext2_reserve_inode(sb, group, 1); + return group; } @@ -431,9 +453,8 @@ static int find_group_other(struct super return -1; found: - desc->bg_free_inodes_count = - cpu_to_le16(le16_to_cpu(desc->bg_free_inodes_count) - 1); - mark_buffer_dirty(bh); + ext2_reserve_inode(sb, group, 0); + return group; } @@ -456,7 +477,6 @@ struct inode * ext2_new_inode(struct ino return ERR_PTR(-ENOMEM); ei = EXT2_I(inode); - lock_super (sb); es = EXT2_SB(sb)->s_es; repeat: if (S_ISDIR(mode)) { @@ -480,7 +500,12 @@ repeat: EXT2_INODES_PER_GROUP(sb)); if (i >= EXT2_INODES_PER_GROUP(sb)) goto bad_count; - ext2_set_bit(i, bitmap_bh->b_data); + if (ext2_set_bit_atomic(&EXT2_SB(sb)->s_bgi[group].ialloc_lock, + i, (void *) bitmap_bh->b_data)) { + brelse(bitmap_bh); + ext2_release_inode(sb, group, S_ISDIR(mode)); + goto repeat; + } mark_buffer_dirty(bitmap_bh); if (sb->s_flags & MS_SYNCHRONOUS) @@ -497,19 +522,16 @@ repeat: goto fail2; } - es->s_free_inodes_count = - cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1); - + spin_lock(&EXT2_SB(sb)->s_bgi[group].ialloc_lock); if (S_ISDIR(mode)) { - if (EXT2_SB(sb)->s_debts[group] < 255) - EXT2_SB(sb)->s_debts[group]++; + if (EXT2_SB(sb)->s_bgi[group].debts < 255) + EXT2_SB(sb)->s_bgi[group].debts++; } else { - if (EXT2_SB(sb)->s_debts[group]) - EXT2_SB(sb)->s_debts[group]--; + if (EXT2_SB(sb)->s_bgi[group].debts) + EXT2_SB(sb)->s_bgi[group].debts--; } + spin_unlock(&EXT2_SB(sb)->s_bgi[group].ialloc_lock); - mark_buffer_dirty(EXT2_SB(sb)->s_sbh); - sb->s_dirt = 1; inode->i_uid = current->fsuid; if (test_opt (sb, GRPID)) inode->i_gid = dir->i_gid; @@ -552,7 +574,6 @@ repeat: inode->i_generation = EXT2_SB(sb)->s_next_generation++; insert_inode_hash(inode); - unlock_super(sb); if(DQUOT_ALLOC_INODE(inode)) { DQUOT_DROP(inode); goto fail3; @@ -574,15 +595,8 @@ fail3: return ERR_PTR(err); fail2: - desc = ext2_get_group_desc (sb, group, &bh2); - desc->bg_free_inodes_count = - cpu_to_le16(le16_to_cpu(desc->bg_free_inodes_count) + 1); - if (S_ISDIR(mode)) - desc->bg_used_dirs_count = - cpu_to_le16(le16_to_cpu(desc->bg_used_dirs_count) - 1); - mark_buffer_dirty(bh2); + ext2_release_inode(sb, group, S_ISDIR(mode)); fail: - unlock_super(sb); make_bad_inode(inode); iput(inode); return ERR_PTR(err); @@ -605,16 +619,19 @@ bad_count: unsigned long ext2_count_free_inodes (struct super_block * sb) { + struct ext2_group_desc *desc; + unsigned long desc_count = 0; + int i; + #ifdef EXT2FS_DEBUG struct ext2_super_block * es; - unsigned long desc_count = 0, bitmap_count = 0; + unsigned long bitmap_count = 0; struct buffer_head 
*bitmap_bh = NULL; int i; lock_super (sb); es = EXT2_SB(sb)->s_es; for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { - struct ext2_group_desc *desc; unsigned x; desc = ext2_get_group_desc (sb, i, NULL); @@ -637,7 +654,13 @@ unsigned long ext2_count_free_inodes (st unlock_super(sb); return desc_count; #else - return le32_to_cpu(EXT2_SB(sb)->s_es->s_free_inodes_count); + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + desc = ext2_get_group_desc (sb, i, NULL); + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_inodes_count); + } + return desc_count; #endif } diff -urpN -X /home/fletch/.diff.exclude 001-bk10/fs/ext2/super.c 900-mjb5/fs/ext2/super.c --- 001-bk10/fs/ext2/super.c Sun Mar 16 13:38:21 2003 +++ 900-mjb5/fs/ext2/super.c Sun Mar 16 13:50:49 2003 @@ -141,7 +141,7 @@ static void ext2_put_super (struct super if (sbi->s_group_desc[i]) brelse (sbi->s_group_desc[i]); kfree(sbi->s_group_desc); - kfree(sbi->s_debts); + kfree(sbi->s_bgi); brelse (sbi->s_sbh); sb->s_fs_info = NULL; kfree(sbi); @@ -464,8 +464,11 @@ static int ext2_check_descriptors (struc int i; int desc_block = 0; struct ext2_sb_info *sbi = EXT2_SB(sb); - unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block); + struct ext2_super_block * es = sbi->s_es; + unsigned long block = le32_to_cpu(es->s_first_data_block); struct ext2_group_desc * gdp = NULL; + unsigned int total_free = 0, free; + unsigned int reserved = le32_to_cpu(es->s_r_blocks_count); ext2_debug ("Checking group descriptors"); @@ -504,6 +507,28 @@ static int ext2_check_descriptors (struc block += EXT2_BLOCKS_PER_GROUP(sb); gdp++; } + + /* restore free blocks counter in SB -bzzz */ + es->s_free_blocks_count = total_free = ext2_count_free_blocks(sb); + es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); + + /* distribute reserved blocks over groups -bzzz */ + for(i = sbi->s_groups_count-1; reserved && total_free && i >= 0; i--) { + gdp = ext2_get_group_desc (sb, i, NULL); + if (!gdp) { + ext2_error (sb, "ext2_check_descriptors", + "cant get descriptor for group %d", i); + return 0; + } + + free = le16_to_cpu(gdp->bg_free_blocks_count); + if (free > reserved) + free = reserved; + sbi->s_bgi[i].reserved = free; + reserved -= free; + total_free -= free; + } + return 1; } @@ -768,13 +793,18 @@ static int ext2_fill_super(struct super_ printk ("EXT2-fs: not enough memory\n"); goto failed_mount; } - sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts), + sbi->s_bgi = kmalloc(sbi->s_groups_count*sizeof(struct ext2_bg_info), GFP_KERNEL); - if (!sbi->s_debts) { + if (!sbi->s_bgi) { printk ("EXT2-fs: not enough memory\n"); goto failed_mount_group_desc; } - memset(sbi->s_debts, 0, sbi->s_groups_count * sizeof(*sbi->s_debts)); + for (i = 0; i < sbi->s_groups_count; i++) { + sbi->s_bgi[i].debts = 0; + sbi->s_bgi[i].reserved = 0; + spin_lock_init(&sbi->s_bgi[i].balloc_lock); + spin_lock_init(&sbi->s_bgi[i].ialloc_lock); + } for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logic_sb_block, i); sbi->s_group_desc[i] = sb_bread(sb, block); @@ -819,8 +849,8 @@ failed_mount2: brelse(sbi->s_group_desc[i]); failed_mount_group_desc: kfree(sbi->s_group_desc); - if (sbi->s_debts) - kfree(sbi->s_debts); + if (sbi->s_bgi) + kfree(sbi->s_bgi); failed_mount: brelse(bh); failed_sbi: @@ -839,6 +869,8 @@ static void ext2_commit_super (struct su static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es) { + es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); + es->s_free_inodes_count = 
cpu_to_le32(ext2_count_free_inodes(sb)); es->s_wtime = cpu_to_le32(get_seconds()); mark_buffer_dirty(EXT2_SB(sb)->s_sbh); sync_dirty_buffer(EXT2_SB(sb)->s_sbh); @@ -867,6 +899,8 @@ void ext2_write_super (struct super_bloc ext2_debug ("setting valid to 0\n"); es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT2_VALID_FS); + es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); + es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); es->s_mtime = cpu_to_le32(get_seconds()); ext2_sync_super(sb, es); } else @@ -928,7 +962,8 @@ static int ext2_remount (struct super_bl static int ext2_statfs (struct super_block * sb, struct statfs * buf) { struct ext2_sb_info *sbi = EXT2_SB(sb); - unsigned long overhead; + unsigned long overhead, total_free = 0; + struct ext2_group_desc *desc; int i; if (test_opt (sb, MINIX_DF)) @@ -949,9 +984,14 @@ static int ext2_statfs (struct super_blo * block group descriptors. If the sparse superblocks * feature is turned on, then not all groups have this. */ - for (i = 0; i < sbi->s_groups_count; i++) + for (i = 0; i < sbi->s_groups_count; i++) { overhead += ext2_bg_has_super(sb, i) + ext2_bg_num_gdb(sb, i); + + /* sum total free blocks -bzzz */ + desc = ext2_get_group_desc (sb, i, NULL); + total_free += le16_to_cpu(desc->bg_free_blocks_count); + } /* * Every block group has an inode bitmap, a block @@ -964,7 +1004,7 @@ static int ext2_statfs (struct super_blo buf->f_type = EXT2_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = le32_to_cpu(sbi->s_es->s_blocks_count) - overhead; - buf->f_bfree = ext2_count_free_blocks (sb); + buf->f_bfree = total_free; buf->f_bavail = buf->f_bfree - le32_to_cpu(sbi->s_es->s_r_blocks_count); if (buf->f_bfree < le32_to_cpu(sbi->s_es->s_r_blocks_count)) buf->f_bavail = 0; diff -urpN -X /home/fletch/.diff.exclude 001-bk10/fs/file_table.c 900-mjb5/fs/file_table.c --- 001-bk10/fs/file_table.c Sun Mar 16 13:38:21 2003 +++ 900-mjb5/fs/file_table.c Sun Mar 16 13:39:04 2003 @@ -22,72 +22,81 @@ struct files_stat_struct files_stat = { .max_files = NR_FILE }; -/* Here the new files go */ -static LIST_HEAD(anon_list); -/* And here the free ones sit */ -static LIST_HEAD(free_list); /* public *and* exported. Not pretty! */ spinlock_t files_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +static spinlock_t filp_count_lock = SPIN_LOCK_UNLOCKED; + +/* slab constructors and destructors are called from arbitrary + * context and must be fully threaded - use a local spinlock + * to protect files_stat.nr_files + */ +void filp_ctor(void * objp, struct kmem_cache_s *cachep, unsigned long cflags) +{ + if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + unsigned long flags; + spin_lock_irqsave(&filp_count_lock, flags); + files_stat.nr_files++; + spin_unlock_irqrestore(&filp_count_lock, flags); + } +} + +void filp_dtor(void * objp, struct kmem_cache_s *cachep, unsigned long dflags) +{ + unsigned long flags; + spin_lock_irqsave(&filp_count_lock, flags); + files_stat.nr_files--; + spin_unlock_irqrestore(&filp_count_lock, flags); +} + +static inline void file_free(struct file *f) +{ + kmem_cache_free(filp_cachep, f); +} + /* Find an unused file structure and return a pointer to it. * Returns NULL, if there are no more free file structures or * we run out of memory. - * - * SMP-safe. 
*/ -struct file * get_empty_filp(void) +struct file *get_empty_filp(void) { - static int old_max = 0; +static int old_max = 0; struct file * f; - file_list_lock(); - if (files_stat.nr_free_files > NR_RESERVED_FILES) { - used_one: - f = list_entry(free_list.next, struct file, f_list); - list_del(&f->f_list); - files_stat.nr_free_files--; - new_one: - memset(f, 0, sizeof(*f)); - if (security_file_alloc(f)) { - list_add(&f->f_list, &free_list); - files_stat.nr_free_files++; - file_list_unlock(); - return NULL; - } - eventpoll_init_file(f); - atomic_set(&f->f_count,1); - f->f_version = 0; - f->f_uid = current->fsuid; - f->f_gid = current->fsgid; - f->f_owner.lock = RW_LOCK_UNLOCKED; - list_add(&f->f_list, &anon_list); - file_list_unlock(); - return f; - } - /* - * Use a reserved one if we're the superuser - */ - if (files_stat.nr_free_files && !current->euid) - goto used_one; /* - * Allocate a new one if we're below the limit. + * Privileged users can go above max_files */ - if (files_stat.nr_files < files_stat.max_files) { - file_list_unlock(); - f = kmem_cache_alloc(filp_cachep, SLAB_KERNEL); - file_list_lock(); + if (files_stat.nr_files < files_stat.max_files || + capable(CAP_SYS_ADMIN)) { + f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (f) { - files_stat.nr_files++; - goto new_one; + memset(f, 0, sizeof(*f)); + if (security_file_alloc(f)) { + file_free(f); + goto fail; + } + eventpoll_init_file(f); + atomic_set(&f->f_count, 1); + f->f_uid = current->fsuid; + f->f_gid = current->fsgid; + f->f_owner.lock = RW_LOCK_UNLOCKED; + /* f->f_version: 0 */ + INIT_LIST_HEAD(&f->f_list); + return f; } - /* Big problems... */ - printk(KERN_WARNING "VFS: filp allocation failed\n"); + } - } else if (files_stat.max_files > old_max) { - printk(KERN_INFO "VFS: file-max limit %d reached\n", files_stat.max_files); + /* Ran out of filps - report that */ + if (files_stat.max_files >= old_max) { + printk(KERN_INFO "VFS: file-max limit %d reached\n", + files_stat.max_files); old_max = files_stat.max_files; + } else { + /* Big problems... */ + printk(KERN_WARNING "VFS: filp allocation failed\n"); } - file_list_unlock(); +fail: return NULL; } @@ -106,6 +115,7 @@ int init_private_file(struct file *filp, filp->f_uid = current->fsuid; filp->f_gid = current->fsgid; filp->f_op = dentry->d_inode->i_fop; + INIT_LIST_HEAD(&filp->f_list); if (filp->f_op->open) return filp->f_op->open(dentry->d_inode, filp); else @@ -121,11 +131,11 @@ void fput(struct file * file) /* __fput is called from task context when aio completion releases the last * last use of a struct file *. Do not use otherwise. */ -void __fput(struct file * file) +void __fput(struct file *file) { - struct dentry * dentry = file->f_dentry; - struct vfsmount * mnt = file->f_vfsmnt; - struct inode * inode = dentry->d_inode; + struct dentry *dentry = file->f_dentry; + struct vfsmount *mnt = file->f_vfsmnt; + struct inode *inode = dentry->d_inode; /* * The function eventpoll_release() should be the first called @@ -144,16 +154,15 @@ void __fput(struct file * file) file->f_dentry = NULL; file->f_vfsmnt = NULL; list_del(&file->f_list); - list_add(&file->f_list, &free_list); - files_stat.nr_free_files++; file_list_unlock(); + file_free(file); dput(dentry); mntput(mnt); } -struct file * fget(unsigned int fd) +struct file *fget(unsigned int fd) { - struct file * file; + struct file *file; struct files_struct *files = current->files; read_lock(&files->file_lock); @@ -164,17 +173,14 @@ struct file * fget(unsigned int fd) return file; } -/* Here. 
put_filp() is SMP-safe now. */ - void put_filp(struct file *file) { - if(atomic_dec_and_test(&file->f_count)) { + if (atomic_dec_and_test(&file->f_count)) { security_file_free(file); file_list_lock(); list_del(&file->f_list); - list_add(&file->f_list, &free_list); - files_stat.nr_free_files++; file_list_unlock(); + file_free(file); } } @@ -183,8 +189,7 @@ void file_move(struct file *file, struct if (!list) return; file_list_lock(); - list_del(&file->f_list); - list_add(&file->f_list, list); + list_move(&file->f_list, list); file_list_unlock(); } @@ -209,7 +214,7 @@ int fs_may_remount_ro(struct super_block if (inode->i_nlink == 0) goto too_bad; - /* Writable file? */ + /* Writeable file? */ if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) goto too_bad; } diff -urpN -X /home/fletch/.diff.exclude 001-bk10/fs/inode.c 900-mjb5/fs/inode.c --- 001-bk10/fs/inode.c Sun Mar 16 13:38:21 2003 +++ 900-mjb5/fs/inode.c Sun Mar 16 13:38:54 2003 @@ -1088,16 +1088,19 @@ sector_t bmap(struct inode * inode, sect void update_atime(struct inode *inode) { - struct timespec now = CURRENT_TIME; + struct timespec now; - /* Can later do this more lazily with a per superblock interval */ - if (timespec_equal(&inode->i_atime, &now)) - return; if (IS_NOATIME(inode)) return; if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode)) return; if (IS_RDONLY(inode)) + return; + + now = CURRENT_TIME; + + /* Can later do this more lazily with a per superblock interval */ + if (timespec_equal(&inode->i_atime, &now)) return; inode->i_atime = now; mark_inode_dirty_sync(inode); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/fs/proc/proc_misc.c 900-mjb5/fs/proc/proc_misc.c --- 001-bk10/fs/proc/proc_misc.c Sun Mar 16 13:38:21 2003 +++ 900-mjb5/fs/proc/proc_misc.c Sun Mar 16 13:39:06 2003 @@ -97,6 +97,37 @@ static int loadavg_read_proc(char *page, return proc_calc_metrics(page, start, off, count, eof, len); } +static int real_loadavg_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int a, b, c, cpu; + int len; + + a = tasks_running[0] + (FIXED_1/200); + b = tasks_running[1] + (FIXED_1/200); + c = tasks_running[2] + (FIXED_1/200); + len = sprintf(page,"Domain load1 load2 load3 nr_run/nr_thrd\n"); + len += sprintf(page+len,"SYSTEM %5d.%02d %5d.%02d %5d.%02d %7ld/%7d\n", + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + nr_running(), nr_threads); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!cpu_online(cpu)) + continue; + a = cpu_tasks_running[0][cpu] + (FIXED_1/200); + b = cpu_tasks_running[1][cpu] + (FIXED_1/200); + c = cpu_tasks_running[2][cpu] + (FIXED_1/200); + len += sprintf(page+len, "%5d %5d.%02d %5d.%02d %5d.%02d %7ld/7%d\n", + cpu, + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + nr_running_cpu(cpu), nr_threads); + } + return proc_calc_metrics(page, start, off, count, eof, len); +} + static int uptime_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -133,6 +164,40 @@ static int uptime_read_proc(char *page, return proc_calc_metrics(page, start, off, count, eof, len); } +struct vmalloc_info { + unsigned long used; + unsigned long largest_chunk; +}; + +static struct vmalloc_info get_vmalloc_info(void) +{ + unsigned long prev_end = VMALLOC_START; + struct vm_struct* vma; + struct vmalloc_info vmi; + vmi.used = 0; + + read_lock(&vmlist_lock); + + if(!vmlist) + vmi.largest_chunk = (VMALLOC_END-VMALLOC_START); + else + vmi.largest_chunk = 0; + + for (vma = vmlist; vma; vma = 
vma->next) { + unsigned long free_area_size = + (unsigned long)vma->addr - prev_end; + vmi.used += vma->size; + if (vmi.largest_chunk < free_area_size ) + vmi.largest_chunk = free_area_size; + prev_end = vma->size + (unsigned long)vma->addr; + } + if(VMALLOC_END-prev_end > vmi.largest_chunk) + vmi.largest_chunk = VMALLOC_END-prev_end; + + read_unlock(&vmlist_lock); + return vmi; +} + extern atomic_t vm_committed_space; static int meminfo_read_proc(char *page, char **start, off_t off, @@ -144,6 +209,8 @@ static int meminfo_read_proc(char *page, unsigned long inactive; unsigned long active; unsigned long free; + unsigned long vmtot; + struct vmalloc_info vmi; get_page_state(&ps); get_zone_counts(&active, &inactive, &free); @@ -156,6 +223,11 @@ static int meminfo_read_proc(char *page, si_swapinfo(&i); committed = atomic_read(&vm_committed_space); + vmtot = (VMALLOC_END-VMALLOC_START)>>10; + vmi = get_vmalloc_info(); + vmi.used >>= 10; + vmi.largest_chunk >>= 10; + /* * Tagged format, for easy grepping and expansion. */ @@ -179,7 +251,10 @@ static int meminfo_read_proc(char *page, "Slab: %8lu kB\n" "Committed_AS: %8u kB\n" "PageTables: %8lu kB\n" - "ReverseMaps: %8lu\n", + "ReverseMaps: %8lu\n" + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n", K(i.totalram), K(i.freeram), K(i.bufferram), @@ -199,7 +274,10 @@ static int meminfo_read_proc(char *page, K(ps.nr_slab), K(committed), K(ps.nr_page_table_pages), - ps.nr_reverse_maps + ps.nr_reverse_maps, + vmtot, + vmi.used, + vmi.largest_chunk ); len += hugetlb_report_meminfo(page + len); @@ -257,6 +335,9 @@ static struct file_operations proc_vmsta .release = seq_release, }; +extern int schedstats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data); + #ifdef CONFIG_PROC_HARDWARE static int hardware_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -301,6 +382,71 @@ static struct file_operations proc_modul }; #endif +#ifdef CONFIG_NUMA +#define K(x) ((x) << (PAGE_SHIFT - 10)) +static int show_meminfo_numa (struct seq_file *m, void *v) +{ + int *d = v; + int nid = *d; + struct sysinfo i; + si_meminfo_node(&i, nid); + seq_printf(m, "\n" + "Node %d MemTotal: %8lu kB\n" + "Node %d MemFree: %8lu kB\n" + "Node %d MemUsed: %8lu kB\n" + "Node %d HighTotal: %8lu kB\n" + "Node %d HighFree: %8lu kB\n" + "Node %d LowTotal: %8lu kB\n" + "Node %d LowFree: %8lu kB\n", + nid, K(i.totalram), + nid, K(i.freeram), + nid, K(i.totalram-i.freeram), + nid, K(i.totalhigh), + nid, K(i.freehigh), + nid, K(i.totalram-i.totalhigh), + nid, K(i.freeram-i.freehigh)); + + return 0; +} +#undef K + +extern struct seq_operations meminfo_numa_op; +static int meminfo_numa_open(struct inode *inode, struct file *file) +{ + return seq_open(file,&meminfo_numa_op); +} + +static struct file_operations proc_meminfo_numa_operations = { + open: meminfo_numa_open, + read: seq_read, + llseek: seq_lseek, + release: seq_release, +}; + +static void *meminfo_numa_start(struct seq_file *m, loff_t *pos) +{ + return *pos < numnodes ? 
pos : NULL; +} + +static void *meminfo_numa_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return meminfo_numa_start(m, pos); +} + +static void meminfo_numa_stop(struct seq_file *m, void *v) +{ +} + +struct seq_operations meminfo_numa_op = { + .start = meminfo_numa_start, + .next = meminfo_numa_next, + .stop = meminfo_numa_stop, + .show = show_meminfo_numa, +}; + +#endif + extern struct seq_operations slabinfo_op; extern ssize_t slabinfo_write(struct file *, const char *, size_t, loff_t *); static int slabinfo_open(struct inode *inode, struct file *file) @@ -531,6 +677,36 @@ static void create_seq_entry(char *name, entry->proc_fops = f; } +#ifdef CONFIG_LOCKMETER +extern ssize_t get_lockmeter_info(char *, size_t, loff_t *); +extern ssize_t put_lockmeter_info(const char *, size_t); +extern int get_lockmeter_info_size(void); + +/* + * This function accesses lock metering information. + */ +static ssize_t read_lockmeter(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + return get_lockmeter_info(buf, count, ppos); +} + +/* + * Writing to /proc/lockmeter resets the counters + */ +static ssize_t write_lockmeter(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + return put_lockmeter_info(buf, count); +} + +static struct file_operations proc_lockmeter_operations = { + NULL, /* lseek */ + read: read_lockmeter, + write: write_lockmeter, +}; +#endif /* CONFIG_LOCKMETER */ + void __init proc_misc_init(void) { struct proc_dir_entry *entry; @@ -539,6 +715,7 @@ void __init proc_misc_init(void) int (*read_proc)(char*,char**,off_t,int,int*,void*); } *p, simple_ones[] = { {"loadavg", loadavg_read_proc}, + {"real_loadavg",real_loadavg_read_proc}, {"uptime", uptime_read_proc}, {"meminfo", meminfo_read_proc}, {"version", version_read_proc}, @@ -557,6 +734,7 @@ void __init proc_misc_init(void) #endif {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, + {"schedstat", schedstats_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) @@ -579,6 +757,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_MODULES create_seq_entry("modules", 0, &proc_modules_operations); #endif +#ifdef CONFIG_NUMA + create_seq_entry("meminfo.numa",0,&proc_meminfo_numa_operations); +#endif proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { proc_root_kcore->proc_fops = &proc_kcore_operations; @@ -592,6 +773,13 @@ void __init proc_misc_init(void) entry->size = (1+prof_len) * sizeof(unsigned int); } } +#ifdef CONFIG_LOCKMETER + entry = create_proc_entry("lockmeter", S_IWUSR | S_IRUGO, NULL); + if (entry) { + entry->proc_fops = &proc_lockmeter_operations; + entry->size = get_lockmeter_info_size(); + } +#endif #ifdef CONFIG_PPC32 { extern struct file_operations ppc_htab_operations; diff -urpN -X /home/fletch/.diff.exclude 001-bk10/fs/reiserfs/inode.c 900-mjb5/fs/reiserfs/inode.c --- 001-bk10/fs/reiserfs/inode.c Thu Feb 13 11:08:12 2003 +++ 900-mjb5/fs/reiserfs/inode.c Sun Mar 16 13:39:07 2003 @@ -304,7 +304,7 @@ research: ** read old data off disk. 
Set the up to date bit on the buffer instead ** and jump to the end */ - if (PageUptodate(bh_result->b_page)) { + if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { set_buffer_uptodate(bh_result); goto finished ; } @@ -418,6 +418,40 @@ static int reiserfs_get_block_create_0 ( return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE) ; } +static int reiserfs_get_block_direct_io (struct inode * inode, + sector_t iblock, unsigned long max_blocks, + struct buffer_head * bh_result, int create) { + int ret ; + + bh_result->b_size = (1 << inode->i_blkbits); + bh_result->b_page = NULL; + + ret = reiserfs_get_block(inode, iblock, bh_result, create) ; + + if (ret != 0) + return ret; + + /* don't allow direct io onto tail pages */ + if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { + /* make sure future calls to the direct io funcs for this offset + ** in the file fail by unmapping the buffer + */ + reiserfs_unmap_buffer(bh_result); + ret = -EINVAL ; + } + + /* Possible unpacked tail. Flush the data before pages have + disappeared */ + if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { + lock_kernel(); + reiserfs_commit_for_inode(inode); + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + unlock_kernel(); + } + + return ret ; +} + /* ** helper function for when reiserfs_get_block is called for a hole ** but the file tail is still in a direct item @@ -446,7 +480,7 @@ static int convert_tail_for_hole(struct tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ; index = tail_offset >> PAGE_CACHE_SHIFT ; - if (index != hole_page->index) { + if (!hole_page || index != hole_page->index) { tail_page = grab_cache_page(inode->i_mapping, index) ; retval = -ENOMEM; if (!tail_page) { @@ -552,7 +586,15 @@ int reiserfs_get_block (struct inode * i return ret; } - REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ; + /* If file is of such a size, that it might have a tail and tails are enabled + ** we should mark it as possibly needing tail packing on close + */ + if ( (have_large_tails (inode->i_sb) && + inode->i_size < i_block_size (inode)*4) || + (have_small_tails (inode->i_sb) && + inode->i_size < i_block_size(inode)) ) + + REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ; windex = push_journal_writer("reiserfs_get_block") ; @@ -742,22 +784,27 @@ int reiserfs_get_block (struct inode * i ** the disk */ set_buffer_uptodate (unbh); - - /* we've converted the tail, so we must - ** flush unbh before the transaction commits - */ - add_to_flushlist(inode, unbh) ; - - /* mark it dirty now to prevent commit_write from adding - ** this buffer to the inode's dirty buffer list - */ + /* unbh->b_page == NULL in case of DIRECT_IO request, this means + buffer will disappear shortly, so it should not be added to + any of our lists. + */ + if ( unbh->b_page ) { + /* we've converted the tail, so we must + ** flush unbh before the transaction commits + */ + add_to_flushlist(inode, unbh) ; + + /* mark it dirty now to prevent commit_write from adding + ** this buffer to the inode's dirty buffer list + */ /* * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty(). 
* It's still atomic, but it sets the page dirty too, * which makes it eligible for writeback at any time by the * VM (which was also the case with __mark_buffer_dirty()) */ - mark_buffer_dirty(unbh) ; + mark_buffer_dirty(unbh) ; + } //inode->i_blocks += inode->i_sb->s_blocksize / 512; //mark_tail_converted (inode); @@ -2156,6 +2203,15 @@ static int reiserfs_commit_write(struct if (pos > inode->i_size) { struct reiserfs_transaction_handle th ; reiserfs_write_lock(inode->i_sb); + /* If the file have grown beyond the border where it + can have a tail, unmark it as needing a tail + packing */ + if ( (have_large_tails (inode->i_sb) && + inode->i_size < i_block_size(inode)*4) || + (have_small_tails (inode->i_sb) && + inode->i_size < i_block_size(inode)) ) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ; + journal_begin(&th, inode->i_sb, 1) ; reiserfs_update_inode_transaction(inode) ; inode->i_size = pos ; @@ -2214,6 +2270,17 @@ static int reiserfs_releasepage(struct p return ret ; } +static int reiserfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_dentry->d_inode->i_mapping->host; + + return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, reiserfs_get_block_direct_io); +} + struct address_space_operations reiserfs_address_space_operations = { .writepage = reiserfs_writepage, .readpage = reiserfs_readpage, @@ -2222,5 +2289,6 @@ struct address_space_operations reiserfs .sync_page = block_sync_page, .prepare_write = reiserfs_prepare_write, .commit_write = reiserfs_commit_write, - .bmap = reiserfs_aop_bmap + .bmap = reiserfs_aop_bmap, + .direct_IO = reiserfs_direct_IO } ; diff -urpN -X /home/fletch/.diff.exclude 001-bk10/fs/reiserfs/tail_conversion.c 900-mjb5/fs/reiserfs/tail_conversion.c --- 001-bk10/fs/reiserfs/tail_conversion.c Sun Nov 17 20:29:59 2002 +++ 900-mjb5/fs/reiserfs/tail_conversion.c Sun Mar 16 13:39:07 2003 @@ -105,8 +105,10 @@ int direct2indirect (struct reiserfs_tra /* we only send the unbh pointer if the buffer is not up to date. ** this avoids overwriting good data from writepage() with old data ** from the disk or buffer cache + ** Special case: unbh->b_page will be NULL if we are coming through + ** DIRECT_IO handler here. 
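For reference, the contract reiserfs_direct_IO() relies on above is the usual blockdev_direct_IO() one: the filesystem callback fills bh_result with a block mapping and leaves bh_result->b_page alone -- it is NULL on this path, which is exactly the marker the reiserfs get_block and tail-conversion code test for. The callback below is a hypothetical, minimal sketch of that contract (a pretend 1:1 block mapping), not reiserfs code.

    static int example_get_blocks(struct inode *inode, sector_t iblock,
                                  unsigned long max_blocks,
                                  struct buffer_head *bh_result, int create)
    {
            /* map exactly one block; a real filesystem may map up to max_blocks */
            bh_result->b_size = 1 << inode->i_blkbits;
            bh_result->b_bdev = inode->i_sb->s_bdev;
            bh_result->b_blocknr = iblock;    /* pretend file block N == device block N */
            set_buffer_mapped(bh_result);
            return 0;
    }

    /* an address_space ->direct_IO method then just forwards to the generic helper:
     *      return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
     *                                iov, offset, nr_segs, example_get_blocks);
     */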
*/ - if (buffer_uptodate(unbh) || PageUptodate(unbh->b_page)) { + if (!unbh->b_page || buffer_uptodate(unbh) || PageUptodate(unbh->b_page)) { up_to_date_bh = NULL ; } else { up_to_date_bh = unbh ; diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-alpha/bitops.h 900-mjb5/include/asm-alpha/bitops.h --- 001-bk10/include/asm-alpha/bitops.h Wed Mar 5 07:37:06 2003 +++ 900-mjb5/include/asm-alpha/bitops.h Sun Mar 16 13:50:09 2003 @@ -487,7 +487,9 @@ sched_find_first_bit(unsigned long b[3]) #define ext2_set_bit __test_and_set_bit +#define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a) #define ext2_clear_bit __test_and_clear_bit +#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a) #define ext2_test_bit test_bit #define ext2_find_first_zero_bit find_first_zero_bit #define ext2_find_next_zero_bit find_next_zero_bit diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-arm/bitops.h 900-mjb5/include/asm-arm/bitops.h --- 001-bk10/include/asm-arm/bitops.h Wed Mar 5 07:37:06 2003 +++ 900-mjb5/include/asm-arm/bitops.h Sun Mar 16 13:50:09 2003 @@ -357,8 +357,12 @@ static inline int sched_find_first_bit(u */ #define ext2_set_bit(nr,p) \ __test_and_set_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p)) +#define ext2_set_bit_atomic(lock,nr,p) \ + test_and_set_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p)) #define ext2_clear_bit(nr,p) \ __test_and_clear_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p)) +#define ext2_clear_bit_atomic(lock,nr,p) \ + test_and_clear_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p)) #define ext2_test_bit(nr,p) \ __test_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p)) #define ext2_find_first_zero_bit(p,sz) \ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-cris/bitops.h 900-mjb5/include/asm-cris/bitops.h --- 001-bk10/include/asm-cris/bitops.h Sun Nov 17 20:29:56 2002 +++ 900-mjb5/include/asm-cris/bitops.h Sun Mar 16 13:50:09 2003 @@ -360,7 +360,9 @@ static inline int find_next_zero_bit (vo #define hweight8(x) generic_hweight8(x) #define ext2_set_bit test_and_set_bit +#define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a) #define ext2_clear_bit test_and_clear_bit +#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a) #define ext2_test_bit test_bit #define ext2_find_first_zero_bit find_first_zero_bit #define ext2_find_next_zero_bit find_next_zero_bit diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-generic/rmap.h 900-mjb5/include/asm-generic/rmap.h --- 001-bk10/include/asm-generic/rmap.h Thu Feb 13 11:08:13 2003 +++ 900-mjb5/include/asm-generic/rmap.h Sun Mar 16 13:39:06 2003 @@ -26,39 +26,12 @@ */ #include -static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address) -{ -#ifdef BROKEN_PPC_PTE_ALLOC_ONE - /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... 
;( */ - extern int mem_init_done; - - if (!mem_init_done) - return; -#endif - page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); - inc_page_state(nr_page_table_pages); -} - -static inline void pgtable_remove_rmap(struct page * page) -{ - page->mapping = NULL; - page->index = 0; - dec_page_state(nr_page_table_pages); -} - -static inline struct mm_struct * ptep_to_mm(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - return (struct mm_struct *) page->mapping; -} - static inline unsigned long ptep_to_address(pte_t * ptep) { - struct page * page = kmap_atomic_to_page(ptep); + struct ptpage * page = (struct ptpage *)kmap_atomic_to_page(ptep); unsigned long low_bits; low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; - return page->index + low_bits; + return page->virtual + low_bits; } #if CONFIG_HIGHPTE @@ -86,5 +59,10 @@ static inline void rmap_ptep_unmap(pte_t return; } #endif + +extern void pgtable_add_rmap(struct ptpage * ptepage, struct mm_struct * mm, unsigned long address); +extern void pgtable_add_rmap_locked(struct ptpage * ptepage, struct mm_struct * mm, unsigned long address); +extern void pgtable_remove_rmap(struct ptpage * ptepage, struct mm_struct *mm); +extern void pgtable_remove_rmap_locked(struct ptpage * ptepage, struct mm_struct *mm); #endif /* _GENERIC_RMAP_H */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-generic/tlb.h 900-mjb5/include/asm-generic/tlb.h --- 001-bk10/include/asm-generic/tlb.h Mon Jan 13 21:09:27 2003 +++ 900-mjb5/include/asm-generic/tlb.h Sun Mar 16 13:39:06 2003 @@ -84,13 +84,6 @@ tlb_flush_mmu(struct mmu_gather *tlb, un static inline void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { - int freed = tlb->freed; - struct mm_struct *mm = tlb->mm; - int rss = mm->rss; - - if (rss < freed) - freed = rss; - mm->rss = rss - freed; tlb_flush_mmu(tlb, start, end); /* keep the page table cache within bounds */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/bitops.h 900-mjb5/include/asm-i386/bitops.h --- 001-bk10/include/asm-i386/bitops.h Fri Dec 13 23:18:12 2002 +++ 900-mjb5/include/asm-i386/bitops.h Sun Mar 16 13:50:09 2003 @@ -479,8 +479,12 @@ static __inline__ int ffs(int x) #define ext2_set_bit(nr,addr) \ __test_and_set_bit((nr),(unsigned long*)addr) +#define ext2_set_bit_atomic(lock,nr,addr) \ + test_and_set_bit((nr),(unsigned long*)addr) #define ext2_clear_bit(nr, addr) \ __test_and_clear_bit((nr),(unsigned long*)addr) +#define ext2_clear_bit_atomic(lock,nr, addr) \ + test_and_clear_bit((nr),(unsigned long*)addr) #define ext2_test_bit(nr, addr) test_bit((nr),(unsigned long*)addr) #define ext2_find_first_zero_bit(addr, size) \ find_first_zero_bit((unsigned long*)addr, size) diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/bug.h 900-mjb5/include/asm-i386/bug.h --- 001-bk10/include/asm-i386/bug.h Sun Mar 16 13:38:21 2003 +++ 900-mjb5/include/asm-i386/bug.h Sun Mar 16 13:38:57 2003 @@ -9,6 +9,11 @@ * undefined" opcode for parsing in the trap handler. 
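A quick worked example for the ptep_to_address() change in the asm-generic/rmap.h hunk above, assuming i386 numbers (4K pages, PTRS_PER_PTE = 1024, sizeof(pte_t) = 4):

    /*
     * For a pte at byte offset b inside its page-table page:
     *
     *      pte index            = b / sizeof(pte_t) = b / 4
     *      bytes mapped per pte = PAGE_SIZE          = 4096
     *      low_bits = (b / 4) * 4096 = b * 1024 = b * PTRS_PER_PTE
     *
     * so "(ptep & ~PAGE_MASK) * PTRS_PER_PTE" is just a cheaper spelling of
     * (offset / sizeof(pte_t)) * PAGE_SIZE.  Adding it to the page-table
     * page's base user virtual address (page->index before this patch,
     * page->virtual in the new struct ptpage) recovers the address the pte
     * maps, e.g. b = 0x40 -> the 16th pte -> base + 0x10000.
     */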
*/ +#ifdef CONFIG_X86_REMOTE_DEBUG +#define BUG() do { \ + asm ("int $0x3"); \ +} while (0) +#else #if 1 /* Set to zero for a slightly smaller kernel */ #define BUG() \ __asm__ __volatile__( "ud2\n" \ @@ -17,6 +22,7 @@ : : "i" (__LINE__), "i" (__FILE__)) #else #define BUG() __asm__ __volatile__("ud2\n") +#endif #endif #define PAGE_BUG(page) do { \ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/dump.h 900-mjb5/include/asm-i386/dump.h --- 001-bk10/include/asm-i386/dump.h Wed Dec 31 16:00:00 1969 +++ 900-mjb5/include/asm-i386/dump.h Sun Mar 16 13:39:02 2003 @@ -0,0 +1,75 @@ +/* + * Kernel header file for Linux crash dumps. + * + * Created by: Matt Robinson (yakker@sgi.com) + * + * Copyright 1999 Silicon Graphics, Inc. All rights reserved. + * + * This code is released under version 2 of the GNU GPL. + */ + +/* This header file holds the architecture specific crash dump header */ +#ifndef _ASM_DUMP_H +#define _ASM_DUMP_H + +/* necessary header files */ +#include +#include +#include +#include + +/* definitions */ +#define DUMP_ASM_MAGIC_NUMBER 0xdeaddeadULL /* magic number */ +#define DUMP_ASM_VERSION_NUMBER 0x3 /* version number */ + +/* max number of cpus */ +#define DUMP_MAX_NUM_CPUS 32 + +/* + * Structure: __dump_header_asm + * Function: This is the header for architecture-specific stuff. It + * follows right after the dump header. + */ +struct __dump_header_asm { + /* the dump magic number -- unique to verify dump is valid */ + u64 dha_magic_number; + + /* the version number of this dump */ + u32 dha_version; + + /* the size of this header (in case we can't read it) */ + u32 dha_header_size; + + /* the esp for i386 systems */ + u32 dha_esp; + + /* the eip for i386 systems */ + u32 dha_eip; + + /* the dump registers */ + struct pt_regs dha_regs; + + /* smp specific */ + u32 dha_smp_num_cpus; + u32 dha_dumping_cpu; + struct pt_regs dha_smp_regs[DUMP_MAX_NUM_CPUS]; + u32 dha_smp_current_task[DUMP_MAX_NUM_CPUS]; + u32 dha_stack[DUMP_MAX_NUM_CPUS]; + u32 dha_stack_ptr[DUMP_MAX_NUM_CPUS]; +} __attribute__((packed)); + +#ifdef __KERNEL__ + +extern struct __dump_header_asm dump_header_asm; + +#ifdef CONFIG_SMP +extern unsigned long irq_affinity[]; +extern int (*dump_ipi_function_ptr)(struct pt_regs *); +extern void dump_send_ipi(void); +#else +#define dump_send_ipi() do { } while(0) +#endif + +#endif /* __KERNEL__ */ + +#endif /* _ASM_DUMP_H */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/e820.h 900-mjb5/include/asm-i386/e820.h --- 001-bk10/include/asm-i386/e820.h Sun Nov 17 20:29:50 2002 +++ 900-mjb5/include/asm-i386/e820.h Sun Mar 16 13:39:02 2003 @@ -35,6 +35,7 @@ struct e820map { }; extern struct e820map e820; + #endif/*!__ASSEMBLY__*/ #endif/*__E820_HEADER*/ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/early_printk.h 900-mjb5/include/asm-i386/early_printk.h --- 001-bk10/include/asm-i386/early_printk.h Wed Dec 31 16:00:00 1969 +++ 900-mjb5/include/asm-i386/early_printk.h Sun Mar 16 13:38:49 2003 @@ -0,0 +1,8 @@ +#ifndef __EARLY_PRINTK_H_I386_ +#define __EARLY_PRINTK_H_i386_ + +#define VGABASE 0xB8000 +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/ioctls.h 900-mjb5/include/asm-i386/ioctls.h --- 001-bk10/include/asm-i386/ioctls.h Sun Nov 17 20:29:22 2002 +++ 900-mjb5/include/asm-i386/ioctls.h Sun Mar 16 13:38:57 2003 @@ -68,6 +68,7 @@ #define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ #define TIOCSHAYESESP 0x545F /* Set Hayes 
ESP configuration */ #define FIOQSIZE 0x5460 +#define TIOCGDB 0x547F /* enable GDB stub mode on this tty */ /* Used for packet mode */ #define TIOCPKT_DATA 0 diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/kdebug.h 900-mjb5/include/asm-i386/kdebug.h --- 001-bk10/include/asm-i386/kdebug.h Wed Dec 31 16:00:00 1969 +++ 900-mjb5/include/asm-i386/kdebug.h Sun Mar 16 13:39:02 2003 @@ -0,0 +1,53 @@ +#ifndef _I386_KDEBUG_H +#define _I386_KDEBUG_H 1 + +#include + +struct pt_regs; + +struct die_args { + struct pt_regs *regs; + const char *str; + long err; +}; + + +/* Grossly misnamed. */ +enum die_val { + DIE_OOPS = 1, + DIE_INT3, + DIE_DEBUG, + DIE_PANIC, + DIE_NMI, + DIE_DIE, + DIE_CPUINIT, /* not really a die, but .. */ + DIE_TRAPINIT, + DIE_STOP, + DIE_PROTFAULT, + DIE_PAGEFAULT, + DIE_FPUTRAP, + DIE_WATCHDOG, +}; + +extern struct notifier_block *die_chain; + +static inline int register_die_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&die_chain, nb); + +} + +static inline int unregister_die_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&die_chain, nb); +} + +static inline int notify_die(enum die_val val, const char *str, + struct pt_regs *regs,long err) +{ + struct die_args args = { regs: regs, str: str, err: err }; + + return notifier_call_chain(&die_chain, val, &args); +} + +#endif diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/kmap_types.h 900-mjb5/include/asm-i386/kmap_types.h --- 001-bk10/include/asm-i386/kmap_types.h Thu Feb 13 11:08:13 2003 +++ 900-mjb5/include/asm-i386/kmap_types.h Sun Mar 16 13:39:02 2003 @@ -24,7 +24,8 @@ D(10) KM_IRQ0, D(11) KM_IRQ1, D(12) KM_SOFTIRQ0, D(13) KM_SOFTIRQ1, -D(14) KM_TYPE_NR +D(14) KM_TYPE_NR, +D(15) KM_DUMP }; #undef D diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/kprobes.h 900-mjb5/include/asm-i386/kprobes.h --- 001-bk10/include/asm-i386/kprobes.h Wed Dec 31 16:00:00 1969 +++ 900-mjb5/include/asm-i386/kprobes.h Sun Mar 16 13:38:57 2003 @@ -0,0 +1,34 @@ +#ifndef _ASM_KPROBES_H +#define _ASM_KPROBES_H +/* + * Dynamic Probes (kprobes) support + * Vamsi Krishna S , July, 2002 + * Mailing list: dprobes@www-124.ibm.com + */ +#include +#include + +struct pt_regs; + +typedef u8 kprobe_opcode_t; +#define BREAKPOINT_INSTRUCTION 0xcc + +/* trap3/1 are intr gates for kprobes. So, restore the status of IF, + * if necessary, before executing the original int3/1 (trap) handler. 
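The die_chain added in kdebug.h above is an ordinary notifier chain, so a debugger or crash-dump module consumes it in the usual way. The sketch below is hypothetical consumer code (example_die_event and example_die_nb are made-up names), not part of the patch.

    #include <linux/kernel.h>
    #include <linux/notifier.h>
    #include <asm/kdebug.h>

    static int example_die_event(struct notifier_block *self,
                                 unsigned long val, void *data)
    {
            struct die_args *args = data;

            if (val == DIE_OOPS || val == DIE_PANIC)
                    printk(KERN_ERR "die event %lu: %s (err %ld)\n",
                           val, args->str, args->err);
            return NOTIFY_DONE;
    }

    static struct notifier_block example_die_nb = {
            .notifier_call = example_die_event,
    };

    /* module init:  register_die_notifier(&example_die_nb);
     * module exit:  unregister_die_notifier(&example_die_nb);
     */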
+ */ +static inline void restore_interrupts(struct pt_regs *regs) +{ + if (regs->eflags & IF_MASK) + __asm__ __volatile__ ("sti"); +} + +#ifdef CONFIG_KPROBES +extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); +extern int post_kprobe_handler(struct pt_regs *regs); +extern int kprobe_handler(struct pt_regs *regs); +#else /* !CONFIG_KPROBES */ +static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { return 0; } +static inline int post_kprobe_handler(struct pt_regs *regs) { return 0; } +static inline int kprobe_handler(struct pt_regs *regs) { return 0; } +#endif +#endif /* _ASM_KPROBES_H */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/linkage.h 900-mjb5/include/asm-i386/linkage.h --- 001-bk10/include/asm-i386/linkage.h Sun Nov 17 20:29:46 2002 +++ 900-mjb5/include/asm-i386/linkage.h Sun Mar 16 13:38:58 2003 @@ -3,6 +3,7 @@ #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #define FASTCALL(x) x __attribute__((regparm(3))) +#define IRQHANDLER(x) x __attribute__((regparm(1))) #ifdef CONFIG_X86_ALIGNMENT_16 #define __ALIGN .align 16,0x90 diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/lockmeter.h 900-mjb5/include/asm-i386/lockmeter.h --- 001-bk10/include/asm-i386/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 900-mjb5/include/asm-i386/lockmeter.h Sun Mar 16 13:39:06 2003 @@ -0,0 +1,127 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks. + * Added support for hold time statistics for read and write + * locks. + * Moved machine dependent code here from include/lockmeter.h. + * + */ + +#ifndef _I386_LOCKMETER_H +#define _I386_LOCKMETER_H + +#include +#include + +#include + +#ifdef __KERNEL__ +extern unsigned long cpu_khz; +#define CPU_CYCLE_FREQUENCY (cpu_khz * 1000) +#else +#define CPU_CYCLE_FREQUENCY 450000000 +#endif + +#define THIS_CPU_NUMBER smp_processor_id() + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0) +#define local_irq_save(x) \ + __asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory") + +#define local_irq_restore(x) \ + __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory") +#endif /* Linux version 2.2.x */ + +/* + * macros to cache and retrieve an index value inside of a spin lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. Not normally a problem!! + * we also assume that the hash table has less than 65535 entries. 
+ */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Intel is little endian */ + unsigned short lock; + unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + /* read and write lock attempts may cause the lock value to temporarily */ + /* be negative. Until it is >= 0 we know nothing (i. e. can't tell if */ + /* is -1 because it was write locked and somebody tried to read lock it */ + /* or if it is -1 because it was read locked and somebody tried to write*/ + /* lock it. ........................................................... */ + do { + tmp = (int) rwlock_ptr->lock; + } while (tmp < 0); + if (tmp == 0) return(0); + else return(RW_LOCK_BIAS-tmp); +} + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock <= 0) +#define IABS(x) ((x) > 0 ? 
(x) : -(x)) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((IABS((rwlock_ptr)->lock) % RW_LOCK_BIAS) != 0) + +/* this is a lot of typing just to get gcc to emit "rdtsc" */ +static inline long long get_cycles64 (void) +{ +#ifndef CONFIG_X86_TSC + #error this code requires CONFIG_X86_TSC +#else + union longlong_u { + long long intlong; + struct intint_s { + uint32_t eax; + uint32_t edx; + } intint; + } longlong; + + rdtsc(longlong.intint.eax,longlong.intint.edx); + return longlong.intlong; +#endif +} + +#endif /* _I386_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/mach-default/irq_vectors.h 900-mjb5/include/asm-i386/mach-default/irq_vectors.h --- 001-bk10/include/asm-i386/mach-default/irq_vectors.h Mon Dec 23 23:01:56 2002 +++ 900-mjb5/include/asm-i386/mach-default/irq_vectors.h Sun Mar 16 13:39:02 2003 @@ -48,6 +48,7 @@ #define INVALIDATE_TLB_VECTOR 0xfd #define RESCHEDULE_VECTOR 0xfc #define CALL_FUNCTION_VECTOR 0xfb +#define DUMP_VECTOR 0xfa #define THERMAL_APIC_VECTOR 0xf0 /* diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/mach-summit/mach_mpparse.h 900-mjb5/include/asm-i386/mach-summit/mach_mpparse.h --- 001-bk10/include/asm-i386/mach-summit/mach_mpparse.h Sat Feb 15 16:11:45 2003 +++ 900-mjb5/include/asm-i386/mach-summit/mach_mpparse.h Sun Mar 16 13:39:05 2003 @@ -1,6 +1,8 @@ #ifndef __ASM_MACH_MPPARSE_H #define __ASM_MACH_MPPARSE_H +#include + extern int use_cyclone; static inline void mpc_oem_bus_info(struct mpc_config_bus *m, char *name, @@ -33,4 +35,71 @@ static inline void acpi_madt_oem_check(c use_cyclone = 1; /*enable cyclone-timer*/ } } + +struct rio_table_hdr { + unsigned char version; /* Version number of this data structure */ + /* Version 3 adds chassis_num & WP_index */ + unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */ + unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */ +} __attribute__((packed)); + +struct scal_detail { + unsigned char node_id; /* Scalability Node ID */ + unsigned long CBAR; /* Address of 1MB register space */ + unsigned char port0node; /* Node ID port connected to: 0xFF=None */ + unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char port1node; /* Node ID port connected to: 0xFF = None */ + unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char port2node; /* Node ID port connected to: 0xFF = None */ + unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */ +} __attribute__((packed)); + +struct rio_detail { + unsigned char node_id; /* RIO Node ID */ + unsigned long BBAR; /* Address of 1MB register space */ + unsigned char type; /* Type of device */ + unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/ + /* For CYC: Node ID of Twister that owns this CYC */ + unsigned char port0node; /* Node ID port connected to: 0xFF=None */ + unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char port1node; /* Node ID port connected to: 0xFF=None */ + unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */ + unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */ + /* For CYC: 0 */ + unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */ + /* = 0 : the XAPIC is not used, ie:*/ + /* ints fwded to another XAPIC */ + /* Bits1:7 Reserved */ + /* For CYC: Bits0:7 Reserved 
*/ + unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */ + /* lower slot numbers/PCI bus numbers */ + /* For CYC: No meaning */ + unsigned char chassis_num; /* 1 based Chassis number */ + /* For LookOut WPEGs this field indicates the */ + /* Expansion Chassis #, enumerated from Boot */ + /* Node WPEG external port, then Boot Node CYC */ + /* external port, then Next Vigil chassis WPEG */ + /* external port, etc. */ + /* Shared Lookouts have only 1 chassis number (the */ + /* first one assigned) */ +} __attribute__((packed)); + + +typedef enum { + CompatTwister = 0, /* Compatibility Twister */ + AltTwister = 1, /* Alternate Twister of internal 8-way */ + CompatCyclone = 2, /* Compatibility Cyclone */ + AltCyclone = 3, /* Alternate Cyclone of internal 8-way */ + CompatWPEG = 4, /* Compatibility WPEG */ + AltWPEG = 5, /* Second Planar WPEG */ + LookOutAWPEG = 6, /* LookOut WPEG */ + LookOutBWPEG = 7, /* LookOut WPEG */ +} node_type; + +static inline int is_WPEG(node_type type){ + return (type == CompatWPEG || type == AltWPEG || + type == LookOutAWPEG || type == LookOutBWPEG); +} + #endif /* __ASM_MACH_MPPARSE_H */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/mpspec.h 900-mjb5/include/asm-i386/mpspec.h --- 001-bk10/include/asm-i386/mpspec.h Fri Jan 17 09:18:31 2003 +++ 900-mjb5/include/asm-i386/mpspec.h Sun Mar 16 13:39:05 2003 @@ -222,6 +222,10 @@ extern unsigned long mp_lapic_addr; extern int pic_mode; extern int using_apic_timer; +#ifdef CONFIG_X86_SUMMIT +extern void setup_summit (void); +#endif + #ifdef CONFIG_ACPI_BOOT extern void mp_register_lapic (u8 id, u8 enabled); extern void mp_register_lapic_address (u64 address); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/page.h 900-mjb5/include/asm-i386/page.h --- 001-bk10/include/asm-i386/page.h Wed Mar 5 07:37:06 2003 +++ 900-mjb5/include/asm-i386/page.h Sun Mar 16 13:38:58 2003 @@ -3,7 +3,11 @@ /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 -#define PAGE_SIZE (1UL << PAGE_SHIFT) +#ifndef __ASSEMBLY__ +#define PAGE_SIZE (1UL << PAGE_SHIFT) +#else +#define PAGE_SIZE (1 << PAGE_SHIFT) +#endif #define PAGE_MASK (~(PAGE_SIZE-1)) #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) @@ -90,7 +94,16 @@ typedef struct { unsigned long pgprot; } * and CONFIG_HIGHMEM64G options in the kernel configuration. */ -#define __PAGE_OFFSET (0xC0000000) +#include +#ifdef CONFIG_05GB +#define __PAGE_OFFSET (0xE0000000) +#elif defined(CONFIG_1GB) +#define __PAGE_OFFSET (0xC0000000) +#elif defined(CONFIG_2GB) +#define __PAGE_OFFSET (0x80000000) +#elif defined(CONFIG_3GB) +#define __PAGE_OFFSET (0x40000000) +#endif /* * This much address space is reserved for vmalloc() and iomap() diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/param.h 900-mjb5/include/asm-i386/param.h --- 001-bk10/include/asm-i386/param.h Sun Nov 17 20:29:26 2002 +++ 900-mjb5/include/asm-i386/param.h Sun Mar 16 13:38:50 2003 @@ -2,10 +2,18 @@ #define _ASMi386_PARAM_H #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ -# define USER_HZ 100 /* .. some user interfaces are in "ticks" */ -# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ +#include + +#ifdef CONFIG_1000HZ +# define HZ 1000 /* Internal kernel timer frequency */ +#else +# define HZ 100 #endif + +#define USER_HZ 100 /* .. 
some user interfaces are in "ticks" */ +#define CLOCKS_PER_SEC (USER_HZ) /* like times() */ + +#endif /* __KERNEL__ */ #ifndef HZ #define HZ 100 diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/pgalloc.h 900-mjb5/include/asm-i386/pgalloc.h --- 001-bk10/include/asm-i386/pgalloc.h Thu Feb 13 11:08:13 2003 +++ 900-mjb5/include/asm-i386/pgalloc.h Sun Mar 16 13:39:06 2003 @@ -10,10 +10,10 @@ #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct ptpage *pte) { set_pmd(pmd, __pmd(_PAGE_TABLE + - ((unsigned long long)page_to_pfn(pte) << + ((unsigned long long)page_to_pfn((struct page *)pte) << (unsigned long long) PAGE_SHIFT))); } /* @@ -24,20 +24,20 @@ pgd_t *pgd_alloc(struct mm_struct *); void pgd_free(pgd_t *pgd); pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); -struct page *pte_alloc_one(struct mm_struct *, unsigned long); +struct ptpage *pte_alloc_one(struct mm_struct *, unsigned long); static inline void pte_free_kernel(pte_t *pte) { free_page((unsigned long)pte); } -static inline void pte_free(struct page *pte) +static inline void pte_free(struct ptpage *pte) { - __free_page(pte); + __free_page((struct page *)pte); } -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),((struct page *)pte)) /* * allocating and freeing a pmd is trivial: the 1-entry pmd is diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/pgtable.h 900-mjb5/include/asm-i386/pgtable.h --- 001-bk10/include/asm-i386/pgtable.h Sun Mar 16 13:38:21 2003 +++ 900-mjb5/include/asm-i386/pgtable.h Sun Mar 16 13:39:06 2003 @@ -113,6 +113,7 @@ void pgtable_cache_init(void); #define _PAGE_PROTNONE 0x080 /* If not present */ #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _PAGE_TABLE_RDONLY (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) #define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) @@ -121,6 +122,10 @@ void pgtable_cache_init(void); #define PAGE_COPY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) #define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) +#define PMD_NONE __pgprot(_PAGE_PRESENT | _PAGE_ACCESSED) +#define PMD_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) +#define PMD_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) + #define _PAGE_KERNEL \ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) @@ -157,6 +162,15 @@ extern unsigned long __PAGE_KERNEL; #define __S110 PAGE_SHARED #define __S111 PAGE_SHARED +#define __PMD000 PMD_NONE +#define __PMD001 PMD_READONLY +#define __PMD010 PMD_SHARED +#define __PMD011 PMD_SHARED +#define __PMD100 PMD_READONLY +#define __PMD101 PMD_READONLY +#define __PMD110 PMD_SHARED +#define __PMD111 PMD_SHARED + /* * Define this if things work differently on an i386 and an i486: * it will (on an i486) warn about kernel memory accesses that are @@ -173,8 +187,8 @@ extern unsigned long pg0[1024]; #define pmd_none(x) (!pmd_val(x)) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) -#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) - +#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & 
~_PAGE_USER & ~_PAGE_RW)) != \ + (_KERNPG_TABLE & ~_PAGE_RW)) #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) @@ -198,6 +212,9 @@ static inline pte_t pte_mkexec(pte_t pte static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } +static inline int pmd_write(pmd_t pmd) { return (pmd).pmd & _PAGE_RW; } +static inline pmd_t pmd_wrprotect(pmd_t pmd) { (pmd).pmd &= ~_PAGE_RW; return pmd; } +static inline pmd_t pmd_mkwrite(pmd_t pmd) { (pmd).pmd |= _PAGE_RW; return pmd; } static inline int ptep_test_and_clear_dirty(pte_t *ptep) { return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); } static inline int ptep_test_and_clear_young(pte_t *ptep) { return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low); } @@ -219,6 +236,13 @@ static inline pte_t pte_modify(pte_t pte return pte; } +static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + pmd.pmd &= ~(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER); + pmd.pmd |= pgprot_val(newprot); + return pmd; +} + #define page_pte(page) page_pte_prot(page, __pgprot(0)) #define pmd_page_kernel(pmd) \ @@ -228,6 +252,8 @@ static inline pte_t pte_modify(pte_t pte #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) #endif /* !CONFIG_DISCONTIGMEM */ +#define pmd_ptpage(pmd) ((struct ptpage *)pmd_page(pmd)) + #define pmd_large(pmd) \ ((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT)) @@ -253,12 +279,20 @@ static inline pte_t pte_modify(pte_t pte ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) #define pte_offset_map_nested(dir, address) \ ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) +#define pte_page_map(__page, address) \ + ((pte_t *)kmap_atomic(__page,KM_PTE0) + pte_index(address)) +#define pte_page_map_nested(__page, address) \ + ((pte_t *)kmap_atomic(__page,KM_PTE1) + pte_index(address)) #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) #define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) #else #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) #define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) +#define pte_page_map(__page, address) \ + ((pte_t *)page_address(__page) + pte_index(address)) +#define pte_page_map_nested(__page, address) \ + ((pte_t *)page_address(__page) + pte_index(address)) #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) #endif diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/processor.h 900-mjb5/include/asm-i386/processor.h --- 001-bk10/include/asm-i386/processor.h Sun Mar 16 13:38:21 2003 +++ 900-mjb5/include/asm-i386/processor.h Sun Mar 16 13:38:57 2003 @@ -280,7 +280,11 @@ extern unsigned int mca_pentium_flag; /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ +#ifdef CONFIG_05GB +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 16)) +#else #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) +#endif /* * Size of io_bitmap in longwords: 32 is ports 0-0x3ff. 
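The numbers behind the CONFIG_05GB/1GB/2GB/3GB options above, assuming TASK_SIZE == PAGE_OFFSET as on stock i386 (user space is everything below __PAGE_OFFSET, and TASK_UNMAPPED_BASE is the mmap search base shown in processor.h):

    /*
     *   option        __PAGE_OFFSET   user/kernel split   TASK_UNMAPPED_BASE
     *   CONFIG_05GB   0xE0000000      3.5GB / 0.5GB       TASK_SIZE/16 = 0x0E000000
     *   CONFIG_1GB    0xC0000000      3GB   / 1GB         TASK_SIZE/3  = 0x40000000
     *   CONFIG_2GB    0x80000000      2GB   / 2GB         TASK_SIZE/3  = 0x2AAAB000 (page aligned up)
     *   CONFIG_3GB    0x40000000      1GB   / 3GB         TASK_SIZE/3  = 0x15556000 (page aligned up)
     *
     * Only CONFIG_05GB switches TASK_UNMAPPED_BASE to TASK_SIZE/16; the
     * other options keep the usual TASK_SIZE/3.
     */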
@@ -394,6 +398,9 @@ struct thread_struct { unsigned int saved_fs, saved_gs; /* IO permissions */ unsigned long *ts_io_bitmap; +#ifdef CONFIG_X86_REMOTE_DEBUG + struct pt_regs *kgdbregs; +#endif }; #define INIT_THREAD { \ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/rwlock.h 900-mjb5/include/asm-i386/rwlock.h --- 001-bk10/include/asm-i386/rwlock.h Sun Nov 17 20:29:57 2002 +++ 900-mjb5/include/asm-i386/rwlock.h Sun Mar 16 13:39:05 2003 @@ -20,28 +20,52 @@ #define RW_LOCK_BIAS 0x01000000 #define RW_LOCK_BIAS_STR "0x01000000" -#define __build_read_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $1,(%0)\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_read_lock_const(rw, helper) \ - asm volatile(LOCK "subl $1,%0\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "jns 1f\n\t" \ + "call " helper "\n\t" \ + "1:\t" \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "jns 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\t" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ + #define __build_read_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ @@ -50,28 +74,51 @@ __build_read_lock_ptr(rw, helper); \ } while (0) -#define __build_write_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_write_lock_const(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jz 1f\n\t" \ + "call " helper "\n\t" \ + "1:\n" \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jz 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\n" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + 
LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ #define __build_write_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/smp.h 900-mjb5/include/asm-i386/smp.h --- 001-bk10/include/asm-i386/smp.h Fri Jan 17 09:18:31 2003 +++ 900-mjb5/include/asm-i386/smp.h Sun Mar 16 13:39:02 2003 @@ -39,6 +39,7 @@ extern int smp_num_siblings; extern int cpu_sibling_map[]; extern void smp_flush_tlb(void); +extern void dump_send_ipi(void); extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); extern void smp_send_reschedule(int cpu); extern void smp_send_reschedule_all(void); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/spinlock.h 900-mjb5/include/asm-i386/spinlock.h --- 001-bk10/include/asm-i386/spinlock.h Mon Dec 23 23:01:56 2002 +++ 900-mjb5/include/asm-i386/spinlock.h Sun Mar 16 13:39:06 2003 @@ -42,18 +42,35 @@ typedef struct { #define spin_is_locked(x) (*(volatile signed char *)(&(x)->lock) <= 0) #define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x)) -#define spin_lock_string \ - "\n1:\t" \ - "lock ; decb %0\n\t" \ - "js 2f\n" \ - LOCK_SECTION_START("") \ - "2:\t" \ - "rep;nop\n\t" \ - "cmpb $0,%0\n\t" \ - "jle 2b\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END +#ifdef CONFIG_SPINLINE + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + "jmp 3f\n" \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + "3:\t" + +#else /* !CONFIG_SPINLINE */ + + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + LOCK_SECTION_START("") \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END + +#endif /* CONFIG_SPINLINE */ /* * This works. Despite all the confusion. 
* (except on PPro SMP or if we are using OOSTORE) @@ -141,6 +158,11 @@ printk("eip: %p\n", &&here); */ typedef struct { volatile unsigned int lock; +#if CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* and we need this storage for CPU and lock INDEX */ + unsigned lockmeter_magic; +#endif #ifdef CONFIG_DEBUG_SPINLOCK unsigned magic; #endif @@ -148,11 +170,19 @@ typedef struct { #define RWLOCK_MAGIC 0xdeaf1eed +#ifdef CONFIG_LOCKMETER +#if CONFIG_DEBUG_SPINLOCK +#define RWLOCK_MAGIC_INIT , 0, RWLOCK_MAGIC +#else +#define RWLOCK_MAGIC_INIT , 0 +#endif +#else /* !CONFIG_LOCKMETER */ #ifdef CONFIG_DEBUG_SPINLOCK #define RWLOCK_MAGIC_INIT , RWLOCK_MAGIC #else #define RWLOCK_MAGIC_INIT /* */ #endif +#endif /* !CONFIG_LOCKMETER */ #define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT } @@ -200,5 +230,59 @@ static inline int _raw_write_trylock(rwl atomic_add(RW_LOCK_BIAS, count); return 0; } + +#ifdef CONFIG_LOCKMETER +static inline int _raw_read_trylock(rwlock_t *lock) +{ +/* FIXME -- replace with assembler */ + atomic_t *count = (atomic_t *)lock; + atomic_dec(count); + if (count->counter > 0) + return 1; + atomic_inc(count); + return 0; +} +#endif + +#if defined(CONFIG_LOCKMETER) && defined(CONFIG_HAVE_DEC_LOCK) +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock(spinlock_t *lock); + +/* + * Matches what is in arch/i386/lib/dec_and_lock.c, except this one is + * "static inline" so that the spin_lock(), if actually invoked, is charged + * against the real caller, not against the catch-all atomic_dec_and_lock + */ +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +{ + int counter; + int newcount; + +repeat: + counter = atomic_read(atomic); + newcount = counter-1; + + if (!newcount) + goto slow_path; + + asm volatile("lock; cmpxchgl %1,%2" + :"=a" (newcount) + :"r" (newcount), "m" (atomic->counter), "0" (counter)); + + /* If the above failed, "eax" will have changed */ + if (newcount != counter) + goto repeat; + return 0; + +slow_path: + _metered_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + _metered_spin_unlock(lock); + return 0; +} + +#define ATOMIC_DEC_AND_LOCK +#endif #endif /* __ASM_SPINLOCK_H */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-i386/thread_info.h 900-mjb5/include/asm-i386/thread_info.h --- 001-bk10/include/asm-i386/thread_info.h Sun Mar 16 13:38:21 2003 +++ 900-mjb5/include/asm-i386/thread_info.h Sun Mar 16 13:39:01 2003 @@ -9,6 +9,7 @@ #ifdef __KERNEL__ +#include #ifndef __ASSEMBLY__ #include #endif @@ -30,9 +31,11 @@ struct thread_info { __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: + 0 for interrupts: illegal 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ + struct thread_info *irq_stack; /* pointer to cpu irq stack */ struct restart_block restart_block; __u8 supervisor_stack[0]; @@ -48,7 +51,8 @@ struct thread_info { #define TI_CPU 0x00000010 #define TI_PRE_COUNT 0x00000014 #define TI_ADDR_LIMIT 0x00000018 -#define TI_RESTART_BLOCK 0x000001C +#define TI_IRQ_STACK 0x0000001C +#define TI_RESTART_BLOCK 0x0000026 #endif @@ -59,48 +63,66 @@ struct thread_info { * * preempt_count needs to be 1 initially, until the scheduler is functional. 
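The cmpxchg loop in the CONFIG_LOCKMETER atomic_dec_and_lock() above only exists to keep the metered spinlock out of the common case; the contract it preserves is the ordinary one, roughly the shape of the generic fallback in lib/dec_and_lock.c (sketched from memory below, so treat it as illustrative):

    int example_atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
    {
            spin_lock(lock);
            if (atomic_dec_and_test(atomic))
                    return 1;       /* count hit zero: caller still holds the lock and frees */
            spin_unlock(lock);
            return 0;               /* still referenced: lock dropped, nothing to do */
    }

The inline version above additionally avoids taking (and metering) the lock at all unless the decrement is actually about to reach zero.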
*/ +#ifdef CONFIG_4K_STACK +#define THREAD_ORDER 0 +#define STACK_WARN 0x200 +#define STACK_PANIC 0x100 +#else +#define THREAD_ORDER 1 +#define STACK_WARN ((THREAD_SIZE)>>1) +#define STACK_PANIC 0x100 +#endif +#define INIT_THREAD_SIZE THREAD_SIZE + #ifndef __ASSEMBLY__ -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .irq_stack = &init_irq_union.thread_info, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + } \ } #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) +/* thread information allocation */ +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define alloc_thread_info() kmalloc(THREAD_SIZE, GFP_KERNEL) +#define free_thread_info(ti) kfree(ti) +#define get_thread_info(ti) get_task_struct((ti)->task) +#define put_thread_info(ti) put_task_struct((ti)->task) + /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~8191UL)); + __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1))); return ti; } -/* thread information allocation */ -#define THREAD_SIZE (2*PAGE_SIZE) -#define alloc_thread_info() ((struct thread_info *) __get_free_pages(GFP_KERNEL,1)) -#define free_thread_info(ti) free_pages((unsigned long) (ti), 1) -#define get_thread_info(ti) get_task_struct((ti)->task) -#define put_thread_info(ti) put_task_struct((ti)->task) - #else /* !__ASSEMBLY__ */ +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) + /* how to get the thread information struct from ASM */ #define GET_THREAD_INFO(reg) \ - movl $-8192, reg; \ + movl $-THREAD_SIZE, reg; \ andl %esp, reg -#endif +/* use this one if reg already contains %esp */ +#define GET_THREAD_INFO_WITH_ESP(reg) \ + andl $-THREAD_SIZE, reg +#endif + /* * thread information flags * - these are process state flags that various assembly files may need to access diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-ia64/bitops.h 900-mjb5/include/asm-ia64/bitops.h --- 001-bk10/include/asm-ia64/bitops.h Thu Feb 13 11:08:13 2003 +++ 900-mjb5/include/asm-ia64/bitops.h Sun Mar 16 13:50:09 2003 @@ -453,7 +453,9 @@ find_next_bit (void *addr, unsigned long #define __clear_bit(nr, addr) clear_bit(nr, addr) #define ext2_set_bit test_and_set_bit +#define ext2_set_atomic(l,n,a) test_and_set_bit(n,a) #define ext2_clear_bit test_and_clear_bit +#define ext2_clear_atomic(l,n,a) test_and_clear_bit(n,a) #define ext2_test_bit test_bit #define ext2_find_first_zero_bit find_first_zero_bit #define ext2_find_next_zero_bit find_next_zero_bit diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-m68k/bitops.h 900-mjb5/include/asm-m68k/bitops.h --- 001-bk10/include/asm-m68k/bitops.h Sun Nov 17 20:29:59 2002 +++ 900-mjb5/include/asm-m68k/bitops.h Sun Mar 16 13:50:09 2003 @@ -354,6 +354,16 @@ ext2_set_bit (int nr, volatile void *vad return retval; } +static inline int +ext2_set_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr) +{ + int ret; + spin_lock(lock); + ret = ext2_set_bit(nr, vaddr); + spin_unlock(lock); + return ret; +} + extern __inline__ int ext2_clear_bit (int nr, volatile void 
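Worked numbers for the THREAD_SIZE-based masking introduced in thread_info.h above (the thread_info sits at the bottom of a THREAD_SIZE-aligned block that also holds the kernel stack):

    /*
     *   8K stacks (THREAD_ORDER = 1, THREAD_SIZE = 8192):
     *        esp = 0xc12f5e40   ->   esp & ~(8192 - 1) = 0xc12f4000
     *   4K stacks (CONFIG_4K_STACK, THREAD_ORDER = 0, THREAD_SIZE = 4096):
     *        esp = 0xc12f5e40   ->   esp & ~(4096 - 1) = 0xc12f5000
     *
     * The old current_thread_info() hard-wired the 8K case (~8191UL);
     * masking with ~(THREAD_SIZE - 1) is what lets THREAD_ORDER, and so
     * the stack size, vary per configuration, with STACK_WARN and
     * STACK_PANIC presumably acting as low-water marks for overflow
     * checks elsewhere in the patch.
     */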
*vaddr) { @@ -364,6 +374,16 @@ ext2_clear_bit (int nr, volatile void *v return retval; } + +static inline int +ext2_clear_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr) +{ + int ret; + spin_lock(lock); + ret = ext2_clear_bit(nr, vaddr); + spin_unlock(lock); + return ret; +} extern __inline__ int ext2_test_bit (int nr, const volatile void *vaddr) diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-m68knommu/bitops.h 900-mjb5/include/asm-m68knommu/bitops.h --- 001-bk10/include/asm-m68knommu/bitops.h Sun Nov 17 20:29:23 2002 +++ 900-mjb5/include/asm-m68knommu/bitops.h Sun Mar 16 13:50:09 2003 @@ -387,6 +387,16 @@ extern __inline__ int ext2_set_bit(int n return retval; } +static inline int ext2_set_bit_atomic(spinlock_t *lock, int nr, + volatile void * addr) +{ + int ret; + spin_lock(lock); + ret = ext2_set_bit(nr, addr); + spin_unlock(lock); + return ret; +} + extern __inline__ int ext2_clear_bit(int nr, volatile void * addr) { int mask, retval; @@ -400,6 +410,16 @@ extern __inline__ int ext2_clear_bit(int *ADDR &= ~mask; local_irq_restore(flags); return retval; +} + +static inline int ext2_clear_bit_atomic(spinlock_t *lock, int nr, + volatile void * addr) +{ + int ret; + spin_lock(lock); + ret = ext2_clear_bit(nr, addr); + spin_unlock(lock); + return ret; } extern __inline__ int ext2_test_bit(int nr, const volatile void * addr) diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-mips/bitops.h 900-mjb5/include/asm-mips/bitops.h --- 001-bk10/include/asm-mips/bitops.h Sun Nov 17 20:29:22 2002 +++ 900-mjb5/include/asm-mips/bitops.h Sun Mar 16 13:50:09 2003 @@ -810,6 +810,15 @@ extern __inline__ int ext2_set_bit(int n return retval; } +static inline int ext2_set_bit_atomic(spinlock_t * lock, int nr, void * addr) +{ + int ret; + spin_lock(lock); + ret = ext2_set_bit(nr, addr); + spin_unlock(lock); + return ret; +} + extern __inline__ int ext2_clear_bit(int nr, void * addr) { int mask, retval, flags; @@ -824,6 +833,15 @@ extern __inline__ int ext2_clear_bit(int return retval; } +static inline int ext2_clear_bit_atomic(spinlock_t * lock, int nr, void * addr) +{ + int ret; + spin_lock(lock); + ret = ext2_clear_bit(nr, addr); + spin_unlock(lock); + return ret; +} + extern __inline__ int ext2_test_bit(int nr, const void * addr) { int mask; @@ -890,7 +908,9 @@ found_middle: /* Native ext2 byte ordering, just collapse using defines. 
*/ #define ext2_set_bit(nr, addr) test_and_set_bit((nr), (addr)) +#define ext2_set_bit_atomic(lock, nr, addr) test_and_set_bit((nr), (addr)) #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr), (addr)) +#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr), (addr)) #define ext2_test_bit(nr, addr) test_bit((nr), (addr)) #define ext2_find_first_zero_bit(addr, size) find_first_zero_bit((addr), (size)) #define ext2_find_next_zero_bit(addr, size, offset) \ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-mips64/bitops.h 900-mjb5/include/asm-mips64/bitops.h --- 001-bk10/include/asm-mips64/bitops.h Sun Nov 17 20:29:53 2002 +++ 900-mjb5/include/asm-mips64/bitops.h Sun Mar 16 13:50:09 2003 @@ -516,6 +516,16 @@ ext2_set_bit(int nr,void * addr) return retval; } +static inline int +ext2_set_bit_atomic(spinlock_t * lock, int nr, void * addr) +{ + int ret; + spin_lock(lock); + ret = ext2_set_bit(nr, addr); + spin_unlock(lock); + return ret; +} + extern inline int ext2_clear_bit(int nr, void * addr) { @@ -531,6 +541,16 @@ ext2_clear_bit(int nr, void * addr) return retval; } +static inline int +ext2_clear_bit_atomic(spinlock_t * lock, int nr, void * addr) +{ + int ret; + spin_lock(lock); + ret = ext2_clear_bit(nr, addr); + spin_unlock(lock); + return ret; +} + extern inline int ext2_test_bit(int nr, const void * addr) { @@ -599,7 +619,9 @@ found_middle: /* Native ext2 byte ordering, just collapse using defines. */ #define ext2_set_bit(nr, addr) test_and_set_bit((nr), (addr)) +#define ext2_set_bit_atomic(lock, nr, addr) test_and_set_bit((nr), (addr)) #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr), (addr)) +#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr), (addr)) #define ext2_test_bit(nr, addr) test_bit((nr), (addr)) #define ext2_find_first_zero_bit(addr, size) find_first_zero_bit((addr), (size)) #define ext2_find_next_zero_bit(addr, size, offset) \ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-parisc/bitops.h 900-mjb5/include/asm-parisc/bitops.h --- 001-bk10/include/asm-parisc/bitops.h Thu Feb 13 11:08:13 2003 +++ 900-mjb5/include/asm-parisc/bitops.h Sun Mar 16 13:50:09 2003 @@ -389,10 +389,14 @@ found_middle: */ #ifdef __LP64__ #define ext2_set_bit(nr, addr) test_and_set_bit((nr) ^ 0x38, addr) +#define ext2_set_bit_atomic(l,nr,addr) test_and_set_bit((nr) ^ 0x38, addr) #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr) ^ 0x38, addr) +#define ext2_clear_bit_atomic(l,nr,addr) test_and_clear_bit((nr) ^ 0x38, addr) #else #define ext2_set_bit(nr, addr) test_and_set_bit((nr) ^ 0x18, addr) +#define ext2_set_bit_atomic(l,nr,addr) test_and_set_bit((nr) ^ 0x18, addr) #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr) ^ 0x18, addr) +#define ext2_clear_bit_atomic(l,nr,addr) test_and_clear_bit((nr) ^ 0x18, addr) #endif #endif /* __KERNEL__ */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-ppc/bitops.h 900-mjb5/include/asm-ppc/bitops.h --- 001-bk10/include/asm-ppc/bitops.h Thu Jan 9 19:16:11 2003 +++ 900-mjb5/include/asm-ppc/bitops.h Sun Mar 16 13:50:09 2003 @@ -392,7 +392,9 @@ found_middle: #define ext2_set_bit(nr, addr) __test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr)) +#define ext2_set_bit_atomic(lock, nr, addr) test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr)) #define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr)) +#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr)) static __inline__ int ext2_test_bit(int nr, 
__const__ void * addr) { diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-ppc64/bitops.h 900-mjb5/include/asm-ppc64/bitops.h --- 001-bk10/include/asm-ppc64/bitops.h Sun Nov 17 20:29:52 2002 +++ 900-mjb5/include/asm-ppc64/bitops.h Sun Mar 16 13:50:09 2003 @@ -336,8 +336,12 @@ static __inline__ int __test_and_clear_l #define ext2_set_bit(nr,addr) \ __test_and_set_le_bit((nr),(unsigned long*)addr) +#define ext2_set_bit_atomic(lock, nr,addr) \ + test_and_set_le_bit((nr),(unsigned long*)addr) #define ext2_clear_bit(nr, addr) \ __test_and_clear_le_bit((nr),(unsigned long*)addr) +#define ext2_clear_bit_atomic(lock, nr, addr) \ + test_and_clear_le_bit((nr),(unsigned long*)addr) #define ext2_test_bit(nr, addr) test_le_bit((nr),(unsigned long*)addr) #define ext2_find_first_zero_bit(addr, size) \ find_first_zero_le_bit((unsigned long*)addr, size) diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-s390/bitops.h 900-mjb5/include/asm-s390/bitops.h --- 001-bk10/include/asm-s390/bitops.h Tue Feb 25 23:03:50 2003 +++ 900-mjb5/include/asm-s390/bitops.h Sun Mar 16 13:50:09 2003 @@ -805,8 +805,12 @@ extern __inline__ int fls(int x) #define ext2_set_bit(nr, addr) \ test_and_set_bit((nr)^24, (unsigned long *)addr) +#define ext2_set_bit_atomic(lock, nr, addr) \ + test_and_set_bit((nr)^24, (unsigned long *)addr) #define ext2_clear_bit(nr, addr) \ test_and_clear_bit((nr)^24, (unsigned long *)addr) +#define ext2_clear_bit_atomic(lock, nr, addr) \ + test_and_clear_bit((nr)^24, (unsigned long *)addr) #define ext2_test_bit(nr, addr) \ test_bit((nr)^24, (unsigned long *)addr) diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-s390x/bitops.h 900-mjb5/include/asm-s390x/bitops.h --- 001-bk10/include/asm-s390x/bitops.h Tue Feb 25 23:03:50 2003 +++ 900-mjb5/include/asm-s390x/bitops.h Sun Mar 16 13:50:09 2003 @@ -838,8 +838,12 @@ extern __inline__ int fls(int x) #define ext2_set_bit(nr, addr) \ test_and_set_bit((nr)^56, (unsigned long *)addr) +#define ext2_set_bit_atomic(lock, nr, addr) \ + test_and_set_bit((nr)^56, (unsigned long *)addr) #define ext2_clear_bit(nr, addr) \ test_and_clear_bit((nr)^56, (unsigned long *)addr) +#define ext2_clear_bit_atomic(lock, nr, addr) \ + test_and_clear_bit((nr)^56, (unsigned long *)addr) #define ext2_test_bit(nr, addr) \ test_bit((nr)^56, (unsigned long *)addr) diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-sh/bitops.h 900-mjb5/include/asm-sh/bitops.h --- 001-bk10/include/asm-sh/bitops.h Sun Nov 17 20:29:20 2002 +++ 900-mjb5/include/asm-sh/bitops.h Sun Mar 16 13:50:09 2003 @@ -265,6 +265,16 @@ static __inline__ int ext2_set_bit(int n return retval; } +static inline int ext2_set_bit_atomic(spinlock_t *lock, + int nr, volatile void * addr) +{ + int ret; + spin_lock(lock); + ret = ext2_set_bit(nr, addr); + spin_unlock(lock); + return ret; +} + static __inline__ int ext2_clear_bit(int nr, volatile void * addr) { int mask, retval; @@ -279,6 +289,16 @@ static __inline__ int ext2_clear_bit(int restore_flags(flags); return retval; } + +static inline int ext2_clear_bit_atomic(spinlock_t *lock, + int nr, volatile void * addr) +{ + int ret; + spin_lock(lock); + ret = ext2_clear_bit(nr, addr); + spin_unlock(lock); + return ret; +} static __inline__ int ext2_test_bit(int nr, const volatile void * addr) { diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-sparc/bitops.h 900-mjb5/include/asm-sparc/bitops.h --- 001-bk10/include/asm-sparc/bitops.h Thu Jan 9 19:16:12 2003 +++ 900-mjb5/include/asm-sparc/bitops.h Sun Mar 16 13:50:09 2003 @@ -454,7 
+454,9 @@ found_middle: find_next_zero_le_bit((addr), (size), 0) #define ext2_set_bit __test_and_set_le_bit +#define ext2_set_bit_atomic(l,n,a) test_and_set_le_bit(n,a) #define ext2_clear_bit __test_and_clear_le_bit +#define ext2_clear_bit_atomic(l,n,a) test_and_clear_le_bit(n,a) #define ext2_test_bit test_le_bit #define ext2_find_first_zero_bit find_first_zero_le_bit #define ext2_find_next_zero_bit find_next_zero_le_bit diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-sparc64/bitops.h 900-mjb5/include/asm-sparc64/bitops.h --- 001-bk10/include/asm-sparc64/bitops.h Sun Nov 17 20:29:25 2002 +++ 900-mjb5/include/asm-sparc64/bitops.h Sun Mar 16 13:50:09 2003 @@ -351,7 +351,9 @@ found_middle: #ifdef __KERNEL__ #define ext2_set_bit(nr,addr) test_and_set_le_bit((nr),(unsigned long *)(addr)) +#define ext2_set_bit_atomic(lock,nr,addr) test_and_set_le_bit((nr),(unsigned long *)(addr)) #define ext2_clear_bit(nr,addr) test_and_clear_le_bit((nr),(unsigned long *)(addr)) +#define ext2_clear_bit_atomic(lock,nr,addr) test_and_clear_le_bit((nr),(unsigned long *)(addr)) #define ext2_test_bit(nr,addr) test_le_bit((nr),(unsigned long *)(addr)) #define ext2_find_first_zero_bit(addr, size) \ find_first_zero_le_bit((unsigned long *)(addr), (size)) diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-v850/bitops.h 900-mjb5/include/asm-v850/bitops.h --- 001-bk10/include/asm-v850/bitops.h Sun Nov 17 20:29:20 2002 +++ 900-mjb5/include/asm-v850/bitops.h Sun Mar 16 13:50:09 2003 @@ -252,7 +252,9 @@ static inline int sched_find_first_bit(u #define hweight8(x) generic_hweight8 (x) #define ext2_set_bit test_and_set_bit +#define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a) #define ext2_clear_bit test_and_clear_bit +#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a) #define ext2_test_bit test_bit #define ext2_find_first_zero_bit find_first_zero_bit #define ext2_find_next_zero_bit find_next_zero_bit diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-x86_64/bitops.h 900-mjb5/include/asm-x86_64/bitops.h --- 001-bk10/include/asm-x86_64/bitops.h Tue Feb 25 23:03:51 2003 +++ 900-mjb5/include/asm-x86_64/bitops.h Sun Mar 16 13:50:09 2003 @@ -487,8 +487,12 @@ static __inline__ int ffs(int x) #define ext2_set_bit(nr,addr) \ __test_and_set_bit((nr),(unsigned long*)addr) +#define ext2_set_bit_atomic(lock,nr,addr) \ + test_and_set_bit((nr),(unsigned long*)addr) #define ext2_clear_bit(nr, addr) \ __test_and_clear_bit((nr),(unsigned long*)addr) +#define ext2_clear_bit_atomic(lock,nr,addr) \ + test_and_clear_bit((nr),(unsigned long*)addr) #define ext2_test_bit(nr, addr) test_bit((nr),(unsigned long*)addr) #define ext2_find_first_zero_bit(addr, size) \ find_first_zero_bit((unsigned long*)addr, size) diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/asm-x86_64/early_printk.h 900-mjb5/include/asm-x86_64/early_printk.h --- 001-bk10/include/asm-x86_64/early_printk.h Wed Dec 31 16:00:00 1969 +++ 900-mjb5/include/asm-x86_64/early_printk.h Sun Mar 16 13:38:49 2003 @@ -0,0 +1,8 @@ +#ifdef __EARLY_PRINTK_H_X86_64_ +#define __EARLY_PRINTK_H_X86_64_ + +#define VGABASE 0xffffffff800b8000UL +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/dump.h 900-mjb5/include/linux/dump.h --- 001-bk10/include/linux/dump.h Wed Dec 31 16:00:00 1969 +++ 900-mjb5/include/linux/dump.h Sun Mar 16 13:39:02 2003 @@ -0,0 +1,362 @@ +/* + * Kernel header file for Linux crash dumps. 
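All of the per-architecture bitops hunks above add the same pair of primitives: ext2_set_bit_atomic()/ext2_clear_bit_atomic() map straight onto the arch's atomic test_and_set/test_and_clear where those exist (the lock argument is simply ignored), and fall back to wrapping the non-atomic ext2 bit op in the caller-supplied spinlock everywhere else. The intended callers (typically ext2/ext3 allocators passing a per-block-group lock) are not part of this section, so the caller below is purely hypothetical:

    static int example_claim_block(spinlock_t *group_lock,
                                   void *bitmap, int goal)
    {
            /* test_and_set semantics: an old value of 0 means we won the block */
            if (!ext2_set_bit_atomic(group_lock, goal, bitmap))
                    return goal;
            return -1;      /* already allocated, caller picks another goal */
    }

    static void example_release_block(spinlock_t *group_lock,
                                      void *bitmap, int block)
    {
            if (!ext2_clear_bit_atomic(group_lock, block, bitmap))
                    printk(KERN_ERR "block bit %d was already clear\n", block);
    }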
+ * + * Created by: Matt Robinson (yakker@sgi.com) + * Copyright 1999 - 2002 Silicon Graphics, Inc. All rights reserved. + * + * vmdump.h to dump.h by: Matt D. Robinson (yakker@sourceforge.net) + * Copyright 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 Free Software Foundation, Inc. All rights reserved. + * + * Most of this is the same old stuff from vmdump.h, except now we're + * actually a stand-alone driver plugged into the block layer interface, + * with the exception that we now allow for compression modes externally + * loaded (e.g., someone can come up with their own). + * + * This code is released under version 2 of the GNU GPL. + */ + +/* This header file includes all structure definitions for crash dumps. */ +#ifndef _DUMP_H +#define _DUMP_H + +#if defined(CONFIG_CRASH_DUMP) || defined (CONFIG_CRASH_DUMP_MODULE) + +#include +#include +#include + +/* + * Predefine default DUMP_PAGE constants, asm header may override. + * + * On ia64 discontinuous memory systems it's possible for the memory + * banks to stop at 2**12 page alignments, the smallest possible page + * size. But the system page size, PAGE_SIZE, is in fact larger. + */ +#define DUMP_PAGE_SHIFT PAGE_SHIFT +#define DUMP_PAGE_MASK PAGE_MASK +#define DUMP_PAGE_ALIGN(addr) PAGE_ALIGN(addr) +#define DUMP_HEADER_OFFSET PAGE_SIZE + +/* keep DUMP_PAGE_SIZE constant to 4K = 1<<12 + * it may be different from PAGE_SIZE then. + */ +#define DUMP_PAGE_SIZE 4096 + +/* + * Predefined default memcpy() to use when copying memory to the dump buffer. + * + * On ia64 there is a heads up function that can be called to let the prom + * machine check monitor know that the current activity is risky and it should + * ignore the fault (nofault). In this case the ia64 header will redefine this + * macro to __dump_memcpy() and use it's arch specific version. + */ +#define DUMP_memcpy memcpy + +/* necessary header files */ +#include /* for architecture-specific header */ + +/* + * Size of the buffer that's used to hold: + * + * 1. the dump header (padded to fill the complete buffer) + * 2. 
the possibly compressed page headers and data + */ +#define DUMP_BUFFER_SIZE (64 * 1024) /* size of dump buffer */ +#define DUMP_HEADER_SIZE DUMP_BUFFER_SIZE + +/* standard header definitions */ +#define DUMP_MAGIC_NUMBER 0xa8190173618f23edULL /* dump magic number */ +#define DUMP_MAGIC_LIVE 0xa8190173618f23cdULL /* live magic number */ +#define DUMP_VERSION_NUMBER 0x8 /* dump version number */ +#define DUMP_PANIC_LEN 0x100 /* dump panic string length */ + +/* dump levels - type specific stuff added later -- add as necessary */ +#define DUMP_LEVEL_NONE 0x0 /* no dumping at all -- just bail */ +#define DUMP_LEVEL_HEADER 0x1 /* kernel dump header only */ +#define DUMP_LEVEL_KERN 0x2 /* dump header and kernel pages */ +#define DUMP_LEVEL_USED 0x4 /* dump header, kernel/user pages */ +#define DUMP_LEVEL_ALL_RAM 0x8 /* dump header, all RAM pages */ +#define DUMP_LEVEL_ALL 0x10 /* dump all memory RAM and firmware */ + + +/* dump compression options -- add as necessary */ +#define DUMP_COMPRESS_NONE 0x0 /* don't compress this dump */ +#define DUMP_COMPRESS_RLE 0x1 /* use RLE compression */ +#define DUMP_COMPRESS_GZIP 0x2 /* use GZIP compression */ + +/* dump flags - any dump-type specific flags -- add as necessary */ +#define DUMP_FLAGS_NONE 0x0 /* no flags are set for this dump */ + +#define DUMP_FLAGS_TARGETMASK 0xf0000000 /* handle special case targets */ +#define DUMP_FLAGS_DISKDUMP 0x80000000 /* dump to local disk */ +#define DUMP_FLAGS_NETDUMP 0x40000000 /* dump over the network */ + +/* dump header flags -- add as necessary */ +#define DUMP_DH_FLAGS_NONE 0x0 /* no flags set (error condition!) */ +#define DUMP_DH_RAW 0x1 /* raw page (no compression) */ +#define DUMP_DH_COMPRESSED 0x2 /* page is compressed */ +#define DUMP_DH_END 0x4 /* end marker on a full dump */ +#define DUMP_DH_TRUNCATED 0x8 /* dump is incomplete */ +#define DUMP_DH_TEST_PATTERN 0x10 /* dump page is a test pattern */ +#define DUMP_DH_NOT_USED 0x20 /* 1st bit not used in flags */ + +/* names for various dump parameters in /proc/kernel */ +#define DUMP_ROOT_NAME "sys/dump" +#define DUMP_DEVICE_NAME "device" +#define DUMP_COMPRESS_NAME "compress" +#define DUMP_LEVEL_NAME "level" +#define DUMP_FLAGS_NAME "flags" + +#define DUMP_SYSRQ_KEY 'd' /* key to use for MAGIC_SYSRQ key */ + +/* CTL_DUMP names: */ +enum +{ + CTL_DUMP_DEVICE=1, + CTL_DUMP_COMPRESS=3, + CTL_DUMP_LEVEL=3, + CTL_DUMP_FLAGS=4, + CTL_DUMP_TEST=5, +}; + + +/* page size for gzip compression -- buffered slightly beyond hardware PAGE_SIZE used by DUMP */ +#define DUMP_DPC_PAGE_SIZE (DUMP_PAGE_SIZE + 512) + +/* dump ioctl() control options */ +#define DIOSDUMPDEV 1 /* set the dump device */ +#define DIOGDUMPDEV 2 /* get the dump device */ +#define DIOSDUMPLEVEL 3 /* set the dump level */ +#define DIOGDUMPLEVEL 4 /* get the dump level */ +#define DIOSDUMPFLAGS 5 /* set the dump flag parameters */ +#define DIOGDUMPFLAGS 6 /* get the dump flag parameters */ +#define DIOSDUMPCOMPRESS 7 /* set the dump compress level */ +#define DIOGDUMPCOMPRESS 8 /* get the dump compress level */ + +/* these ioctls are used only by netdump module */ +#define DIOSTARGETIP 9 /* set the target m/c's ip */ +#define DIOGTARGETIP 10 /* get the target m/c's ip */ +#define DIOSTARGETPORT 11 /* set the target m/c's port */ +#define DIOGTARGETPORT 12 /* get the target m/c's port */ +#define DIOSSOURCEPORT 13 /* set the source m/c's port */ +#define DIOGSOURCEPORT 14 /* get the source m/c's port */ +#define DIOSETHADDR 15 /* set ethernet address */ +#define DIOGETHADDR 16 /* get ethernet address 
*/ + +/* + * Structure: __dump_header + * Function: This is the header dumped at the top of every valid crash + * dump. + */ +struct __dump_header { + /* the dump magic number -- unique to verify dump is valid */ + u64 dh_magic_number; + + /* the version number of this dump */ + u32 dh_version; + + /* the size of this header (in case we can't read it) */ + u32 dh_header_size; + + /* the level of this dump (just a header?) */ + u32 dh_dump_level; + + /* + * We assume dump_page_size to be 4K in every case. + * Store here the configurable system page size (4K, 8K, 16K, etc.) + */ + u32 dh_page_size; + + /* the size of all physical memory */ + u64 dh_memory_size; + + /* the start of physical memory */ + u64 dh_memory_start; + + /* the end of physical memory */ + u64 dh_memory_end; + + /* the number of hardware/physical pages in this dump specifically */ + u32 dh_num_dump_pages; + + /* the panic string, if available */ + char dh_panic_string[DUMP_PANIC_LEN]; + + /* timeval depends on architecture, two long values */ + struct { + u64 tv_sec; + u64 tv_usec; + } dh_time; /* the time of the system crash */ + + /* the NEW utsname (uname) information -- in character form */ + /* we do this so we don't have to include utsname.h */ + /* plus it helps us be more architecture independent */ + /* now maybe one day soon they'll make the [65] a #define! */ + char dh_utsname_sysname[65]; + char dh_utsname_nodename[65]; + char dh_utsname_release[65]; + char dh_utsname_version[65]; + char dh_utsname_machine[65]; + char dh_utsname_domainname[65]; + + /* the address of current task (OLD = void *, NEW = u64) */ + u64 dh_current_task; + + /* what type of compression we're using in this dump (if any) */ + u32 dh_dump_compress; + + /* any additional flags */ + u32 dh_dump_flags; + + /* any additional flags */ + u32 dh_dump_device; +} __attribute__((packed)); + +/* + * Structure: __dump_page + * Function: To act as the header associated to each physical page of + * memory saved in the system crash dump. This allows for + * easy reassembly of each crash dump page. The address bits + * are split to make things easier for 64-bit/32-bit system + * conversions. + * + * dp_byte_offset and dp_page_index are landmarks that are helpful when + * looking at a hex dump of /dev/vmdump, + */ +struct __dump_page { + /* the address of this dump page */ + u64 dp_address; + + /* the size of this dump page */ + u32 dp_size; + + /* flags (currently DUMP_COMPRESSED, DUMP_RAW or DUMP_END) */ + u32 dp_flags; +} __attribute__((packed)); + +/* + * Structure: __lkcdinfo + * Function: This structure contains information needed for the lkcdutils + * package (particularly lcrash) to determine what information is + * associated to this kernel, specifically. + */ +struct __lkcdinfo { + int arch; + int ptrsz; + int byte_order; + int linux_release; + int page_shift; + int page_size; + u64 page_mask; + u64 page_offset; + int stack_offset; +}; + +#ifdef __KERNEL__ + +/* + * Structure: __dump_compress + * Function: This is what an individual compression mechanism can use + * to plug in their own compression techniques. It's always + * best to build these as individual modules so that people + * can put in whatever they want. 
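As the comment above notes, compression back-ends plug into the dump driver through
struct __dump_compress (declared just below) and dump_register_compression(). A minimal
sketch of such a module is shown here; the "pass-through" compressor, its names, and the
return-length convention are illustrative assumptions, not part of this patch:

/* Hypothetical example: a pass-through "compressor" registered with the
 * dump core.  Sketch only -- module metadata and error handling omitted. */
#include <linux/module.h>
#include <linux/string.h>
#include <linux/dump.h>

static u16 dump_none_compress(const u8 *old, u16 oldsize, u8 *new, u16 newsize)
{
        /* No real compression: copy the page through unchanged.  A real
         * back-end would return the compressed length, or the original
         * length when compression does not help (assumed convention). */
        if (newsize < oldsize)
                return oldsize;
        memcpy(new, old, oldsize);
        return oldsize;
}

static struct __dump_compress dump_none_compression = {
        .compress_type  = DUMP_COMPRESS_NONE,
        .compress_name  = "none",
        .compress_func  = dump_none_compress,
};

static int __init dump_none_init(void)
{
        dump_register_compression(&dump_none_compression);
        return 0;
}

static void __exit dump_none_exit(void)
{
        dump_unregister_compression(DUMP_COMPRESS_NONE);
}

module_init(dump_none_init);
module_exit(dump_none_exit);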
+ */ +struct __dump_compress { + /* the list_head structure for list storage */ + struct list_head list; + + /* the type of compression to use (DUMP_COMPRESS_XXX) */ + int compress_type; + const char *compress_name; + + /* the compression function to call */ + u16 (*compress_func)(const u8 *, u16, u8 *, u16); +}; + +/* functions for dump compression registration */ +extern void dump_register_compression(struct __dump_compress *); +extern void dump_unregister_compression(int); + +/* + * Structure dump_mbank[]: + * + * For CONFIG_DISCONTIGMEM systems this array specifies the + * memory banks/chunks that need to be dumped after a panic. + * + * For classic systems it specifies a single set of pages from + * 0 to max_mapnr. + */ +struct __dump_mbank { + u64 start; + u64 end; + int type; + int pad1; + long pad2; +}; + +#define DUMP_MBANK_TYPE_CONVENTIONAL_MEMORY 1 +#define DUMP_MBANK_TYPE_OTHER 2 + +#define MAXCHUNKS 256 +extern int dump_mbanks; +extern struct __dump_mbank dump_mbank[MAXCHUNKS]; + +/* notification event codes */ +#define DUMP_BEGIN 0x0001 /* dump beginning */ +#define DUMP_END 0x0002 /* dump ending */ + +/* Scheduler soft spin control. + * + * 0 - no dump in progress + * 1 - cpu0 is dumping, ... + */ +extern unsigned long dump_oncpu; +extern void dump_execute(const char *, const struct pt_regs *); + +/* + * Notifier list for kernel code which wants to be called + * at kernel dump. + */ +extern struct notifier_block *dump_notifier_list; +static inline int register_dump_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&dump_notifier_list, nb); +} +static inline int unregister_dump_notifier(struct notifier_block * nb) +{ + return notifier_chain_unregister(&dump_notifier_list, nb); +} + +/* + * Common Arch Specific Functions should be declared here. + * This allows the C compiler to detect discrepancies. + */ +extern void __dump_open(void); +extern void __dump_cleanup(void); +extern void __dump_init(u64); +extern void __dump_save_regs(struct pt_regs *, const struct pt_regs *); +extern int __dump_configure_header(const struct pt_regs *); +extern void __dump_irq_enable(void); +extern void __dump_irq_restore(void); +extern int __dump_page_valid(unsigned long index); +#ifdef CONFIG_SMP +extern void __dump_save_other_cpus(void); +#else +#define __dump_save_other_cpus(void) +#endif + +#endif /* __KERNEL__ */ + +#else /* !CONFIG_CRASH_DUMP */ + +/* If not configured then make code disappear! */ +#define register_dump_watchdog(x) do { } while(0) +#define unregister_dump_watchdog(x) do { } while(0) +#define register_dump_notifier(x) do { } while(0) +#define unregister_dump_notifier(x) do { } while(0) +#define dump_in_progress() 0 + +#endif /* !CONFIG_CRASH_DUMP */ + +#endif /* _DUMP_H */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/dump_netdev.h 900-mjb5/include/linux/dump_netdev.h --- 001-bk10/include/linux/dump_netdev.h Wed Dec 31 16:00:00 1969 +++ 900-mjb5/include/linux/dump_netdev.h Sun Mar 16 13:39:02 2003 @@ -0,0 +1,80 @@ +/* + * linux/drivers/net/netconsole.h + * + * Copyright (C) 2001 Ingo Molnar + * + * This file contains the implementation of an IRQ-safe, crash-safe + * kernel console implementation that outputs kernel messages to the + * network. + * + * Modification history: + * + * 2001-09-17 started by Ingo Molnar. 
+ */ + +/**************************************************************** + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + ****************************************************************/ + +#define NETCONSOLE_VERSION 0x03 + +enum netdump_commands { + COMM_NONE = 0, + COMM_SEND_MEM = 1, + COMM_EXIT = 2, + COMM_REBOOT = 3, + COMM_HELLO = 4, + COMM_GET_NR_PAGES = 5, + COMM_GET_PAGE_SIZE = 6, + COMM_START_NETDUMP_ACK = 7, + COMM_GET_REGS = 8, + COMM_GET_MAGIC = 9, + COMM_START_WRITE_NETDUMP_ACK = 10, +}; + +typedef struct netdump_req_s { + u64 magic; + u32 nr; + u32 command; + u32 from; + u32 to; +} req_t; + +enum netdump_replies { + REPLY_NONE = 0, + REPLY_ERROR = 1, + REPLY_LOG = 2, + REPLY_MEM = 3, + REPLY_RESERVED = 4, + REPLY_HELLO = 5, + REPLY_NR_PAGES = 6, + REPLY_PAGE_SIZE = 7, + REPLY_START_NETDUMP = 8, + REPLY_END_NETDUMP = 9, + REPLY_REGS = 10, + REPLY_MAGIC = 11, + REPLY_START_WRITE_NETDUMP = 12, +}; + +typedef struct netdump_reply_s { + u32 nr; + u32 code; + u32 info; +} reply_t; + +#define HEADER_LEN (1 + sizeof(reply_t)) + + diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/dumpdev.h 900-mjb5/include/linux/dumpdev.h --- 001-bk10/include/linux/dumpdev.h Wed Dec 31 16:00:00 1969 +++ 900-mjb5/include/linux/dumpdev.h Sun Mar 16 13:39:02 2003 @@ -0,0 +1,123 @@ +/* + * Generic dump device interfaces for flexible system dump + * (Enables variation of dump target types e.g disk, network, memory) + * + * These interfaces have evolved based on discussions on lkcd-devel. + * Eventually the intent is to support primary and secondary or + * alternate targets registered at the same time, with scope for + * situation based failover or multiple dump devices used for parallel + * dump i/o. + * + * Started: Oct 2002 - Suparna Bhattacharya (suparna@in.ibm.com) + * + * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. + * Copyright (C) 2002 International Business Machines Corp. + * + * This code is released under version 2 of the GNU GPL. 
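The netdump structures above frame a very small command protocol: the client sends a
req_t naming a command and a page range, and the dumping kernel answers with reply_t
packets. A hedged sketch of building the reply to a COMM_GET_NR_PAGES request follows;
send_netdump_packet() is an assumed transmit helper, not something this patch provides:

/* Sketch only: answer a COMM_GET_NR_PAGES request. */
static void example_reply_nr_pages(const req_t *req, unsigned long nr_pages)
{
        reply_t reply;

        reply.nr   = req->nr;           /* echo the request sequence number */
        reply.code = REPLY_NR_PAGES;    /* reply type */
        reply.info = (u32) nr_pages;    /* payload: number of dumpable pages */

        send_netdump_packet(&reply, sizeof(reply));     /* assumed helper */
}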
+ */ + +#ifndef _LINUX_DUMPDEV_H +#define _LINUX_DUMPDEV_H + +#include +#include +#include + +/* Determined by the dump target (device) type */ + +struct dump_dev; + +struct dump_dev_ops { + int (*open)(struct dump_dev *, unsigned long); /* configure */ + int (*release)(struct dump_dev *); /* unconfigure */ + int (*silence)(struct dump_dev *); /* when dump starts */ + int (*resume)(struct dump_dev *); /* when dump is over */ + int (*seek)(struct dump_dev *, loff_t); + /* trigger a write (async in nature typically) */ + int (*write)(struct dump_dev *, void *, unsigned long); + /* not usually used during dump, but option available */ + int (*read)(struct dump_dev *, void *, unsigned long); + /* use to poll for completion */ + int (*ready)(struct dump_dev *, void *); + int (*ioctl)(struct dump_dev *, unsigned int, unsigned long); +}; + +struct dump_dev { + char type_name[32]; /* block, net-poll etc */ + unsigned long device_id; /* interpreted differently for various types */ + struct dump_dev_ops *ops; + struct list_head list; + loff_t curr_offset; +}; + +/* + * dump_dev type variations: + */ + +/* block */ +struct dump_blockdev { + struct dump_dev ddev; + kdev_t kdev_id; + struct block_device *bdev; + struct bio *bio; + loff_t start_offset; + loff_t limit; + int err; +}; + +static inline struct dump_blockdev *DUMP_BDEV(struct dump_dev *dev) +{ + return container_of(dev, struct dump_blockdev, ddev); +} + +/* Dump device / target operation wrappers */ +/* These assume that dump_dev is initiatized to dump_config.dumper->dev */ + +extern struct dump_dev *dump_dev; + +static inline int dump_dev_open(unsigned long arg) +{ + return dump_dev->ops->open(dump_dev, arg); +} + +static inline int dump_dev_release(void) +{ + return dump_dev->ops->release(dump_dev); +} + +static inline int dump_dev_silence(void) +{ + return dump_dev->ops->silence(dump_dev); +} + +static inline int dump_dev_resume(void) +{ + return dump_dev->ops->resume(dump_dev); +} + +static inline int dump_dev_seek(loff_t offset) +{ + return dump_dev->ops->seek(dump_dev, offset); +} + +static inline int dump_dev_write(void *buf, unsigned long len) +{ + return dump_dev->ops->write(dump_dev, buf, len); +} + +static inline int dump_dev_ready(void *buf) +{ + return dump_dev->ops->ready(dump_dev, buf); +} + +static inline int dump_dev_ioctl(unsigned int cmd, unsigned long arg) +{ + if (!dump_dev->ops->ioctl) + return -EINVAL; + return dump_dev->ops->ioctl(dump_dev, cmd, arg); +} + +extern int dump_register_device(struct dump_dev *); +extern void dump_unregister_device(struct dump_dev *); + +#endif /* _LINUX_DUMPDEV_H */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/early_printk.h 900-mjb5/include/linux/early_printk.h --- 001-bk10/include/linux/early_printk.h Wed Dec 31 16:00:00 1969 +++ 900-mjb5/include/linux/early_printk.h Sun Mar 16 13:38:49 2003 @@ -0,0 +1,47 @@ +#ifndef __EARLY_PRINTK_H_ +#define __EARLY_PRINTK_H_ + +#ifdef CONFIG_EARLY_PRINTK +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +/* Simple serial port output */ + +#define DEFAULT_BAUD 57600 +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 
/* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + + +void early_printk(const char *fmt, ...); +int __init setup_early_printk(char *opt); + +#else + +#define early_printk(...) do {} while(0) +#define setup_early_printk(X) do {} while(0) + +#endif + +#endif diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/ext2_fs_sb.h 900-mjb5/include/linux/ext2_fs_sb.h --- 001-bk10/include/linux/ext2_fs_sb.h Sun Nov 17 20:29:54 2002 +++ 900-mjb5/include/linux/ext2_fs_sb.h Sun Mar 16 13:50:49 2003 @@ -16,6 +16,13 @@ #ifndef _LINUX_EXT2_FS_SB #define _LINUX_EXT2_FS_SB +struct ext2_bg_info { + u8 debts; + spinlock_t balloc_lock; + spinlock_t ialloc_lock; + unsigned int reserved; +} ____cacheline_aligned_in_smp; + /* * second extended-fs super-block data in memory */ @@ -44,7 +51,7 @@ struct ext2_sb_info { int s_first_ino; u32 s_next_generation; unsigned long s_dir_count; - u8 *s_debts; + struct ext2_bg_info *s_bgi; }; #endif /* _LINUX_EXT2_FS_SB */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/file.h 900-mjb5/include/linux/file.h --- 001-bk10/include/linux/file.h Sun Nov 17 20:29:20 2002 +++ 900-mjb5/include/linux/file.h Sun Mar 16 13:39:03 2003 @@ -40,6 +40,9 @@ extern void FASTCALL(set_close_on_exec(u extern void put_filp(struct file *); extern int get_unused_fd(void); extern void FASTCALL(put_unused_fd(unsigned int fd)); +struct kmem_cache_s; +extern void filp_ctor(void * objp, struct kmem_cache_s *cachep, unsigned long cflags); +extern void filp_dtor(void * objp, struct kmem_cache_s *cachep, unsigned long dflags); extern struct file ** alloc_fd_array(int); extern int expand_fd_array(struct files_struct *, int nr); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/gdb.h 900-mjb5/include/linux/gdb.h --- 001-bk10/include/linux/gdb.h Wed Dec 31 16:00:00 1969 +++ 900-mjb5/include/linux/gdb.h Sun Mar 16 13:38:57 2003 @@ -0,0 +1,67 @@ +#ifndef _GDB_H_ +#define _GDB_H_ + +/* + * Copyright (C) 2001 Amit S. 
Kale + */ + +/* gdb locks */ +#define KGDB_MAX_NO_CPUS NR_CPUS + +extern int gdb_enter; /* 1 = enter debugger on boot */ +extern int gdb_ttyS; +extern int gdb_baud; +extern int gdb_initialized; + +extern int gdb_hook(void); +extern void breakpoint(void); + +typedef int gdb_debug_hook(int trapno, + int signo, + int err_code, + struct pt_regs *regs); +extern gdb_debug_hook *linux_debug_hook; + +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +extern spinlock_t kgdb_nmispinlock; +#else +extern unsigned kgdb_spinlock; +extern unsigned kgdb_nmispinlock; +#endif + +extern volatile int kgdb_memerr_expected; + +struct console; +void gdb_console_write(struct console *co, const char *s, + unsigned count); +void gdb_console_init(void); + +extern volatile int procindebug[KGDB_MAX_NO_CPUS]; + +#define KGDB_ASSERT(message, condition) do { \ + if (!(condition)) { \ + printk("kgdb assertion failed: %s\n", message); \ + asm ("int $0x3"); \ + } \ +} while (0) + +#ifdef CONFIG_KERNEL_ASSERTS +#define KERNEL_ASSERT(message, condition) KGDB_ASSERT(message, condition) +#else +#define KERNEL_ASSERT(message, condition) +#endif + +#define KA_VALID_ERRNO(errno) ((errno) > 0 && (errno) <= EMEDIUMTYPE) + +#define KA_VALID_PTR_ERR(ptr) KA_VALID_ERRNO(-PTR_ERR(ptr)) + +#define KA_VALID_KPTR(ptr) (!(ptr) || \ + ((void *)(ptr) >= (void *)PAGE_OFFSET && \ + (void *)(ptr) < ERR_PTR(-EMEDIUMTYPE))) + +#define KA_VALID_PTRORERR(errptr) (KA_VALID_KPTR(errptr) || KA_VALID_PTR_ERR(errptr)) + +#define KA_HELD_GKL() (current->lock_depth >= 0) + +#endif /* _GDB_H_ */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/kprobes.h 900-mjb5/include/linux/kprobes.h --- 001-bk10/include/linux/kprobes.h Wed Dec 31 16:00:00 1969 +++ 900-mjb5/include/linux/kprobes.h Sun Mar 16 13:38:57 2003 @@ -0,0 +1,60 @@ +#ifndef _LINUX_KPROBES_H +#define _LINUX_KPROBES_H +#include +#include +#include +#include +#include + +struct kprobe; +struct pt_regs; + +typedef void (*kprobe_pre_handler_t)(struct kprobe *, struct pt_regs *); +typedef void (*kprobe_post_handler_t)(struct kprobe *, struct pt_regs *, + unsigned long flags); +typedef int (*kprobe_fault_handler_t)(struct kprobe *, struct pt_regs *, + int trapnr); + +struct kprobe { + struct list_head list; + + /* location of the probe point */ + kprobe_opcode_t *addr; + + /* Called before addr is executed. */ + kprobe_pre_handler_t pre_handler; + + /* Called after addr is executed, unless... */ + kprobe_post_handler_t post_handler; + + /* ... called if executing addr causes a fault (eg. page fault). + * Return 1 if it handled fault, otherwise kernel will see it. */ + kprobe_fault_handler_t fault_handler; + + /* Saved opcode (which has been replaced with breakpoint) */ + kprobe_opcode_t opcode; +}; + +#ifdef CONFIG_KPROBES +/* Locks kprobe: irq must be disabled */ +void lock_kprobes(void); +void unlock_kprobes(void); + +/* kprobe running now on this CPU? */ +static inline int kprobe_running(void) +{ + extern unsigned int kprobe_cpu; + return kprobe_cpu == smp_processor_id(); +} + +/* Get the kprobe at this addr (if any). 
Must have called lock_kprobes */ +struct kprobe *get_kprobe(void *addr); + +int register_kprobe(struct kprobe *p); +void unregister_kprobe(struct kprobe *p); +#else +static inline int kprobe_running(void) { return 0; } +static inline int register_kprobe(struct kprobe *p) { return -ENOSYS; } +static inline void unregister_kprobe(struct kprobe *p) { } +#endif +#endif /* _LINUX_KPROBES_H */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/lockmeter.h 900-mjb5/include/linux/lockmeter.h --- 001-bk10/include/linux/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 900-mjb5/include/linux/lockmeter.h Sun Mar 16 13:39:06 2003 @@ -0,0 +1,320 @@ +/* + * Copyright (C) 1999-2002 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) Feb-Apr 2000 + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks + * Added support for hold time statistics for read and write + * locks. + * Moved machine dependent code to include/asm/lockmeter.h. + * + */ + +#ifndef _LINUX_LOCKMETER_H +#define _LINUX_LOCKMETER_H + + +/*--------------------------------------------------- + * architecture-independent lockmeter.h + *-------------------------------------------------*/ + +/* + * raybry -- version 2: added efficient hold time statistics + * requires lstat recompile, so flagged as new version + * raybry -- version 3: added global reader lock data + * hawkes -- version 4: removed some unnecessary fields to simplify mips64 port + */ +#define LSTAT_VERSION 5 + +int lstat_update(void*, void*, int); +int lstat_update_time(void*, void*, int, uint32_t); + +/* + * Currently, the mips64 and sparc64 kernels talk to a 32-bit lockstat, so we + * need to force compatibility in the inter-communication data structure. + */ + +#if defined(CONFIG_MIPS32_COMPAT) +#define TIME_T uint32_t +#elif defined(CONFIG_SPARC32_COMPAT) +#define TIME_T uint64_t +#else +#define TIME_T time_t +#endif + +#if defined(__KERNEL__) || (!defined(CONFIG_MIPS32_COMPAT) && !defined(CONFIG_SPARC32_COMPAT)) || (_MIPS_SZLONG==32) +#define POINTER void * +#else +#define POINTER int64_t +#endif + +/* + * Values for the "action" parameter passed to lstat_update. + * ZZZ - do we want a try-success status here??? + */ +#define LSTAT_ACT_NO_WAIT 0 +#define LSTAT_ACT_SPIN 1 +#define LSTAT_ACT_REJECT 2 +#define LSTAT_ACT_WW_SPIN 3 +#define LSTAT_ACT_SLEPT 4 /* UNUSED */ + +#define LSTAT_ACT_MAX_VALUES 4 /* NOTE: Increase to 5 if use ACT_SLEPT */ + +/* + * Special values for the low 2 bits of an RA passed to + * lstat_update. + */ +/* we use these values to figure out what kind of lock data */ +/* is stored in the statistics table entry at index ....... */ +#define LSTAT_RA_SPIN 0 /* spin lock data */ +#define LSTAT_RA_READ 1 /* read lock statistics */ +#define LSTAT_RA_SEMA 2 /* RESERVED */ +#define LSTAT_RA_WRITE 3 /* write lock statistics*/ + +#define LSTAT_RA(n) \ + ((void*)( ((unsigned long)__builtin_return_address(0) & ~3) | n) ) + +/* + * Constants used for lock addresses in the lstat_directory + * to indicate special values of the lock address. + */ +#define LSTAT_MULTI_LOCK_ADDRESS NULL + +/* + * Maximum size of the lockstats tables. Increase this value + * if its not big enough. (Nothing bad happens if its not + * big enough although some locks will not be monitored.) 
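As an illustration of the kprobes interface declared above: a probe is a struct kprobe
carrying the address of the instruction to intercept plus optional pre/post/fault
handlers, and it is armed with register_kprobe(). A minimal sketch follows; the probed
address is a placeholder that would normally come from System.map, and nothing in this
example is taken from the patch itself:

/* Hypothetical module that logs every hit on one kernel instruction. */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

static void example_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
        printk(KERN_INFO "kprobe hit at %p\n", p->addr);
}

static struct kprobe example_probe = {
        /* placeholder text address; use a real symbol address in practice */
        .addr           = (kprobe_opcode_t *) 0xc0100000,
        .pre_handler    = example_pre_handler,
};

static int __init example_probe_init(void)
{
        return register_kprobe(&example_probe);
}

static void __exit example_probe_exit(void)
{
        unregister_kprobe(&example_probe);
}

module_init(example_probe_init);
module_exit(example_probe_exit);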
+ * We record overflows of this quantity in lstat_control.dir_overflows + * + * Note: The max value here must fit into the field set + * and obtained by the macro's PUT_INDEX() and GET_INDEX(). + * This value depends on how many bits are available in the + * lock word in the particular machine implementation we are on. + */ +#define LSTAT_MAX_STAT_INDEX 2000 + +/* + * Size and mask for the hash table into the directory. + */ +#define LSTAT_HASH_TABLE_SIZE 4096 /* must be 2**N */ +#define LSTAT_HASH_TABLE_MASK (LSTAT_HASH_TABLE_SIZE-1) + +#define DIRHASH(ra) ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK) + +/* + * This defines an entry in the lockstat directory. It contains + * information about a lock being monitored. + * A directory entry only contains the lock identification - + * counts on usage of the lock are kept elsewhere in a per-cpu + * data structure to minimize cache line pinging. + */ +typedef struct { + POINTER caller_ra; /* RA of code that set lock */ + POINTER lock_ptr; /* lock address */ + ushort next_stat_index; /* Used to link multiple locks that have the same hash table value */ +} lstat_directory_entry_t; + +/* + * A multi-dimensioned array used to contain counts for lock accesses. + * The array is 3-dimensional: + * - CPU number. Keep from thrashing cache lines between CPUs + * - Directory entry index. Identifies the lock + * - Action. Indicates what kind of contention occurred on an + * access to the lock. + * + * The index of an entry in the directory is the same as the 2nd index + * of the entry in the counts array. + */ +/* + * This table contains data for spin_locks, write locks, and read locks + * Not all data is used for all cases. In particular, the hold time + * information is not stored here for read locks since that is a global + * (e. g. cannot be separated out by return address) quantity. + * See the lstat_read_lock_counts_t structure for the global read lock + * hold time. + */ +typedef struct { + uint64_t cum_wait_ticks; /* sum of wait times */ + /* for write locks, sum of time a */ + /* writer is waiting for a reader */ + int64_t cum_hold_ticks; /* cumulative sum of holds */ + /* not used for read mode locks */ + /* must be signed. ............... */ + uint32_t max_wait_ticks; /* max waiting time */ + uint32_t max_hold_ticks; /* max holding time */ + uint64_t cum_wait_ww_ticks; /* sum times writer waits on writer*/ + uint32_t max_wait_ww_ticks; /* max wait time writer vs writer */ + /* prev 2 only used for write locks*/ + uint32_t acquire_time; /* time lock acquired this CPU */ + uint32_t count[LSTAT_ACT_MAX_VALUES]; +} lstat_lock_counts_t; + +typedef lstat_lock_counts_t lstat_cpu_counts_t[LSTAT_MAX_STAT_INDEX]; + +/* + * User request to: + * - turn statistic collection on/off, or to reset + */ +#define LSTAT_OFF 0 +#define LSTAT_ON 1 +#define LSTAT_RESET 2 +#define LSTAT_RELEASE 3 + +#define LSTAT_MAX_READ_LOCK_INDEX 1000 +typedef struct { + POINTER lock_ptr; /* address of lock for output stats */ + uint32_t read_lock_count; + int64_t cum_hold_ticks; /* sum of read lock hold times over */ + /* all callers. ....................*/ + uint32_t write_index; /* last write lock hash table index */ + uint32_t busy_periods; /* count of busy periods ended this */ + uint64_t start_busy; /* time this busy period started. ..*/ + uint64_t busy_ticks; /* sum of busy periods this lock. ..*/ + uint64_t max_busy; /* longest busy period for this lock*/ + uint32_t max_readers; /* maximum number of readers ...... 
*/ +#ifdef USER_MODE_TESTING + rwlock_t entry_lock; /* lock for this read lock entry... */ + /* avoid having more than one rdr at*/ + /* needed for user space testing... */ + /* not needed for kernel 'cause it */ + /* is non-preemptive. ............. */ +#endif +} lstat_read_lock_counts_t; +typedef lstat_read_lock_counts_t lstat_read_lock_cpu_counts_t[LSTAT_MAX_READ_LOCK_INDEX]; + +#if defined(__KERNEL__) || defined(USER_MODE_TESTING) + +#ifndef USER_MODE_TESTING +#include +#else +#include "asm_newlockmeter.h" +#endif + +/* + * Size and mask for the hash table into the directory. + */ +#define LSTAT_HASH_TABLE_SIZE 4096 /* must be 2**N */ +#define LSTAT_HASH_TABLE_MASK (LSTAT_HASH_TABLE_SIZE-1) + +#define DIRHASH(ra) ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK) + +/* + * This version eliminates the per processor lock stack. What we do is to + * store the index of the lock hash structure in unused bits in the lock + * itself. Then on unlock we can find the statistics record without doing + * any additional hash or lock stack lookup. This works for spin_locks. + * Hold time reporting is now basically as cheap as wait time reporting + * so we ignore the difference between LSTAT_ON_HOLD and LSTAT_ON_WAIT + * as in version 1.1.* of lockmeter. + * + * For rw_locks, we store the index of a global reader stats structure in + * the lock and the writer index is stored in the latter structure. + * For read mode locks we hash at the time of the lock to find an entry + * in the directory for reader wait time and the like. + * At unlock time for read mode locks, we update just the global structure + * so we don't need to know the reader directory index value at unlock time. + * + */ + +/* + * Protocol to change lstat_control.state + * This is complicated because we don't want the cum_hold_time for + * a rw_lock to be decremented in _read_lock_ without making sure it + * is incremented in _read_lock_ and vice versa. So here is the + * way we change the state of lstat_control.state: + * I. To Turn Statistics On + * After allocating storage, set lstat_control.state non-zero. + * This works because we don't start updating statistics for in use + * locks until the reader lock count goes to zero. + * II. To Turn Statistics Off: + * (0) Disable interrupts on this CPU + * (1) Seize the lstat_control.directory_lock + * (2) Obtain the current value of lstat_control.next_free_read_lock_index + * (3) Store a zero in lstat_control.state. + * (4) Release the lstat_control.directory_lock + * (5) For each lock in the read lock list up to the saved value + * (well, -1) of the next_free_read_lock_index, do the following: + * (a) Check validity of the stored lock address + * by making sure that the word at the saved addr + * has an index that matches this entry. If not + * valid, then skip this entry. + * (b) If there is a write lock already set on this lock, + * skip to (d) below. + * (c) Set a non-metered write lock on the lock + * (d) set the cached INDEX in the lock to zero + * (e) Release the non-metered write lock. + * (6) Re-enable interrupts + * + * These rules ensure that a read lock will not have its statistics + * partially updated even though the global lock recording state has + * changed. See put_lockmeter_info() for implementation. + * + * The reason for (b) is that there may be write locks set on the + * syscall path to put_lockmeter_info() from user space. If we do + * not do this check, then we can deadlock. A similar problem would + * occur if the lock was read locked by the current CPU. 
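To make the bookkeeping above concrete: the metered lock primitives (declared later in
this patch in include/linux/spinlock.h as _metered_spin_lock() and friends) are expected
to tag the caller's return address with LSTAT_RA() and feed the outcome of each
acquisition to lstat_update(). The fragment below is only a plausible shape of that
call -- the argument order for lstat_update() is an assumption, and the real
implementation lives in kernel/lockmeter.c:

/* Sketch: record the outcome of one spin lock acquisition.
 * "contended" is an assumed flag computed by the real lock path. */
static inline void example_record_spin(spinlock_t *lock, int contended)
{
        /* tag the RA so the directory entry is known to hold spin lock data */
        void *ra = LSTAT_RA(LSTAT_RA_SPIN);

        /* assumed order: (lock address, tagged return address, action) */
        lstat_update(lock, ra, contended ? LSTAT_ACT_SPIN : LSTAT_ACT_NO_WAIT);
}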
At the + * moment this does not appear to happen. + */ + +/* + * Main control structure for lockstat. Used to turn statistics on/off + * and to maintain directory info. + */ +typedef struct { + int state; + spinlock_t control_lock; /* used to serialize turning statistics on/off */ + spinlock_t directory_lock; /* for serialize adding entries to directory */ + volatile int next_free_dir_index;/* next free entry in the directory */ + /* FIXME not all of these fields are used / needed .............. */ + /* the following fields represent data since */ + /* first "lstat on" or most recent "lstat reset" */ + TIME_T first_started_time; /* time when measurement first enabled */ + TIME_T started_time; /* time when measurement last started */ + TIME_T ending_time; /* time when measurement last disabled */ + uint64_t started_cycles64; /* cycles when measurement last started */ + uint64_t ending_cycles64; /* cycles when measurement last disabled */ + uint64_t enabled_cycles64; /* total cycles with measurement enabled */ + int intervals; /* number of measurement intervals recorded */ + /* i. e. number of times did lstat on;lstat off */ + lstat_directory_entry_t *dir; /* directory */ + int dir_overflow; /* count of times ran out of space in directory */ + int rwlock_overflow; /* count of times we couldn't allocate a rw block*/ + ushort *hashtab; /* hash table for quick dir scans */ + lstat_cpu_counts_t *counts[NR_CPUS]; /* Array of pointers to per-cpu stats */ + int next_free_read_lock_index; /* next rwlock reader (global) stats block */ + lstat_read_lock_cpu_counts_t *read_lock_counts[NR_CPUS]; /* per cpu read lock stats */ +} lstat_control_t; + +#endif /* defined(__KERNEL__) || defined(USER_MODE_TESTING) */ + +typedef struct { + short lstat_version; /* version of the data */ + short state; /* the current state is returned */ + int maxcpus; /* Number of cpus present */ + int next_free_dir_index; /* index of the next free directory entry */ + TIME_T first_started_time; /* when measurement enabled for first time */ + TIME_T started_time; /* time in secs since 1969 when stats last turned on */ + TIME_T ending_time; /* time in secs since 1969 when stats last turned off */ + uint32_t cycleval; /* cycles per second */ +#ifdef notyet + void *kernel_magic_addr; /* address of kernel_magic */ + void *kernel_end_addr; /* contents of kernel magic (points to "end") */ +#endif + int next_free_read_lock_index; /* index of next (global) read lock stats struct */ + uint64_t started_cycles64; /* cycles when measurement last started */ + uint64_t ending_cycles64; /* cycles when stats last turned off */ + uint64_t enabled_cycles64; /* total cycles with measurement enabled */ + int intervals; /* number of measurement intervals recorded */ + /* i.e. number of times we did lstat on;lstat off*/ + int dir_overflow; /* number of times we wanted more space in directory */ + int rwlock_overflow; /* # of times we wanted more space in read_locks_count */ + struct new_utsname uts; /* info about machine where stats are measured */ + /* -T option of lockstat allows data to be */ + /* moved to another machine. ................. 
*/ +} lstat_user_request_t; + +#endif /* _LINUX_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/major.h 900-mjb5/include/linux/major.h --- 001-bk10/include/linux/major.h Fri Dec 13 23:18:13 2002 +++ 900-mjb5/include/linux/major.h Sun Mar 16 13:39:02 2003 @@ -165,6 +165,8 @@ #define OSST_MAJOR 206 /* OnStream-SCx0 SCSI tape */ +#define CRASH_DUMP_MAJOR 221 /* crash dump interface */ + #define IBM_TTY3270_MAJOR 227 /* Official allocations now */ #define IBM_FS3270_MAJOR 228 diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/mm.h 900-mjb5/include/linux/mm.h --- 001-bk10/include/linux/mm.h Sun Mar 16 13:38:21 2003 +++ 900-mjb5/include/linux/mm.h Sun Mar 16 13:39:06 2003 @@ -107,6 +107,7 @@ struct vm_area_struct { #define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ +#define VM_NONLINEAR 0x00800000 /* VM contains nonlinear mappings */ #ifdef CONFIG_STACK_GROWSUP #define VM_STACK_FLAGS (VM_GROWSUP | VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT) @@ -126,7 +127,6 @@ struct vm_area_struct { */ extern pgprot_t protection_map[16]; - /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -139,8 +139,9 @@ struct vm_operations_struct { int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, unsigned long prot, unsigned long pgoff, int nonblock); }; -/* forward declaration; pte_chain is meant to be internal to rmap.c */ +/* forward declaration; pte_chain and mm_chain are meant to be internal to rmap.c */ struct pte_chain; +struct mm_chain; struct mmu_gather; /* @@ -171,6 +172,7 @@ struct page { struct pte_chain *chain;/* Reverse pte mapping pointer. * protected by PG_chainlock */ pte_addr_t direct; + int mapcount; } pte; unsigned long private; /* mapping-private opaque data */ @@ -196,6 +198,26 @@ struct page { */ #include +struct ptpage { + unsigned long flags; /* atomic flags, some possibly + updated asynchronously */ + atomic_t count; /* Usage count, see below. */ + unsigned long virtual; /* virtual address this page maps */ + unsigned short mapcount; /* Number of pages mapped to this page */ + unsigned short swapcount; /* Number of swap pages in this page */ + union { + struct mm_chain *mmchain;/* Reverse mm_struct mapping pointer */ + struct mm_struct *mmdirect; + } pte; + struct semaphore sem; +}; + +static inline void clear_pte_page(struct ptpage *ptepage) +{ + ClearPagePtepage(ptepage); + memset(&ptepage->sem, 0, sizeof(struct semaphore)); +} + /* * Methods to modify the page usage count. 
* @@ -400,14 +422,19 @@ struct file *shmem_file_setup(char * nam void shmem_lock(struct file * file, int lock); int shmem_zero_setup(struct vm_area_struct *); +void increment_rss(struct ptpage *ptpage); +void decrement_rss(struct ptpage *ptpage); +void increment_swapcount(struct ptpage *ptpage); +void decrement_swapcount(struct ptpage *ptpage); + void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, struct vm_area_struct *start_vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted); -void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, +void unmap_page_range(struct mmu_gather **tlb, struct vm_area_struct *vma, unsigned long address, unsigned long size); -void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr); +void unmap_all_pages(struct mm_struct *mm); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/netdevice.h 900-mjb5/include/linux/netdevice.h --- 001-bk10/include/linux/netdevice.h Fri Jan 17 09:18:32 2003 +++ 900-mjb5/include/linux/netdevice.h Sun Mar 16 13:39:02 2003 @@ -422,6 +422,9 @@ struct net_device unsigned char *haddr); int (*neigh_setup)(struct net_device *dev, struct neigh_parms *); int (*accept_fastpath)(struct net_device *, struct dst_entry*); +#define HAVE_POLL_CONTROLLER + void (*poll_controller)(struct net_device *dev); + int (*rx_hook)(struct sk_buff *skb); /* open/release and usage marking */ struct module *owner; diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/page-flags.h 900-mjb5/include/linux/page-flags.h --- 001-bk10/include/linux/page-flags.h Thu Feb 13 11:08:14 2003 +++ 900-mjb5/include/linux/page-flags.h Sun Mar 16 13:39:06 2003 @@ -74,6 +74,8 @@ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ #define PG_compound 19 /* Part of a compound page */ +#define PG_ptepage 20 /* This page is a pte page */ +#define PG_anon 22 /* Anonymous page */ /* * Global page accounting. One instance per CPU. 
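For the poll_controller hook added to struct net_device above: this is what lets
netdump/netconsole drive a NIC with interrupts disabled, and a driver that supports it
typically just re-runs its own interrupt handler around a disable_irq()/enable_irq()
pair. A driver-side sketch, with a hypothetical driver and ISR:

/* Hypothetical driver support for the netdump polling hook. */
static void example_interrupt(int irq, void *dev_id, struct pt_regs *regs);
                                /* the driver's existing ISR (assumed) */

static void example_poll_controller(struct net_device *dev)
{
        disable_irq(dev->irq);
        example_interrupt(dev->irq, dev, NULL);
        enable_irq(dev->irq);
}

static void example_setup(struct net_device *dev)
{
        /* ... usual dev->open, dev->hard_start_xmit, ... setup ... */
        dev->poll_controller = example_poll_controller;
}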
Only unsigned longs are @@ -247,6 +249,12 @@ extern void get_full_page_state(struct p #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags) #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags) +#define PagePtepage(page) test_bit(PG_ptepage, &(page)->flags) +#define SetPagePtepage(page) set_bit(PG_ptepage, &(page)->flags) +#define TestSetPagePtepage(page) test_and_set_bit(PG_ptepage, &(page)->flags) +#define ClearPagePtepage(page) clear_bit(PG_ptepage, &(page)->flags) +#define TestClearPagePtepage(page) test_and_clear_bit(PG_ptepage, &(page)->flags) + #define PageReclaim(page) test_bit(PG_reclaim, &(page)->flags) #define SetPageReclaim(page) set_bit(PG_reclaim, &(page)->flags) #define ClearPageReclaim(page) clear_bit(PG_reclaim, &(page)->flags) @@ -255,6 +263,10 @@ extern void get_full_page_state(struct p #define PageCompound(page) test_bit(PG_compound, &(page)->flags) #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) + +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) /* * The PageSwapCache predicate doesn't use a PG_flag at this time, diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/ptshare.h 900-mjb5/include/linux/ptshare.h --- 001-bk10/include/linux/ptshare.h Wed Dec 31 16:00:00 1969 +++ 900-mjb5/include/linux/ptshare.h Sun Mar 16 13:39:06 2003 @@ -0,0 +1,159 @@ +#ifndef _LINUX_PTSHARE_H +#define _LINUX_PTSHARE_H + +#include + +#include +#include + +/* + * Lock primitives for the pte page. They're aliased to the + * pte chain lock in struct page, since pte pages can't have + * pte chains. + */ + + +static inline void pte_page_lock(struct ptpage *ptepage) +{ + pte_chain_lock((struct page *)ptepage); +} + +static inline int pte_page_trylock(struct ptpage *ptepage) +{ + return pte_chain_trylock((struct page *)ptepage); +} + +static inline void pte_page_unlock(struct ptpage *ptepage) +{ + pte_chain_unlock((struct page *)ptepage); +} + +/* + * Provide a primitive for taking a pmd entry and using it to + * get the corresponding pte_page_lock. This function takes + * the page_table_lock briefly to freeze the pmd entry, so it can + * only be used in places where the page_table_lock is not held. + * The pte page pointer is returned, since most callers will want it + * and it's handy. 
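The locking wrappers above are the intended way to reach a pte page's state; as the
comment describes, callers that do not already hold page_table_lock use
pte_page_lock_pmd() (defined just below) to freeze the pmd entry and get back the
locked pte page, then release it with pte_page_unlock(). A small sketch of that calling
pattern, purely illustrative:

/* Sketch of the expected calling pattern; the body is a placeholder. */
static void example_touch_pte_page(struct mm_struct *mm, pmd_t *pmd)
{
        struct ptpage *ptepage;

        ptepage = pte_page_lock_pmd(mm, pmd);   /* returns with pte_page_lock held */
        /* ... examine or update the ptes mapped by this pte page ... */
        pte_page_unlock(ptepage);
}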
+ */ + +static inline struct ptpage *pte_page_lock_pmd(struct mm_struct *mm, pmd_t *pmd) +{ + struct ptpage *ptepage; + + spin_lock(&mm->page_table_lock); + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); + spin_unlock(&mm->page_table_lock); + return ptepage; +} + +/* + * Functions to handle shared page tables + */ + +#ifdef CONFIG_SHAREPTE + +int zap_shared_range(struct mmu_gather **tlb, pmd_t *pmd, unsigned long address, + unsigned long end); +int zap_shared_pmd(struct mm_struct *mm, pmd_t *pmd); +pte_t *pte_alloc_unshare(struct mm_struct *mm, pmd_t *pmd, + unsigned long address); +pte_t *pte_map_unshare(struct mm_struct *mm, pmd_t *pmd, + unsigned long address); +int share_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma, pmd_t **prev_pmd); +void unshare_page_range(struct mm_struct *mm, unsigned long address, + unsigned long len); +pte_t *mprotect_shared_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, unsigned long end); +void mremap_unshare(struct mm_struct *mm, pmd_t *src_pmd, pmd_t *dst_pmd, + unsigned long src_addr, unsigned long dst_addr); +pte_t *pte_fault_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address, int write_access); +int fork_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma, pmd_t **prev_pmd); + +#else + +static inline void unshare_page_range(struct mm_struct *mm, + unsigned long address, unsigned long len) +{ + return; +} + +static inline int fork_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma, pmd_t **prev_pmd) +{ + return copy_page_range(dst, src, vma); +} + + +static inline int zap_shared_range(struct mmu_gather **tlb, pmd_t *pmd, + unsigned long address, unsigned long end) +{ + pte_page_lock(pmd_ptpage(*pmd)); + return 1; +} + +static inline int zap_shared_pmd(struct mm_struct *mm, pmd_t *pmd) +{ + return 1; +} + +static inline pte_t *pte_alloc_unshare(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + pte_t *pte; + + pte = pte_alloc_map(mm, pmd, address); + if (pte) + pte_page_lock(pmd_ptpage(*pmd)); + + return pte; +} + +static inline pte_t *pte_map_unshare(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + pte_t *pte; + + if (pmd_present(*pmd)) { + pte_page_lock(pmd_ptpage(*pmd)); + pte = pte_offset_map(pmd, address); + } else + pte = NULL; + + return pte; +} + +static inline pte_t * +mprotect_shared_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, unsigned long end) +{ + pte_page_lock(pmd_ptpage(*pmd)); + return pte_offset_map(pmd, address); +} + +static inline void +mremap_unshare(struct mm_struct *mm, pmd_t *src_pmd, pmd_t *dst_pmd, + unsigned long src_addr, unsigned long dst_addr) +{ + return; +} + +static inline pte_t * +pte_fault_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address, int write_access) +{ + pte_t *pte; + + pte = pte_alloc_map(mm, pmd, address); + if (pte) + pte_page_lock(pmd_ptpage(*pmd)); + + return pte; +} +#endif /* CONFIG_SHARE_PTE */ + +#endif diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/rmap-locking.h 900-mjb5/include/linux/rmap-locking.h --- 001-bk10/include/linux/rmap-locking.h Thu Jan 9 19:16:14 2003 +++ 900-mjb5/include/linux/rmap-locking.h Sun Mar 16 13:39:06 2003 @@ -28,6 +28,18 @@ static inline void pte_chain_lock(struct #endif } +static inline int pte_chain_trylock(struct page *page) +{ + preempt_disable(); +#ifdef CONFIG_SMP + if 
(test_and_set_bit(PG_chainlock, &page->flags)) { + preempt_enable(); + return 0; + } +#endif + return 1; +} + static inline void pte_chain_unlock(struct page *page) { #ifdef CONFIG_SMP diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/sched.h 900-mjb5/include/linux/sched.h --- 001-bk10/include/linux/sched.h Sun Mar 16 13:38:21 2003 +++ 900-mjb5/include/linux/sched.h Sun Mar 16 18:34:49 2003 @@ -69,7 +69,11 @@ struct exec_domain; * the EXP_n values would be 1981, 2034 and 2043 if still using only * 11 bit fractions. */ -extern unsigned long avenrun[]; /* Load averages */ +extern unsigned long avenrun[]; /* Load averages */ +extern unsigned long tasks_running[3]; /* Real load averages */ +extern unsigned long cpu_tasks_running[3][NR_CPUS]; /* Real load averages per cpu */ + +extern unsigned long tasks_running[]; /* Real load averages */ #define FSHIFT 11 /* nr of bits of precision */ #define FIXED_1 (1<fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */ static inline void task_lock(struct task_struct *p) diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/spinlock.h 900-mjb5/include/linux/spinlock.h --- 001-bk10/include/linux/spinlock.h Sat Feb 15 16:11:47 2003 +++ 900-mjb5/include/linux/spinlock.h Sun Mar 16 13:39:06 2003 @@ -183,6 +183,17 @@ typedef struct { #endif /* !SMP */ +#ifdef CONFIG_LOCKMETER +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock (spinlock_t *lock); +extern int _metered_spin_trylock(spinlock_t *lock); +extern void _metered_read_lock (rwlock_t *lock); +extern void _metered_read_unlock (rwlock_t *lock); +extern void _metered_write_lock (rwlock_t *lock); +extern void _metered_write_unlock (rwlock_t *lock); +extern int _metered_write_trylock(rwlock_t *lock); +#endif + /* * Define the various spin_lock and rw_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various @@ -317,20 +328,20 @@ do { \ #define spin_unlock_irqrestore(lock, flags) \ do { \ - _raw_spin_unlock(lock); \ + spin_unlock(lock); \ local_irq_restore(flags); \ preempt_enable(); \ } while (0) #define _raw_spin_unlock_irqrestore(lock, flags) \ do { \ - _raw_spin_unlock(lock); \ + spin_unlock(lock); \ local_irq_restore(flags); \ } while (0) #define spin_unlock_irq(lock) \ do { \ - _raw_spin_unlock(lock); \ + spin_unlock(lock); \ local_irq_enable(); \ preempt_enable(); \ } while (0) @@ -344,14 +355,14 @@ do { \ #define read_unlock_irqrestore(lock, flags) \ do { \ - _raw_read_unlock(lock); \ + read_unlock(lock); \ local_irq_restore(flags); \ preempt_enable(); \ } while (0) #define read_unlock_irq(lock) \ do { \ - _raw_read_unlock(lock); \ + read_unlock(lock); \ local_irq_enable(); \ preempt_enable(); \ } while (0) @@ -365,14 +376,14 @@ do { \ #define write_unlock_irqrestore(lock, flags) \ do { \ - _raw_write_unlock(lock); \ + write_unlock(lock); \ local_irq_restore(flags); \ preempt_enable(); \ } while (0) #define write_unlock_irq(lock) \ do { \ - _raw_write_unlock(lock); \ + write_unlock(lock); \ local_irq_enable(); \ preempt_enable(); \ } while (0) @@ -387,6 +398,35 @@ do { \ #define spin_trylock_bh(lock) ({ local_bh_disable(); preempt_disable(); \ _raw_spin_trylock(lock) ? 1 : \ ({preempt_enable(); local_bh_enable(); 0;});}) + +#ifdef CONFIG_LOCKMETER +#define spin_lock(lock) \ +do { \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while(0) + +#define spin_trylock(lock) ({preempt_disable(); _metered_spin_trylock(lock) ? 
\ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable(); \ +} while (0) + +#define read_lock(lock) ({preempt_disable(); _metered_read_lock(lock);}) +#define read_unlock(lock) ({_metered_read_unlock(lock); preempt_enable();}) +#define write_lock(lock) ({preempt_disable(); _metered_write_lock(lock);}) +#define write_unlock(lock) ({_metered_write_unlock(lock); preempt_enable();}) +#define write_trylock(lock) ({preempt_disable();_metered_write_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock_no_resched(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable_no_resched(); \ +} while (0) + +#endif /* !CONFIG_LOCKMETER */ /* "lock on reference count zero" */ #ifndef ATOMIC_DEC_AND_LOCK diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/sysctl.h 900-mjb5/include/linux/sysctl.h --- 001-bk10/include/linux/sysctl.h Tue Feb 25 23:03:51 2003 +++ 900-mjb5/include/linux/sysctl.h Sun Mar 16 13:39:02 2003 @@ -66,7 +66,8 @@ enum CTL_DEV=7, /* Devices */ CTL_BUS=8, /* Busses */ CTL_ABI=9, /* Binary emulation */ - CTL_CPU=10 /* CPU stuff (speed scaling, etc) */ + CTL_CPU=10, /* CPU stuff (speed scaling, etc) */ + CTL_SCHED=11, /* scheduler tunables */ }; /* CTL_BUS names: */ @@ -129,6 +130,8 @@ enum KERN_CADPID=54, /* int: PID of the process to notify on CAD */ KERN_PIDMAX=55, /* int: PID # limit */ KERN_CORE_PATTERN=56, /* string: pattern for core-file names */ + + KERN_DUMP=60, /* directory: dump parameters */ }; @@ -157,6 +160,21 @@ enum VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */ }; +/* Tunable scheduler parameters in /proc/sys/sched/ */ +enum { + SCHED_MIN_TIMESLICE=1, /* minimum process timeslice */ + SCHED_MAX_TIMESLICE=2, /* maximum process timeslice */ + SCHED_CHILD_PENALTY=3, /* penalty on fork to child */ + SCHED_PARENT_PENALTY=4, /* penalty on fork to parent */ + SCHED_EXIT_WEIGHT=5, /* penalty to parent of CPU hog child */ + SCHED_PRIO_BONUS_RATIO=6, /* percent of max prio given as bonus */ + SCHED_INTERACTIVE_DELTA=7, /* delta used to scale interactivity */ + SCHED_MAX_SLEEP_AVG=8, /* maximum sleep avg attainable */ + SCHED_STARVATION_LIMIT=9, /* no re-active if expired is starved */ + SCHED_NODE_THRESHOLD=10, /* NUMA node rebalance threshold */ + SCHED_IDLE_NODE_REBALANCE_RATIO=11, /* how often to global balance */ + SCHED_BUSY_NODE_REBALANCE_RATIO=12, /* how often to global balance */ +}; /* CTL_NET names: */ enum diff -urpN -X /home/fletch/.diff.exclude 001-bk10/include/linux/timex.h 900-mjb5/include/linux/timex.h --- 001-bk10/include/linux/timex.h Sun Nov 17 20:29:21 2002 +++ 900-mjb5/include/linux/timex.h Sun Mar 16 13:38:50 2003 @@ -76,7 +76,7 @@ #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 #else -# error You lose. +# error Please use a HZ value which is between 12 and 1536 #endif /* diff -urpN -X /home/fletch/.diff.exclude 001-bk10/init/Makefile 900-mjb5/init/Makefile --- 001-bk10/init/Makefile Wed Mar 5 07:37:08 2003 +++ 900-mjb5/init/Makefile Sun Mar 16 13:39:02 2003 @@ -1,6 +1,10 @@ # # Makefile for the linux kernel. # +ifdef CONFIG_CRASH_DUMP +EXTRA_TARGETS := kerntypes.o +CFLAGS_kerntypes.o := -gstabs +endif obj-y := main.o version.o mounts.o initramfs.o mounts-y := do_mounts.o @@ -23,3 +27,4 @@ $(obj)/version.o: include/linux/compile. 
include/linux/compile.h: FORCE @echo -n ' GEN $@' @sh $(srctree)/scripts/mkcompile_h $@ "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CC) $(CFLAGS)" + diff -urpN -X /home/fletch/.diff.exclude 001-bk10/init/kerntypes.c 900-mjb5/init/kerntypes.c --- 001-bk10/init/kerntypes.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/init/kerntypes.c Sun Mar 16 13:39:02 2003 @@ -0,0 +1,24 @@ +/* + * kerntypes.c + * + * Copyright (C) 2000 Tom Morano (tjm@sgi.com) and + * Matt D. Robinson (yakker@alacritech.com) + * + * Dummy module that includes headers for all kernel types of interest. + * The kernel type information is used by the lcrash utility when + * analyzing system crash dumps or the live system. Using the type + * information for the running system, rather than kernel header files, + * makes for a more flexible and robust analysis tool. + * + * This source code is released under version 2 of the GNU GPL. + */ +#include +#include +#include +#include +#include + +void +kerntypes_dummy(void) +{ +} diff -urpN -X /home/fletch/.diff.exclude 001-bk10/init/main.c 900-mjb5/init/main.c --- 001-bk10/init/main.c Sun Mar 16 13:38:21 2003 +++ 900-mjb5/init/main.c Sun Mar 16 13:39:02 2003 @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,10 @@ #include #endif +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + /* * Versions of gcc older than that listed below may actually compile * and link okay, but the end product can have subtle run time bugs. @@ -98,6 +103,16 @@ extern void ipc_init(void); int system_running = 0; /* + * The kernel_magic value represents the address of _end, which allows + * namelist tools to "match" each other respectively. That way a tool + * that looks at /dev/mem can verify that it is using the right System.map + * file -- if kernel_magic doesn't equal the namelist value of _end, + * something's wrong. + */ +extern unsigned long _end; +unsigned long *kernel_magic = &_end; + +/* * Boot command-line arguments */ #define MAX_INIT_ARGS 8 @@ -385,6 +400,7 @@ asmlinkage void __init start_kernel(void */ lock_kernel(); printk(linux_banner); + setup_early_printk(&command_line); setup_arch(&command_line); setup_per_cpu_areas(); @@ -455,6 +471,12 @@ asmlinkage void __init start_kernel(void * make syscalls (and thus be locked). 
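Regarding the kernel_magic export added to init/main.c above: the comment describes a
consistency check a namelist tool can make, namely that the value stored at kernel_magic
(whose address comes from System.map) equals System.map's _end. A userspace sketch of
that check, with the i386 PAGE_OFFSET used as an assumed virtual-to-physical conversion:

/* Userspace sketch of the System.map consistency check (assumptions:
 * i386, direct-mapped kernel at PAGE_OFFSET, addresses from System.map). */
#include <fcntl.h>
#include <sys/types.h>
#include <unistd.h>

#define KERNEL_PAGE_OFFSET 0xc0000000UL /* i386 default, an assumption */

static int check_kernel_magic(unsigned long kernel_magic_addr,
                              unsigned long end_addr)
{
        unsigned long value = 0;
        int fd = open("/dev/mem", O_RDONLY);

        if (fd < 0)
                return -1;
        /* kernel_magic holds &_end; fetch its contents from physical memory */
        if (lseek(fd, (off_t)(kernel_magic_addr - KERNEL_PAGE_OFFSET), SEEK_SET) < 0 ||
            read(fd, &value, sizeof(value)) != sizeof(value)) {
                close(fd);
                return -1;
        }
        close(fd);
        return value == end_addr ? 0 : 1;       /* 1 means System.map mismatch */
}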
*/ init_idle(current, smp_processor_id()); + +#ifdef CONFIG_X86_REMOTE_DEBUG + if (gdb_enter) { + gdb_hook(); /* right at boot time */ + } +#endif /* Do the rest non-__init'ed, we're now alive */ rest_init(); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/kernel/Makefile 900-mjb5/kernel/Makefile --- 001-bk10/kernel/Makefile Tue Feb 25 23:03:51 2003 +++ 900-mjb5/kernel/Makefile Sun Mar 16 13:39:06 2003 @@ -10,6 +10,7 @@ obj-y = sched.o fork.o exec_domain.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o +obj-$(CONFIG_LOCKMETER) += lockmeter.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += ksyms.o module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o @@ -18,6 +19,8 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o obj-$(CONFIG_COMPAT) += compat.o +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_KPROBES) += kprobes.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff -urpN -X /home/fletch/.diff.exclude 001-bk10/kernel/early_printk.c 900-mjb5/kernel/early_printk.c --- 001-bk10/kernel/early_printk.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/kernel/early_printk.c Sun Mar 16 13:38:49 2003 @@ -0,0 +1,209 @@ +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +static int current_ypos = 1, current_xpos = 0; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= MAX_YPOS) { + /* scroll 1 line up */ + for(k = 1, j = 0; k < MAX_YPOS; k++, j++) { + for(i = 0; i < MAX_XPOS; i++) { + writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), + VGABASE + 2*(MAX_XPOS*j + i)); + } + } + for(i = 0; i < MAX_XPOS; i++) { + writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); + } + current_ypos = MAX_YPOS-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(MAX_XPOS*current_ypos + current_xpos++)); + if (current_xpos >= MAX_XPOS) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ + +int early_serial_base; /* ttyS0 */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + rep_nop(); + outb(ch, early_serial_base + TXR); + return timeout ? 
0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + early_serial_putc(*s); + if (*s == '\n') + early_serial_putc('\r'); + s++; + } +} + +static __init void early_serial_init(char *opt) +{ + unsigned char c; + unsigned divisor, baud = DEFAULT_BAUD; + static int bases[] = SERIAL_BASES; + char *s, *e; + + early_serial_base = bases[0]; + + if (*opt == ',') + ++opt; + + s = strsep(&opt, ","); + if (s != NULL) { + unsigned port; + if (!strncmp(s,"0x",2)) + early_serial_base = simple_strtoul(s, &e, 16); + else { + if (!strncmp(s,"ttyS",4)) + s+=4; + port = simple_strtoul(s, &e, 10); + if (port > (SERIAL_BASES_LEN-1) || s == e) + port = 0; + early_serial_base = bases[port]; + } + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + s = strsep(&opt, ","); + if (s != NULL) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Direct interface for emergencies */ +struct console *early_console = &early_vga_console; +static int early_console_initialized = 0; + +void early_printk(const char *fmt, ...) +{ + char buf[512]; + int n; + va_list ap; + va_start(ap,fmt); + n = vsnprintf(buf,512,fmt,ap); + early_console->write(early_console,buf,n); + va_end(ap); +} + +static int keep_early; + +int __init setup_early_printk(char *opt) +{ + char *space, *s; + char buf[256]; + + s = strstr(opt, "earlyprintk="); + if (s == NULL) + return -1; + opt = s+12; + + if (early_console_initialized) + return -1; + + strncpy(buf,opt,256); + buf[255] = 0; + space = strchr(buf, ' '); + if (space) + *space = 0; + + if (strstr(buf,"keep")) + keep_early = 1; + + if (!strncmp(buf, "serial", 6)) { + early_serial_init(buf + 6); + early_console = &early_serial_console; + } else if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf); + early_console = &early_serial_console; + } else if (!strncmp(buf, "vga", 3)) { + early_console = &early_vga_console; + } else { + early_console = NULL; + return -1; + } + early_console_initialized = 1; + register_console(early_console); + early_printk( "early printk console registered\n" ); + return 0; +} + +void __init disable_early_printk(void) +{ + if (!early_console_initialized || !early_console) + return; + if (!keep_early) { + printk("disabling early console...\n"); + unregister_console(early_console); + early_console_initialized = 0; + } else { + printk("keeping early console.\n"); + } +} + +/* syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. 
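   For example (an illustrative command line derived from the syntax above,
   not spelled out in the patch):
       earlyprintk=serial,ttyS0,115200,keep
   sends early boot output to ttyS0 at 115200 baud and keeps the early
   console registered after the real console has taken over.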
*/ +__setup("earlyprintk=", setup_early_printk); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/kernel/fork.c 900-mjb5/kernel/fork.c --- 001-bk10/kernel/fork.c Sun Mar 16 13:38:21 2003 +++ 900-mjb5/kernel/fork.c Sun Mar 16 13:39:06 2003 @@ -30,6 +30,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -197,7 +201,10 @@ void __init fork_init(unsigned long memp * value: the thread structures can take up at most half * of memory. */ - max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; + if (THREAD_SIZE >= PAGE_SIZE) + max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; + else + max_threads = (mempages * (PAGE_SIZE/THREAD_SIZE)) / 8; /* * we need to allow at least 20 threads to boot a system */ @@ -248,6 +255,7 @@ static inline int dup_mmap(struct mm_str struct vm_area_struct * mpnt, *tmp, **pprev; int retval; unsigned long charge = 0; + pmd_t *prev_pmd = 0; down_write(&oldmm->mmap_sem); flush_cache_mm(current->mm); @@ -257,6 +265,7 @@ static inline int dup_mmap(struct mm_str mm->free_area_cache = TASK_UNMAPPED_BASE; mm->map_count = 0; mm->rss = 0; + atomic_set(&mm->ptepages, 0); mm->cpu_vm_mask = 0; pprev = &mm->mmap; @@ -311,7 +320,7 @@ static inline int dup_mmap(struct mm_str *pprev = tmp; pprev = &tmp->vm_next; mm->map_count++; - retval = copy_page_range(mm, current->mm, tmp); + retval = fork_page_range(mm, current->mm, tmp, &prev_pmd); spin_unlock(&mm->page_table_lock); if (tmp->vm_ops && tmp->vm_ops->open) diff -urpN -X /home/fletch/.diff.exclude 001-bk10/kernel/kprobes.c 900-mjb5/kernel/kprobes.c --- 001-bk10/kernel/kprobes.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/kernel/kprobes.c Sun Mar 16 13:38:57 2003 @@ -0,0 +1,89 @@ +/* Support for kernel probes. + (C) 2002 Vamsi Krishna S . +*/ +#include +#include +#include +#include +#include +#include +#include + +#define KPROBE_HASH_BITS 6 +#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) + +static struct list_head kprobe_table[KPROBE_TABLE_SIZE]; + +unsigned int kprobe_cpu = NR_CPUS; +static spinlock_t kprobe_lock = SPIN_LOCK_UNLOCKED; + +/* Locks kprobe: irqs must be disabled */ +void lock_kprobes(void) +{ + spin_lock(&kprobe_lock); + kprobe_cpu = smp_processor_id(); +} + +void unlock_kprobes(void) +{ + kprobe_cpu = NR_CPUS; + spin_unlock(&kprobe_lock); +} + +/* You have to be holding the kprobe_lock */ +struct kprobe *get_kprobe(void *addr) +{ + struct list_head *head, *tmp; + + head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; + list_for_each(tmp, head) { + struct kprobe *p = list_entry(tmp, struct kprobe, list); + if (p->addr == addr) + return p; + } + return NULL; +} + +int register_kprobe(struct kprobe *p) +{ + int ret = 0; + + spin_lock_irq(&kprobe_lock); + if (get_kprobe(p->addr)) { + ret = -EEXIST; + goto out; + } + list_add(&p->list, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); + + p->opcode = *p->addr; + *p->addr = BREAKPOINT_INSTRUCTION; + flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t)); + out: + spin_unlock_irq(&kprobe_lock); + return ret; +} + +void unregister_kprobe(struct kprobe *p) +{ + spin_lock_irq(&kprobe_lock); + *p->addr = p->opcode; + list_del(&p->list); + flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t)); + spin_unlock_irq(&kprobe_lock); +} + +static int __init init_kprobes(void) +{ + int i; + + /* FIXME allocate the probe table, currently defined statically */ + /* initialize all list heads */ + for (i = 0; i < KPROBE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kprobe_table[i]); + + return 0; +} +__initcall(init_kprobes); + 
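A minimal sketch of how a client module might use the two exported entry points, for illustration only: the header path, the probe target some_traced_function, and the cast through void * are assumptions (this file only shows the addr/opcode handling and the register/unregister functions), so treat it as a sketch rather than the patch's own usage.

	#include <linux/module.h>
	#include <linux/kprobes.h>		/* assumed location of struct kprobe */

	extern void some_traced_function(void);	/* hypothetical probe target */

	static struct kprobe example_probe;

	static int __init example_init(void)
	{
		/* register_kprobe() saves the original opcode and plants
		 * BREAKPOINT_INSTRUCTION at the probed address */
		example_probe.addr = (void *) some_traced_function;
		return register_kprobe(&example_probe);	/* -EEXIST if already probed */
	}

	static void __exit example_exit(void)
	{
		/* puts the saved opcode back and drops the hash table entry */
		unregister_kprobe(&example_probe);
	}

	module_init(example_init);
	module_exit(example_exit);
	MODULE_LICENSE("GPL");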
+EXPORT_SYMBOL_GPL(register_kprobe); +EXPORT_SYMBOL_GPL(unregister_kprobe); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/kernel/ksyms.c 900-mjb5/kernel/ksyms.c --- 001-bk10/kernel/ksyms.c Sun Mar 16 13:38:21 2003 +++ 900-mjb5/kernel/ksyms.c Sun Mar 16 18:34:49 2003 @@ -58,6 +58,8 @@ #include #include #include +#include +#include #include #if defined(CONFIG_PROC_FS) @@ -464,7 +466,12 @@ EXPORT_SYMBOL(sleep_on); EXPORT_SYMBOL(sleep_on_timeout); EXPORT_SYMBOL(interruptible_sleep_on); EXPORT_SYMBOL(interruptible_sleep_on_timeout); +#ifdef CONFIG_KGDB_THREAD +EXPORT_SYMBOL(kern_schedule); +EXPORT_SYMBOL(do_schedule); +#else EXPORT_SYMBOL(schedule); +#endif #ifdef CONFIG_PREEMPT EXPORT_SYMBOL(preempt_schedule); #endif @@ -608,8 +615,23 @@ EXPORT_SYMBOL(next_thread); EXPORT_SYMBOL(__per_cpu_offset); #endif +#if defined(CONFIG_LOCKMETER) +EXPORT_SYMBOL(_metered_spin_lock); +EXPORT_SYMBOL(_metered_spin_unlock); +EXPORT_SYMBOL(_metered_spin_trylock); +EXPORT_SYMBOL(_metered_read_lock); +EXPORT_SYMBOL(_metered_read_unlock); +EXPORT_SYMBOL(_metered_write_lock); +EXPORT_SYMBOL(_metered_write_unlock); +#endif + /* debug */ EXPORT_SYMBOL(dump_stack); EXPORT_SYMBOL(ptrace_notify); EXPORT_SYMBOL(current_kernel_time); + +#ifdef CONFIG_CRASH_DUMP_MODULE +EXPORT_SYMBOL(min_low_pfn); +EXPORT_SYMBOL(dump_oncpu); +#endif diff -urpN -X /home/fletch/.diff.exclude 001-bk10/kernel/lockmeter.c 900-mjb5/kernel/lockmeter.c --- 001-bk10/kernel/lockmeter.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/kernel/lockmeter.c Sun Mar 16 13:39:06 2003 @@ -0,0 +1,1088 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.c by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks + * Added support for hold time statistics for read and write + * locks. + */ + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#else +#define __SMP__ +#include +#include +#include +#include "bitops.h" +#include "user_scaffold.h" +#include +#include +#include "newlockmeter.h" +#endif + +#ifdef __KERNEL__ +#define ASSERT(cond) +#define bzero(loc,size) memset(loc,0,size) +#endif + +/*<---------------------------------------------------*/ +/* lockmeter.c */ +/*>---------------------------------------------------*/ + +#ifdef __KERNEL__ +static lstat_control_t lstat_control __cacheline_aligned = {LSTAT_OFF, SPIN_LOCK_UNLOCKED, SPIN_LOCK_UNLOCKED, 19*0, NR_CPUS*0, 0, NR_CPUS*0}; +#else +lstat_control_t lstat_control = {LSTAT_OFF, SPIN_LOCK_UNLOCKED, SPIN_LOCK_UNLOCKED, 19*0, NR_CPUS*0, 0, NR_CPUS*0}; +#endif + +int smp_num_cpus=NR_CPUS; + +#undef BUG +#define BUG() + +static ushort lstat_make_dir_entry(void *, void *); + +/* + * lstat_lookup + * + * Given a RA, locate the directory entry for the lock. 
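 * Entries are hashed by the caller's return address (DIRHASH) and
 * collision-chained through next_stat_index; walking back to index 0
 * means no entry exists yet, in which case lstat_make_dir_entry()
 * allocates one.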
+ */ +static ushort +lstat_lookup( + void *lock_ptr, + void *caller_ra) +{ + ushort index; + lstat_directory_entry_t *dirp; + + dirp = lstat_control.dir; + + index = lstat_control.hashtab[DIRHASH(caller_ra)]; + while (dirp[index].caller_ra != caller_ra) { + if (index == 0) { + return(lstat_make_dir_entry(lock_ptr, caller_ra)); + } + index = dirp[index].next_stat_index; + } + + if (dirp[index].lock_ptr != NULL && + dirp[index].lock_ptr != lock_ptr) { + dirp[index].lock_ptr = NULL; + } + + return(index); +} + + +/* + * lstat_make_dir_entry + * Called to add a new lock to the lock directory. + */ +static ushort +lstat_make_dir_entry( + void *lock_ptr, + void *caller_ra) +{ + lstat_directory_entry_t *dirp; + ushort index, hindex; + unsigned long flags; + + /* lock the table without recursively reentering this metering code */ + do { local_irq_save(flags); + _raw_spin_lock(&lstat_control.directory_lock); } while(0); + + hindex = DIRHASH(caller_ra); + index = lstat_control.hashtab[hindex]; + dirp = lstat_control.dir; + while (index && dirp[index].caller_ra != caller_ra) + index = dirp[index].next_stat_index; + + if (index == 0) { + if(lstat_control.next_free_dir_index < LSTAT_MAX_STAT_INDEX) { + index = lstat_control.next_free_dir_index++; + lstat_control.dir[index].caller_ra = caller_ra; + lstat_control.dir[index].lock_ptr = lock_ptr; + lstat_control.dir[index].next_stat_index = lstat_control.hashtab[hindex]; + lstat_control.hashtab[hindex] = index; + } else { + lstat_control.dir_overflow++; + } + } + + do { _raw_spin_unlock(&lstat_control.directory_lock); + local_irq_restore(flags);} while(0); + return(index); +} + +int +lstat_update ( + void *lock_ptr, + void *caller_ra, + int action) +{ + int index; + int cpu; + + ASSERT(action < LSTAT_ACT_MAX_VALUES); + + if (lstat_control.state == LSTAT_OFF) { + return(0); + } + + index = lstat_lookup(lock_ptr, caller_ra); + cpu = THIS_CPU_NUMBER; + (*lstat_control.counts[cpu])[index].count[action]++; + (*lstat_control.counts[cpu])[index].acquire_time = get_cycles(); + + return(index); +} + +int +lstat_update_time ( + void *lock_ptr, + void *caller_ra, + int action, + uint32_t ticks) +{ + ushort index; + int cpu; + + ASSERT(action < LSTAT_ACT_MAX_VALUES); + + if (lstat_control.state == LSTAT_OFF) { + return(0); + } + + index = lstat_lookup(lock_ptr, caller_ra); + cpu = THIS_CPU_NUMBER; + (*lstat_control.counts[cpu])[index].count[action]++; + (*lstat_control.counts[cpu])[index].cum_wait_ticks += (uint64_t)ticks; + if ((*lstat_control.counts[cpu])[index].max_wait_ticks < ticks) + (*lstat_control.counts[cpu])[index].max_wait_ticks = ticks; + + (*lstat_control.counts[cpu])[index].acquire_time = get_cycles(); + + return(index); +} + +void _metered_spin_lock(spinlock_t *lock_ptr) +{ + if (lstat_control.state == LSTAT_OFF) { + _raw_spin_lock(lock_ptr); /* do the real lock */ + PUT_INDEX(lock_ptr,0); /* clean index in case lockmetering */ + /* gets turned on before unlock */ + } else { + void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); + int index; + + if (_raw_spin_trylock(lock_ptr)) { + index = lstat_update(lock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + } else { + uint32_t start_cycles = get_cycles(); + _raw_spin_lock(lock_ptr); /* do the real lock */ + index = lstat_update_time(lock_ptr, this_pc, LSTAT_ACT_SPIN, + get_cycles() - start_cycles); + } + /* save the index in the lock itself for use in spin unlock */ + PUT_INDEX(lock_ptr,index); + } +} + +int _metered_spin_trylock(spinlock_t *lock_ptr) +{ + if (lstat_control.state == LSTAT_OFF) { + return 
_raw_spin_trylock(lock_ptr); + } else { + int retval; + void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); + + if ((retval = _raw_spin_trylock(lock_ptr))) { + int index = lstat_update(lock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* save the index in the lock itself for use in spin unlock */ + PUT_INDEX(lock_ptr,index); + } else { + lstat_update(lock_ptr, this_pc, LSTAT_ACT_REJECT); + } + + return retval; + } +} + +void _metered_spin_unlock(spinlock_t *lock_ptr) +{ + int index=-1; + + if (lstat_control.state != LSTAT_OFF) { + index = GET_INDEX(lock_ptr); + /* + * If statistics were turned off when we set the lock, + * then the index can be zero. If that is the case, + * then collect no stats on this call. + */ + if (index > 0) { + uint32_t hold_time; + int cpu = THIS_CPU_NUMBER; + hold_time = get_cycles() - (*lstat_control.counts[cpu])[index].acquire_time; + (*lstat_control.counts[cpu])[index].cum_hold_ticks += (uint64_t)hold_time; + if ((*lstat_control.counts[cpu])[index].max_hold_ticks < hold_time) + (*lstat_control.counts[cpu])[index].max_hold_ticks = hold_time; + } + } + + /* make sure we don't have a stale index value saved */ + PUT_INDEX(lock_ptr,0); + _raw_spin_unlock(lock_ptr); /* do the real unlock */ +} + +/* + * allocate the next global read lock structure and store its index + * in the rwlock at "lock_ptr". + */ +uint32_t alloc_rwlock_struct(rwlock_t *rwlock_ptr) +{ + int index; + int flags; + int cpu=THIS_CPU_NUMBER; + + /* If we've already overflowed, then do a quick exit */ + if (lstat_control.next_free_read_lock_index > LSTAT_MAX_READ_LOCK_INDEX) { + lstat_control.rwlock_overflow++; + return(0); + } + + do { local_irq_save(flags); + _raw_spin_lock(&lstat_control.directory_lock); } while(0); + + /* It is possible this changed while we were waiting for the directory_lock */ + if (lstat_control.state == LSTAT_OFF) { + index=0; + goto unlock; + } + + /* It is possible someone else got here first and set the index */ + if ((index=GET_RWINDEX(rwlock_ptr)) == 0) { + + /* we can't turn on read stats for this lock while there are readers */ + /* (this would mess up the running hold time sum at unlock time) */ + if (RWLOCK_READERS(rwlock_ptr) != 0) { + index=0; + goto unlock; + } + + /* if stats are turned on after being off, we may need to return an old */ + /* index from when the statistics were on last time. ................... 
*/ + for(index=1;index= LSTAT_MAX_READ_LOCK_INDEX) { + lstat_control.rwlock_overflow++; + index = 0; + goto unlock; + } + index = lstat_control.next_free_read_lock_index++; + + /* initialize the global read stats data structure for each cpu */ + for(cpu=0; cpu < smp_num_cpus; cpu++) { + (*lstat_control.read_lock_counts[cpu])[index].lock_ptr = rwlock_ptr; + } +put_index_and_unlock: + /* store the index for the read lock structure into the lock */ + PUT_RWINDEX(rwlock_ptr,index); + } + +unlock: + do { _raw_spin_unlock(&lstat_control.directory_lock); + local_irq_restore(flags);} while(0); + + return(index); +} + +void +_metered_read_lock(rwlock_t *rwlock_ptr) +{ + void *this_pc; + uint32_t start_cycles; + int index; + int cpu; + int flags; + int readers_before, readers_after; + uint64_t cycles64; + + if (lstat_control.state == LSTAT_OFF) { + _raw_read_lock(rwlock_ptr); + /* clean index in case lockmetering turns on before an unlock */ + PUT_RWINDEX(rwlock_ptr, 0); + return; + } + + this_pc = LSTAT_RA(LSTAT_RA_READ); + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* allocate the global stats entry for this lock, if needed */ + if (index==0) { + index = alloc_rwlock_struct(rwlock_ptr); + } + + readers_before = RWLOCK_READERS(rwlock_ptr); + if (_raw_read_trylock(rwlock_ptr)) { + /* + * We have decremented the lock to count a new reader, + * and have confirmed that no writer has it locked. + */ + /* update statistics if enabled */ + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do { local_irq_save(flags); } while(0); +#endif + lstat_update((void *)rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* preserve value of TSC so cum_hold_ticks and start_busy use same value */ + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks -= cycles64; + + /* record time and cpu of start of busy period */ + /* this is not perfect (some race conditions are possible) */ + if (readers_before==0) { + (*lstat_control.read_lock_counts[cpu])[index].start_busy = cycles64; + PUT_RW_CPU(rwlock_ptr, cpu); + } + readers_after=RWLOCK_READERS(rwlock_ptr); + if (readers_after > (*lstat_control.read_lock_counts[cpu])[index].max_readers) + (*lstat_control.read_lock_counts[cpu])[index].max_readers = readers_after; +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t*)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } + + return; + } + /* If we get here, then we could not quickly grab the read lock */ + + start_cycles = get_cycles(); /* start counting the wait time */ + + /* Now spin until read_lock is successful */ + _raw_read_lock(rwlock_ptr); + + lstat_update_time((void *)rwlock_ptr, this_pc, LSTAT_ACT_SPIN, + get_cycles() - start_cycles); + + /* update statistics if they are enabled for this lock */ + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do { local_irq_save(flags); } while(0); +#endif + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks -= cycles64; + + /* this is not perfect (some race conditions are possible) */ + if (readers_before==0) { + (*lstat_control.read_lock_counts[cpu])[index].start_busy = cycles64; + PUT_RW_CPU(rwlock_ptr, cpu); + } + readers_after=RWLOCK_READERS(rwlock_ptr); + if (readers_after > (*lstat_control.read_lock_counts[cpu])[index].max_readers) + 
(*lstat_control.read_lock_counts[cpu])[index].max_readers = readers_after; + +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } +} + +void _metered_read_unlock(rwlock_t *rwlock_ptr) +{ + int index; + int cpu; + int flags; + uint64_t busy_length; + uint64_t cycles64; + + if (lstat_control.state == LSTAT_OFF) { + _raw_read_unlock(rwlock_ptr); + return; + } + + index = GET_RWINDEX(rwlock_ptr); + cpu = THIS_CPU_NUMBER; + + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + /* updates below are non-atomic */ + do { local_irq_save(flags); } while(0); +#endif + /* preserve value of TSC so cum_hold_ticks and busy_ticks are consistent.. */ + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks += cycles64; + (*lstat_control.read_lock_counts[cpu])[index].read_lock_count++; + + /* once again, this is not perfect (some race conditions are possible) */ + if (RWLOCK_READERS(rwlock_ptr) == 1) { + int cpu1 = GET_RW_CPU(rwlock_ptr); + uint64_t last_start_busy = (*lstat_control.read_lock_counts[cpu1])[index].start_busy; + (*lstat_control.read_lock_counts[cpu])[index].busy_periods++; + if (cycles64 > last_start_busy) { + busy_length = cycles64 - last_start_busy; + (*lstat_control.read_lock_counts[cpu])[index].busy_ticks += busy_length; + if (busy_length > (*lstat_control.read_lock_counts[cpu])[index].max_busy) + (*lstat_control.read_lock_counts[cpu])[index].max_busy = busy_length; + } + } +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } + + /* unlock the lock */ + _raw_read_unlock(rwlock_ptr); +} + +void _metered_write_lock(rwlock_t *rwlock_ptr) +{ + uint32_t start_cycles; + void *this_pc; + uint32_t spin_ticks = 0; /* in anticipation of a potential wait */ + int index; + int write_index = 0; + int cpu; + enum {writer_writer_conflict, writer_reader_conflict} why_wait = writer_writer_conflict; + + if (lstat_control.state == LSTAT_OFF) { + _raw_write_lock(rwlock_ptr); + /* clean index in case lockmetering turns on before an unlock */ + PUT_RWINDEX(rwlock_ptr, 0); + return; + } + + this_pc = LSTAT_RA(LSTAT_RA_WRITE); + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* allocate the global stats entry for this lock, if needed */ + if (index == 0) { + index = alloc_rwlock_struct(rwlock_ptr); + } + + if (_raw_write_trylock(rwlock_ptr)) { + /* We acquired the lock on the first try */ + write_index = lstat_update((void *)rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* save the write_index for use in unlock if stats enabled */ + if (index > 0) + (*lstat_control.read_lock_counts[cpu])[index].write_index = write_index; + return; + } + + /* If we get here, then we could not quickly grab the write lock */ + start_cycles = get_cycles(); /* start counting the wait time */ + + why_wait = RWLOCK_READERS(rwlock_ptr) ? 
writer_reader_conflict : writer_writer_conflict; + + /* Now set the lock and wait for conflicts to disappear */ + _raw_write_lock(rwlock_ptr); + + spin_ticks = get_cycles() - start_cycles; + + /* update stats -- if enabled */ + if (index > 0) + if (spin_ticks) { + if (why_wait == writer_reader_conflict) { + /* waited due to a reader holding the lock */ + write_index = lstat_update_time((void *)rwlock_ptr, this_pc, + LSTAT_ACT_SPIN, spin_ticks); + } else { + /* waited due to another writer holding the lock */ + write_index = lstat_update_time((void *)rwlock_ptr, this_pc, + LSTAT_ACT_WW_SPIN, spin_ticks); + (*lstat_control.counts[cpu])[write_index].cum_wait_ww_ticks += spin_ticks; + if (spin_ticks > + (*lstat_control.counts[cpu])[write_index].max_wait_ww_ticks) { + (*lstat_control.counts[cpu])[write_index].max_wait_ww_ticks = spin_ticks; + } + } + + /* save the directory index for use on write_unlock */ + (*lstat_control.read_lock_counts[cpu])[index].write_index = write_index; + } + +} + +void +_metered_write_unlock(rwlock_t *rwlock_ptr) +{ + int index; + int cpu; + int write_index; + uint32_t hold_time; + + if (lstat_control.state == LSTAT_OFF) { + _raw_write_unlock(rwlock_ptr); + return; + } + + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* update statistics if stats enabled for this lock */ + if (index>0) { + write_index = (*lstat_control.read_lock_counts[cpu])[index].write_index; + + hold_time = get_cycles() - (*lstat_control.counts[cpu])[write_index].acquire_time; + (*lstat_control.counts[cpu])[write_index].cum_hold_ticks += (uint64_t)hold_time; + if ((*lstat_control.counts[cpu])[write_index].max_hold_ticks < hold_time) + (*lstat_control.counts[cpu])[write_index].max_hold_ticks = hold_time; + } + _raw_write_unlock(rwlock_ptr); +} + +int _metered_write_trylock(rwlock_t *rwlock_ptr) +{ + int retval; + void *this_pc = LSTAT_RA(LSTAT_RA_WRITE); + + if ((retval = _raw_write_trylock(rwlock_ptr))) { + lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + } else { + lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_REJECT); + } + + return retval; +} + +#ifdef __KERNEL__ +static void +init_control_space(void) +{ + /* Set all control space pointers to null and indices to "empty" */ + int cpu; + + /* + * Access CPU_CYCLE_FREQUENCY at the outset, which in some + * architectures may trigger a runtime calculation that uses a + * spinlock. Let's do this before lockmetering is turned on. + */ + if (CPU_CYCLE_FREQUENCY == 0) + BUG(); + + lstat_control.hashtab = NULL; + lstat_control.dir = NULL; + for (cpu=0; cpu max_len) + return actual_ret_bcount; + + copy_to_user(buffer, (void *)&req, next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + return actual_ret_bcount; + } else { + /* measurement is off but valid data present */ + /* fetch time info from lstat_control */ + req.ending_time = lstat_control.ending_time; + req.ending_cycles64 = lstat_control.ending_cycles64; + req.enabled_cycles64 = lstat_control.enabled_cycles64; + } + } else { + /* this must be a read while data active--use current time, etc */ + do_gettimeofday(&tv); + req.ending_time = tv.tv_sec; + req.ending_cycles64 = get_cycles64(); + req.enabled_cycles64 = req.ending_cycles64-req.started_cycles64 + + lstat_control.enabled_cycles64; + } + + next_ret_bcount = sizeof(lstat_user_request_t); + if ((actual_ret_bcount + next_ret_bcount) > max_len) + return actual_ret_bcount; + + copy_to_user(buffer, (void *)&req, next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + + if (!lstat_control.counts[0]) /* not initialized? 
*/ + return actual_ret_bcount; + + next_ret_bcount = sizeof(lstat_cpu_counts_t); + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if ((actual_ret_bcount + next_ret_bcount) > max_len) + return actual_ret_bcount; /* leave early */ + copy_to_user(buffer + actual_ret_bcount, lstat_control.counts[cpu], + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + } + + next_ret_bcount = LSTAT_MAX_STAT_INDEX * sizeof(lstat_directory_entry_t); + if ( ((actual_ret_bcount + next_ret_bcount) > max_len) + || !lstat_control.dir ) + return actual_ret_bcount; /* leave early */ + + copy_to_user(buffer + actual_ret_bcount, lstat_control.dir, + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + + next_ret_bcount = sizeof(lstat_read_lock_cpu_counts_t); + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if (actual_ret_bcount + next_ret_bcount > max_len) + return actual_ret_bcount; + copy_to_user(buffer + actual_ret_bcount, lstat_control.read_lock_counts[cpu], + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + } + + return actual_ret_bcount; +} + +/* + * Writing to the /proc lockmeter node enables or disables metering. + * based upon the first byte of the "written" data. + * The following values are defined: + * LSTAT_ON: 1st call: allocates storage, intializes and turns on measurement + * subsequent calls just turn on measurement + * LSTAT_OFF: turns off measurement + * LSTAT_RESET: resets statistics + * LSTAT_RELEASE: releases statistics storage + * + * This allows one to accumulate statistics over several lockstat runs: + * + * lockstat on + * lockstat off + * ...repeat above as desired... + * lockstat get + * ...now start a new set of measurements... + * lockstat reset + * lockstat on + * ... + * + */ +ssize_t put_lockmeter_info(const char *buffer, size_t len) +{ + int error = 0; + int dirsize, countsize, read_lock_countsize, hashsize; + int cpu; + char put_char; + int i, read_lock_blocks, flags; + rwlock_t *lock_ptr; + struct timeval tv; + + if (len <= 0) + return -EINVAL; + + _raw_spin_lock(&lstat_control.control_lock); + + get_user(put_char, buffer); + switch (put_char) { + + case LSTAT_OFF: + if (lstat_control.state != LSTAT_OFF) { + /* + * To avoid seeing read lock hold times in an inconsisent state, + * we have to follow this protocol to turn off statistics + */ + do { local_irq_save(flags); } while(0); + /* getting this lock will stop any read lock block allocations */ + _raw_spin_lock(&lstat_control.directory_lock); + /* keep any more read lock blocks from being allocated */ + lstat_control.state = LSTAT_OFF; + /* record how may read lock blocks there are */ + read_lock_blocks = lstat_control.next_free_read_lock_index; + _raw_spin_unlock(&lstat_control.directory_lock); + /* now go through the list of read locks */ + cpu = THIS_CPU_NUMBER; + for(i=1;i 0) - { + if (panic_timeout > 0) { /* * Delay timeout seconds before rebooting the machine. * We can't use the "normal" timers since we just panicked.. diff -urpN -X /home/fletch/.diff.exclude 001-bk10/kernel/sched.c 900-mjb5/kernel/sched.c --- 001-bk10/kernel/sched.c Sun Mar 16 13:38:21 2003 +++ 900-mjb5/kernel/sched.c Sun Mar 16 18:34:49 2003 @@ -39,6 +39,9 @@ #define cpu_to_node_mask(cpu) (cpu_online_map) #endif +/* used to soft spin in sched while dump is in progress */ +int dump_oncpu; + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -64,16 +67,27 @@ * maximum timeslice is 200 msecs. Timeslices get refilled after * they expire. 
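 * With this patch these compile-time defaults become the min_timeslice,
 * max_timeslice, child_penalty, etc. variables below; they are wired to
 * the new sched_table in kernel/sysctl.c, so they can be adjusted at
 * run time under /proc/sys/sched.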
*/ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (200 * HZ / 1000) -#define CHILD_PENALTY 50 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (10*HZ) -#define STARVATION_LIMIT (10*HZ) -#define NODE_THRESHOLD 125 +int min_timeslice = (10 * HZ) / 1000; +int max_timeslice = (200 * HZ) / 1000; +int child_penalty = 50; +int parent_penalty = 100; +int exit_weight = 3; +int prio_bonus_ratio = 25; +int interactive_delta = 2; +int max_sleep_avg = 10 * HZ; +int starvation_limit = 10 * HZ; +int node_threshold = 125; + +#define MIN_TIMESLICE (min_timeslice) +#define MAX_TIMESLICE (max_timeslice) +#define CHILD_PENALTY (child_penalty) +#define PARENT_PENALTY (parent_penalty) +#define EXIT_WEIGHT (exit_weight) +#define PRIO_BONUS_RATIO (prio_bonus_ratio) +#define INTERACTIVE_DELTA (interactive_delta) +#define MAX_SLEEP_AVG (max_sleep_avg) +#define STARVATION_LIMIT (starvation_limit) +#define NODE_THRESHOLD (node_threshold) /* * If a task is 'interactive' then we reinsert it in the active @@ -230,6 +244,111 @@ __init void node_nr_running_init(void) #endif /* CONFIG_NUMA */ + +struct schedstat { + /* sys_sched_yield stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule stats */ + unsigned long sched_noswitch; + unsigned long sched_switch; + unsigned long sched_cnt; + + /* load_balance stats */ + unsigned long lb_imbalance; + unsigned long lb_idle; + unsigned long lb_busy; + unsigned long lb_resched; + unsigned long lb_cnt; + unsigned long lb_nobusy; + unsigned long lb_bnode; + + /* pull_task stats */ + unsigned long pt_gained; + unsigned long pt_lost; + unsigned long pt_node_gained; + unsigned long pt_node_lost; + + /* balance_node stats */ + unsigned long bn_cnt; + unsigned long bn_idle; +} ____cacheline_aligned; + +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 2 + +struct schedstat schedstats[NR_CPUS]; + +/* + * This could conceivably exceed a page's worth of output on machines with + * large number of cpus, where large == about 4096/100 or 40ish. Start + * worrying when we pass 32, probably. Then this has to stop being a + * "simple" entry in proc/proc_misc.c and needs to be an actual seq_file. 
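 * For reference, each per-cpu line in the version 2 format emitted below
 * carries, in order: yld_both_empty yld_act_empty yld_exp_empty yld_cnt
 * sched_noswitch sched_switch sched_cnt lb_idle lb_busy lb_resched lb_cnt
 * lb_imbalance lb_nobusy lb_bnode pt_gained pt_lost pt_node_gained
 * pt_node_lost bn_cnt bn_idle; the trailing "totals" line sums the same
 * fields in the same order.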
+ */ +int schedstats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct schedstat sums; + int i, len; + + memset(&sums, 0, sizeof(sums)); + len = sprintf(page, "version %d\n", SCHEDSTAT_VERSION); + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) continue; + sums.yld_exp_empty += schedstats[i].yld_exp_empty; + sums.yld_act_empty += schedstats[i].yld_act_empty; + sums.yld_both_empty += schedstats[i].yld_both_empty; + sums.yld_cnt += schedstats[i].yld_cnt; + sums.sched_noswitch += schedstats[i].sched_noswitch; + sums.sched_switch += schedstats[i].sched_switch; + sums.sched_cnt += schedstats[i].sched_cnt; + sums.lb_idle += schedstats[i].lb_idle; + sums.lb_busy += schedstats[i].lb_busy; + sums.lb_resched += schedstats[i].lb_resched; + sums.lb_cnt += schedstats[i].lb_cnt; + sums.lb_imbalance += schedstats[i].lb_imbalance; + sums.lb_nobusy += schedstats[i].lb_nobusy; + sums.lb_bnode += schedstats[i].lb_bnode; + sums.pt_node_gained += schedstats[i].pt_node_gained; + sums.pt_node_lost += schedstats[i].pt_node_lost; + sums.pt_gained += schedstats[i].pt_gained; + sums.pt_lost += schedstats[i].pt_lost; + sums.bn_cnt += schedstats[i].bn_cnt; + sums.bn_idle += schedstats[i].bn_idle; + len += sprintf(page + len, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu " + "%lu %lu %lu %lu %lu %lu %lu %lu\n", + i, schedstats[i].yld_both_empty, + schedstats[i].yld_act_empty, schedstats[i].yld_exp_empty, + schedstats[i].yld_cnt, schedstats[i].sched_noswitch, + schedstats[i].sched_switch, schedstats[i].sched_cnt, + schedstats[i].lb_idle, schedstats[i].lb_busy, + schedstats[i].lb_resched, + schedstats[i].lb_cnt, schedstats[i].lb_imbalance, + schedstats[i].lb_nobusy, schedstats[i].lb_bnode, + schedstats[i].pt_gained, schedstats[i].pt_lost, + schedstats[i].pt_node_gained, schedstats[i].pt_node_lost, + schedstats[i].bn_cnt, schedstats[i].bn_idle); + } + len += sprintf(page + len, + "totals %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu " + "%lu %lu %lu %lu %lu %lu %lu\n", + sums.yld_both_empty, sums.yld_act_empty, sums.yld_exp_empty, + sums.yld_cnt, sums.sched_noswitch, sums.sched_switch, + sums.sched_cnt, sums.lb_idle, sums.lb_busy, sums.lb_resched, + sums.lb_cnt, sums.lb_imbalance, sums.lb_nobusy, sums.lb_bnode, + sums.pt_gained, sums.pt_lost, sums.pt_node_gained, + sums.pt_node_lost, sums.bn_cnt, sums.bn_idle); + + return len; +} + /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. 
Note the ordering: we can safely lookup the task_rq without @@ -342,10 +461,10 @@ static inline void __activate_task(task_ */ static inline int activate_task(task_t *p, runqueue_t *rq) { - unsigned long sleep_time = jiffies - p->last_run; + long sleep_time = jiffies - p->last_run - 1; int requeue_waker = 0; - if (sleep_time) { + if (sleep_time > 0) { int sleep_avg; /* @@ -690,7 +809,6 @@ static inline task_t * context_switch(ru return prev; } - /* * nr_running, nr_uninterruptible and nr_context_switches: * @@ -708,6 +826,11 @@ unsigned long nr_running(void) return sum; } +unsigned long nr_running_cpu(int cpu) +{ + return cpu_rq(cpu)->nr_running; +} + unsigned long nr_uninterruptible(void) { unsigned long i, sum = 0; @@ -874,6 +997,9 @@ static int find_busiest_node(int this_no #endif /* CONFIG_NUMA */ +int idle_node_rebalance_ratio = 10; +int busy_node_rebalance_ratio = 100; + #if CONFIG_SMP /* @@ -985,6 +1111,12 @@ out: */ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu) { + if (cpu_to_node(this_cpu) != cpu_to_node(src_rq - runqueues)) { + schedstats[this_cpu].pt_node_gained++; + schedstats[src_rq - runqueues].pt_node_lost++; + } + schedstats[this_cpu].pt_gained++; + schedstats[src_rq - runqueues].pt_lost++; dequeue_task(p, src_array); nr_running_dec(src_rq); set_task_cpu(p, this_cpu); @@ -1019,10 +1151,14 @@ static void load_balance(runqueue_t *thi struct list_head *head, *curr; task_t *tmp; + schedstats[this_cpu].lb_cnt++; busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask); - if (!busiest) + if (!busiest) { + schedstats[this_cpu].lb_nobusy++; goto out; + } + schedstats[this_cpu].lb_imbalance += imbalance; /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to @@ -1101,8 +1237,9 @@ out: */ #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) #define BUSY_REBALANCE_TICK (HZ/5 ?: 1) -#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5) -#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 100) + +#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * idle_node_rebalance_ratio) +#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * busy_node_rebalance_ratio) #if CONFIG_NUMA static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) @@ -1110,9 +1247,13 @@ static void balance_node(runqueue_t *thi int node = find_busiest_node(cpu_to_node(this_cpu)); unsigned long cpumask, this_cpumask = 1UL << this_cpu; + schedstats[this_cpu].bn_cnt++; + if (idle) + schedstats[this_cpu].bn_idle++; if (node >= 0) { cpumask = node_to_cpumask(node) | this_cpumask; spin_lock(&this_rq->lock); + schedstats[this_cpu].lb_bnode++; load_balance(this_rq, idle, cpumask); spin_unlock(&this_rq->lock); } @@ -1121,9 +1262,7 @@ static void balance_node(runqueue_t *thi static void rebalance_tick(runqueue_t *this_rq, int idle) { -#if CONFIG_NUMA int this_cpu = smp_processor_id(); -#endif unsigned long j = jiffies; /* @@ -1141,6 +1280,7 @@ static void rebalance_tick(runqueue_t *t #endif if (!(j % IDLE_REBALANCE_TICK)) { spin_lock(&this_rq->lock); + schedstats[this_cpu].lb_idle++; load_balance(this_rq, 0, cpu_to_node_mask(this_cpu)); spin_unlock(&this_rq->lock); } @@ -1152,6 +1292,7 @@ static void rebalance_tick(runqueue_t *t #endif if (!(j % BUSY_REBALANCE_TICK)) { spin_lock(&this_rq->lock); + schedstats[this_cpu].lb_busy++; load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); spin_unlock(&this_rq->lock); } @@ -1270,19 +1411,32 @@ void 
scheduling_functions_start_here(voi /* * schedule() is the main scheduler function. */ +#ifdef CONFIG_KGDB_THREAD +asmlinkage void do_schedule(void) +#else asmlinkage void schedule(void) +#endif { task_t *prev, *next; runqueue_t *rq; prio_array_t *array; struct list_head *queue; - int idx; + int idx, mycpu = smp_processor_id(); + + /* + * If crash dump is in progress, this other cpu's + * need to wait until it completes. + * NB: this code is optimized away for kernels without dumping enabled. + */ + if (unlikely(dump_oncpu)) + goto dump_scheduling_disabled; /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ + schedstats[mycpu].sched_cnt++; if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { if (unlikely(in_atomic())) { printk(KERN_ERR "bad: scheduling while atomic!\n"); @@ -1321,6 +1475,7 @@ need_resched: pick_next_task: if (unlikely(!rq->nr_running)) { #if CONFIG_SMP + schedstats[mycpu].lb_resched++; load_balance(rq, 1, cpu_to_node_mask(smp_processor_id())); if (rq->nr_running) goto pick_next_task; @@ -1335,11 +1490,13 @@ pick_next_task: /* * Switch the active and expired arrays. */ + schedstats[mycpu].sched_switch++; rq->active = rq->expired; rq->expired = array; array = rq->active; rq->expired_timestamp = 0; } + schedstats[mycpu].sched_noswitch++; idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; @@ -1366,6 +1523,16 @@ switch_tasks: preempt_enable_no_resched(); if (test_thread_flag(TIF_NEED_RESCHED)) goto need_resched; + + return; + + dump_scheduling_disabled: + /* allow scheduling only if this is the dumping cpu */ + if (dump_oncpu != smp_processor_id()+1) { + while (dump_oncpu) + cpu_relax(); + } + return; } #ifdef CONFIG_PREEMPT @@ -1504,6 +1671,20 @@ void complete_all(struct completion *x) spin_unlock_irqrestore(&x->wait.lock, flags); } +#ifdef CONFIG_KGDB_THREAD +asmlinkage void user_schedule(void) +{ + current->thread.kgdbregs = NULL; + do_schedule(); +} + +asmlinkage void kern_do_schedule(struct pt_regs regs) +{ + current->thread.kgdbregs = ®s; + do_schedule(); +} +#endif + void wait_for_completion(struct completion *x) { might_sleep(); @@ -1996,6 +2177,7 @@ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); prio_array_t *array = current->array; + int mycpu = smp_processor_id(); /* * We implement yielding by moving the task into the expired @@ -2004,7 +2186,15 @@ asmlinkage long sys_sched_yield(void) * (special rule: RT tasks will just roundrobin in the active * array.) 
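 * The yld_* schedstats counters updated below record, for each yield,
 * whether the yielding task was the only one on the active array and
 * whether the expired array was empty at the time.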
*/ + schedstats[mycpu].yld_cnt++; if (likely(!rt_task(current))) { + if (current->array->nr_active == 1) { + schedstats[mycpu].yld_act_empty++; + if (!rq->expired->nr_active) + schedstats[mycpu].yld_both_empty++; + } else if (!rq->expired->nr_active) { + schedstats[mycpu].yld_exp_empty++; + } dequeue_task(current, array); enqueue_task(current, rq->expired); } else { diff -urpN -X /home/fletch/.diff.exclude 001-bk10/kernel/sysctl.c 900-mjb5/kernel/sysctl.c --- 001-bk10/kernel/sysctl.c Mon Dec 16 21:50:51 2002 +++ 900-mjb5/kernel/sysctl.c Sun Mar 16 13:38:53 2003 @@ -55,6 +55,18 @@ extern char core_pattern[]; extern int cad_pid; extern int pid_max; extern int sysctl_lower_zone_protection; +extern int min_timeslice; +extern int max_timeslice; +extern int child_penalty; +extern int parent_penalty; +extern int exit_weight; +extern int prio_bonus_ratio; +extern int interactive_delta; +extern int max_sleep_avg; +extern int starvation_limit; +extern int node_threshold; +extern int idle_node_rebalance_ratio; +extern int busy_node_rebalance_ratio; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -112,6 +124,7 @@ static struct ctl_table_header root_tabl static ctl_table kern_table[]; static ctl_table vm_table[]; +static ctl_table sched_table[]; #ifdef CONFIG_NET extern ctl_table net_table[]; #endif @@ -156,6 +169,7 @@ static ctl_table root_table[] = { {CTL_FS, "fs", NULL, 0, 0555, fs_table}, {CTL_DEBUG, "debug", NULL, 0, 0555, debug_table}, {CTL_DEV, "dev", NULL, 0, 0555, dev_table}, + {CTL_SCHED, "sched", NULL, 0, 0555, sched_table}, {0} }; @@ -358,7 +372,49 @@ static ctl_table debug_table[] = { static ctl_table dev_table[] = { {0} -}; +}; + +static ctl_table sched_table[] = { + {SCHED_MAX_TIMESLICE, "max_timeslice", &max_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_MIN_TIMESLICE, "min_timeslice", &min_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_CHILD_PENALTY, "child_penalty", &child_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PARENT_PENALTY, "parent_penalty", &parent_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_EXIT_WEIGHT, "exit_weight", &exit_weight, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PRIO_BONUS_RATIO, "prio_bonus_ratio", &prio_bonus_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_INTERACTIVE_DELTA, "interactive_delta", &interactive_delta, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_MAX_SLEEP_AVG, "max_sleep_avg", &max_sleep_avg, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_STARVATION_LIMIT, "starvation_limit", &starvation_limit, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_NODE_THRESHOLD, "node_threshold", &node_threshold, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + sysctl_intvec, NULL, &one, NULL}, + {SCHED_IDLE_NODE_REBALANCE_RATIO, "idle_node_rebalance_ratio", + &idle_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_BUSY_NODE_REBALANCE_RATIO, "busy_node_rebalance_ratio", + &busy_node_rebalance_ratio, + sizeof(int), 0644, NULL, 
&proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {0} +}; extern void init_irq_proc (void); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/kernel/timer.c 900-mjb5/kernel/timer.c --- 001-bk10/kernel/timer.c Wed Mar 5 07:37:08 2003 +++ 900-mjb5/kernel/timer.c Sun Mar 16 13:39:03 2003 @@ -736,6 +736,8 @@ static unsigned long count_active_tasks( * Requires xtime_lock to access. */ unsigned long avenrun[3]; +unsigned long tasks_running[3]; +unsigned long cpu_tasks_running[3][NR_CPUS]; /* * calc_load - given tick count, update the avenrun load estimates. @@ -743,8 +745,9 @@ unsigned long avenrun[3]; */ static inline void calc_load(unsigned long ticks) { - unsigned long active_tasks; /* fixed-point */ + unsigned long active_tasks, running_tasks; /* fixed-point */ static int count = LOAD_FREQ; + int cpu; count -= ticks; if (count < 0) { @@ -753,6 +756,19 @@ static inline void calc_load(unsigned lo CALC_LOAD(avenrun[0], EXP_1, active_tasks); CALC_LOAD(avenrun[1], EXP_5, active_tasks); CALC_LOAD(avenrun[2], EXP_15, active_tasks); + running_tasks = nr_running() * FIXED_1; + CALC_LOAD(tasks_running[0], EXP_1, running_tasks); + CALC_LOAD(tasks_running[1], EXP_5, running_tasks); + CALC_LOAD(tasks_running[2], EXP_15, running_tasks); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!cpu_online(cpu)) + continue; + running_tasks = nr_running_cpu(cpu) * FIXED_1; + CALC_LOAD(cpu_tasks_running[0][cpu], EXP_1, running_tasks); + CALC_LOAD(cpu_tasks_running[1][cpu], EXP_5, running_tasks); + CALC_LOAD(cpu_tasks_running[2][cpu], EXP_15, running_tasks); + } + } } diff -urpN -X /home/fletch/.diff.exclude 001-bk10/lib/Kconfig 900-mjb5/lib/Kconfig --- 001-bk10/lib/Kconfig Sun Nov 17 20:29:47 2002 +++ 900-mjb5/lib/Kconfig Sun Mar 16 13:39:02 2003 @@ -17,13 +17,13 @@ config CRC32 # config ZLIB_INFLATE tristate - default y if CRAMFS=y || PPP_DEFLATE=y || JFFS2_FS=y || ZISOFS_FS=y || BINFMT_ZFLAT=y - default m if CRAMFS=m || PPP_DEFLATE=m || JFFS2_FS=m || ZISOFS_FS=m || BINFMT_ZFLAT=m + default y if CRAMFS=y || PPP_DEFLATE=y || JFFS2_FS=y || ZISOFS_FS=y || BINFMT_ZFLAT=y || CRASH_DUMP_COMPRESS_GZIP=y + default m if CRAMFS=m || PPP_DEFLATE=m || JFFS2_FS=m || ZISOFS_FS=m || BINFMT_ZFLAT=m || CRASH_DUMP_COMPRESS_GZIP=m config ZLIB_DEFLATE tristate - default m if PPP_DEFLATE!=y && JFFS2_FS!=y && (PPP_DEFLATE=m || JFFS2_FS=m) - default y if PPP_DEFLATE=y || JFFS2_FS=y + default m if PPP_DEFLATE!=y && JFFS2_FS!=y && (PPP_DEFLATE=m || JFFS2_FS=m) || CRASH_DUMP_COMPRESS_GZIP=m + default y if PPP_DEFLATE=y || JFFS2_FS=y || CRASH_DUMP_COMPRESS_GZIP=y endmenu diff -urpN -X /home/fletch/.diff.exclude 001-bk10/mm/Makefile 900-mjb5/mm/Makefile --- 001-bk10/mm/Makefile Thu Feb 13 11:08:15 2003 +++ 900-mjb5/mm/Makefile Sun Mar 16 13:39:06 2003 @@ -12,3 +12,5 @@ obj-y := bootmem.o filemap.o mempool.o slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o + +obj-$(CONFIG_SHAREPTE) += ptshare.o diff -urpN -X /home/fletch/.diff.exclude 001-bk10/mm/fremap.c 900-mjb5/mm/fremap.c --- 001-bk10/mm/fremap.c Sun Mar 16 13:38:21 2003 +++ 900-mjb5/mm/fremap.c Sun Mar 16 13:39:06 2003 @@ -12,11 +12,13 @@ #include #include #include +#include #include #include #include -static inline void zap_pte(struct mm_struct *mm, pte_t *ptep) +static inline void +zap_pte(struct mm_struct *mm, struct ptpage *ptepage, pte_t *ptep) { pte_t pte = *ptep; @@ -33,12 +35,13 @@ static inline void zap_pte(struct mm_str set_page_dirty(page); page_remove_rmap(page, ptep); page_cache_release(page); - mm->rss--; + 
decrement_rss(ptepage); } } } else { free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear(ptep); + decrement_swapcount(ptepage); } } @@ -50,6 +53,7 @@ int install_page(struct mm_struct *mm, s unsigned long addr, struct page *page, unsigned long prot) { int err = -ENOMEM; + struct ptpage *ptepage; pte_t *pte, entry; pgd_t *pgd; pmd_t *pmd; @@ -58,20 +62,20 @@ int install_page(struct mm_struct *mm, s pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) goto err; - pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); + pgd = pgd_offset(mm, addr); pmd = pmd_alloc(mm, pgd, addr); if (!pmd) goto err_unlock; - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_unshare(mm, pmd, addr); if (!pte) goto err_unlock; - zap_pte(mm, pte); + ptepage = pmd_ptpage(*pmd); + zap_pte(mm, ptepage, pte); - mm->rss++; flush_page_to_ram(page); flush_icache_page(vma, page); entry = mk_pte(page, protection_map[prot]); @@ -80,8 +84,10 @@ int install_page(struct mm_struct *mm, s set_pte(pte, entry); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); + increment_rss(ptepage); flush_tlb_page(vma, addr); + pte_page_unlock(ptepage); spin_unlock(&mm->page_table_lock); pte_chain_free(pte_chain); return 0; @@ -137,6 +143,11 @@ int sys_remap_file_pages(unsigned long s vma->vm_ops && vma->vm_ops->populate && end > start && start >= vma->vm_start && end <= vma->vm_end) { + vma->vm_flags |= VM_NONLINEAR; + + /* Unshare all the pte pages in the entire vma range */ + unshare_page_range(mm, vma->vm_start, vma->vm_end); + /* * Change the default protection to PROT_NONE: */ diff -urpN -X /home/fletch/.diff.exclude 001-bk10/mm/memory.c 900-mjb5/mm/memory.c --- 001-bk10/mm/memory.c Sun Mar 16 13:38:21 2003 +++ 900-mjb5/mm/memory.c Sun Mar 16 13:39:06 2003 @@ -36,6 +36,20 @@ * (Gerhard.Wichert@pdb.siemens.de) */ +/* + * A note on locking of the page table structure: + * + * The top level lock that protects the page table is the + * mm->page_table_lock. This lock protects the pgd and pmd layer. + * However, with the advent of shared pte pages, this lock is not + * sufficient. The pte layer is now protected by the pte_page_lock, + * set in the struct page of the pte page. Note that with this + * locking scheme, once the pgd and pmd layers have been set in the + * page fault path and the pte_page_lock has been taken, the + * page_table_lock can be released. + * + */ + #include #include #include @@ -45,6 +59,7 @@ #include #include #include +#include #include #include @@ -78,67 +93,10 @@ static inline void copy_cow_page(struct copy_user_highpage(to, from, address); } -/* - * Note: this doesn't free the actual pages themselves. That - * has been handled earlier when unmapping all the memory regions. - */ -static inline void free_one_pmd(struct mmu_gather *tlb, pmd_t * dir) -{ - struct page *page; - - if (pmd_none(*dir)) - return; - if (pmd_bad(*dir)) { - pmd_ERROR(*dir); - pmd_clear(dir); - return; - } - page = pmd_page(*dir); - pmd_clear(dir); - pgtable_remove_rmap(page); - pte_free_tlb(tlb, page); -} - -static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) -{ - int j; - pmd_t * pmd; - - if (pgd_none(*dir)) - return; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); - return; - } - pmd = pmd_offset(dir, 0); - pgd_clear(dir); - for (j = 0; j < PTRS_PER_PMD ; j++) - free_one_pmd(tlb, pmd+j); - pmd_free_tlb(tlb, pmd); -} - -/* - * This function clears all user-level page tables of a process - this - * is needed by execve(), so that old pages aren't in the way. 
- * - * Must be called with pagetable lock held. - */ -void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr) -{ - pgd_t * page_dir = tlb->mm->pgd; - - page_dir += first; - do { - free_one_pgd(tlb, page_dir); - page_dir++; - } while (--nr); -} - pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { if (!pmd_present(*pmd)) { - struct page *new; + struct ptpage *new; spin_unlock(&mm->page_table_lock); new = pte_alloc_one(mm, address); @@ -154,8 +112,12 @@ pte_t * pte_alloc_map(struct mm_struct * pte_free(new); goto out; } + SetPagePtepage(new); pgtable_add_rmap(new, mm, address); pmd_populate(mm, pmd, new); + atomic_inc(&mm->ptepages); + inc_page_state(nr_page_table_pages); + init_MUTEX(&new->sem); } out: return pte_offset_map(pmd, address); @@ -180,7 +142,6 @@ pte_t * pte_alloc_kernel(struct mm_struc pte_free_kernel(new); goto out; } - pgtable_add_rmap(virt_to_page(new), mm, address); pmd_populate_kernel(mm, pmd, new); } out: @@ -249,6 +210,7 @@ skip_copy_pmd_range: address = (address goto nomem; do { + struct ptpage *src_page, *dst_page; pte_t * src_pte, * dst_pte; /* copy_pte_range */ @@ -268,7 +230,10 @@ skip_copy_pte_range: dst_pte = pte_alloc_map(dst, dst_pmd, address); if (!dst_pte) goto nomem; - spin_lock(&src->page_table_lock); + spin_lock(&src->page_table_lock); + src_page = pmd_ptpage(*src_pmd); + dst_page = pmd_ptpage(*dst_pmd); + pte_page_lock(src_page); src_pte = pte_offset_map_nested(src_pmd, address); do { pte_t pte = *src_pte; @@ -283,6 +248,7 @@ skip_copy_pte_range: if (!pte_present(pte)) { swap_duplicate(pte_to_swp_entry(pte)); set_pte(dst_pte, pte); + increment_swapcount(dst_page); goto cont_copy_pte_range_noset; } pfn = pte_pfn(pte); @@ -316,7 +282,7 @@ skip_copy_pte_range: pte = pte_mkclean(pte); pte = pte_mkold(pte); get_page(page); - dst->rss++; + increment_rss(dst_page); cont_copy_pte_range: set_pte(dst_pte, pte); @@ -332,6 +298,7 @@ cont_copy_pte_range: * pte_chain allocation failed, and we need to * run page reclaim. 
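 * With this patch the pte_page_lock on the source page table page is
 * dropped here as well, and retaken (after re-deriving src_page from the
 * pmd) once the pte_chain allocation has succeeded.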
*/ + pte_page_unlock(src_page); pte_unmap_nested(src_pte); pte_unmap(dst_pte); spin_unlock(&src->page_table_lock); @@ -341,12 +308,15 @@ cont_copy_pte_range: if (!pte_chain) goto nomem; spin_lock(&src->page_table_lock); + src_page = pmd_ptpage(*src_pmd); + pte_page_lock(src_page); dst_pte = pte_offset_map(dst_pmd, address); src_pte = pte_offset_map_nested(src_pmd, address); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) { + pte_page_unlock(src_page); pte_unmap_nested(src_pte); pte_unmap(dst_pte); goto out_unlock; @@ -354,6 +324,7 @@ cont_copy_pte_range_noset: src_pte++; dst_pte++; } while ((unsigned long)src_pte & PTE_TABLE_MASK); + pte_page_unlock(src_page); pte_unmap_nested(src_pte-1); pte_unmap(dst_pte-1); spin_unlock(&src->page_table_lock); @@ -379,19 +350,15 @@ zap_pte_range(struct mmu_gather *tlb, pm { unsigned long offset; pte_t *ptep; + struct ptpage *ptepage = pmd_ptpage(*pmd); - if (pmd_none(*pmd)) - return; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return; - } - ptep = pte_offset_map(pmd, address); offset = address & ~PMD_MASK; if (offset + size > PMD_SIZE) size = PMD_SIZE - offset; size &= PAGE_MASK; + + ptep = pte_offset_map(pmd, address); + for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { pte_t pte = *ptep; if (pte_none(pte)) @@ -411,19 +378,30 @@ zap_pte_range(struct mmu_gather *tlb, pm mark_page_accessed(page); tlb->freed++; page_remove_rmap(page, ptep); + decrement_rss(ptepage); tlb_remove_page(tlb, page); } } } else { free_swap_and_cache(pte_to_swp_entry(pte)); + decrement_swapcount(ptepage); pte_clear(ptep); } + if (!ptepage->mapcount && !ptepage->swapcount) { + pmd_clear(pmd); + pgtable_remove_rmap_locked(ptepage, tlb->mm); + atomic_dec(&tlb->mm->ptepages); + dec_page_state(nr_page_table_pages); + clear_pte_page(ptepage); + pte_free_tlb(tlb, ptepage); + break; + } } pte_unmap(ptep-1); } static void -zap_pmd_range(struct mmu_gather *tlb, pgd_t * dir, +zap_pmd_range(struct mmu_gather **tlb, pgd_t * dir, unsigned long address, unsigned long size) { pmd_t * pmd; @@ -441,13 +419,27 @@ zap_pmd_range(struct mmu_gather *tlb, pg if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) end = ((address + PGDIR_SIZE) & PGDIR_MASK); do { - zap_pte_range(tlb, pmd, address, end - address); + if (pmd_none(*pmd)) + goto skip_pmd; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + goto skip_pmd; + } + + if (zap_shared_range(tlb, pmd, address, end)) { + struct ptpage *ptepage = pmd_ptpage(*pmd); + zap_pte_range(*tlb, pmd, address, end - address); + pte_page_unlock(ptepage); + } +skip_pmd: address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); } -void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, +void +unmap_page_range(struct mmu_gather **tlb, struct vm_area_struct *vma, unsigned long address, unsigned long end) { pgd_t * dir; @@ -460,13 +452,13 @@ void unmap_page_range(struct mmu_gather BUG_ON(address >= end); dir = pgd_offset(vma->vm_mm, address); - tlb_start_vma(tlb, vma); + tlb_start_vma(*tlb, vma); do { zap_pmd_range(tlb, dir, address, end - address); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); - tlb_end_vma(tlb, vma); + tlb_end_vma(*tlb, vma); } /* Dispose of an entire struct mmu_gather per rescheduling point */ @@ -561,7 +553,7 @@ int unmap_vmas(struct mmu_gather **tlbp, tlb_start_valid = 1; } - unmap_page_range(*tlbp, vma, start, start + block); + unmap_page_range(tlbp, vma, start, start + block); start += block; zap_bytes -= block; if 
(zap_bytes != 0) @@ -611,6 +603,179 @@ void zap_page_range(struct vm_area_struc spin_unlock(&mm->page_table_lock); } +/** + * unmap_all_pages - unmap all the pages for an mm_struct + * @mm: the mm_struct to unmap + * + * This function is only called when an mm_struct is about to be + * released. It walks through all vmas and removes their pages + * from the page table. It understands shared pte pages and will + * decrement the count appropriately. + */ +void unmap_all_pages(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + struct ptpage *ptepage; + struct page *pagevec[16]; + int npages = 0; + unsigned long address; + unsigned long vm_end, pmd_end, pte_end; + + lru_add_drain(); + + vma = mm->mmap; + + /* On the off chance that the first vma is hugetlb... */ + if (is_vm_hugetlb_page(vma)) { + unmap_hugepage_range(vma, vma->vm_start, vma->vm_end); + vma = vma->vm_next; + mm->map_count--; + } + + for (;;) { + if (!vma) + goto out; + + address = vma->vm_start; +next_vma: + vm_end = vma->vm_end; + mm->map_count--; + /* + * Advance the vma pointer to the next vma. + * To facilitate coalescing adjacent vmas, the + * pointer always points to the next one + * beyond the range we're currently working + * on, which means vma will be null on the + * last iteration. + */ + vma = vma->vm_next; + if (vma) { + /* + * Go ahead and include hugetlb vmas + * in the range we process. The pmd + * entry will be cleared by close, so + * we'll just skip over them. This is + * easier than trying to avoid them. + */ + if (is_vm_hugetlb_page(vma)) + unmap_hugepage_range(vma, vma->vm_start, vma->vm_end); + + /* + * Coalesce adjacent vmas and process + * them all in one iteration. + */ + if (vma->vm_start == vm_end) { + goto next_vma; + } + } + pgd = pgd_offset(mm, address); + do { + if (pgd_none(*pgd)) + goto skip_pgd; + + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); +skip_pgd: + address = (address + PGDIR_SIZE) & PGDIR_MASK; + if (address > vm_end) + address = vm_end; + goto next_pgd; + } + pmd = pmd_offset(pgd, address); + if (vm_end > ((address + PGDIR_SIZE) & PGDIR_MASK)) + pmd_end = (address + PGDIR_SIZE) & PGDIR_MASK; + else + pmd_end = vm_end; + + do { + if (pmd_none(*pmd)) + goto skip_pmd; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); +skip_pmd: + address = (address + PMD_SIZE) & PMD_MASK; + if (address > pmd_end) + address = pmd_end; + goto next_pmd; + } + if (!zap_shared_pmd(mm, pmd)) + goto skip_pmd; + + ptepage = pmd_ptpage(*pmd); + pte = pte_offset_map(pmd, address); + if (pmd_end > ((address + PMD_SIZE) & PMD_MASK)) + pte_end = (address + PMD_SIZE) & PMD_MASK; + else + pte_end = pmd_end; + do { + pte_t pteval = *pte; + + if (pte_none(pteval)) + goto next_pte; + if (pte_present(pteval)) { + unsigned long pfn = pte_pfn(pteval); + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + if (!PageReserved(page)) { + if (pte_dirty(pteval)) + set_page_dirty(page); + if (page->mapping && + pte_young(pteval) && + !PageSwapCache(page)) + mark_page_accessed(page); + page_remove_rmap(page, pte); + decrement_rss(ptepage); + pagevec[npages++] = page; + if (npages == 16) { + free_pages_and_swap_cache(pagevec, npages); + npages = 0; + } + + } + } + } else { + free_swap_and_cache(pte_to_swp_entry(pteval)); + decrement_swapcount(ptepage); + } + pte_clear(pte); + if (!ptepage->mapcount && !ptepage->swapcount) { + pmd_clear(pmd); + pgtable_remove_rmap(ptepage, mm); + atomic_dec(&mm->ptepages); + dec_page_state(nr_page_table_pages); + 
clear_pte_page(ptepage); + pte_free(ptepage); + address = pte_end; + break; + } +next_pte: + address += PAGE_SIZE; + pte++; + } while (address < pte_end); + pte_unmap(pte-1); +next_pmd: + pmd++; + } while (address < pmd_end); +next_pgd: + pgd++; + } while (address < vm_end); + } + +out: + if (npages) + free_pages_and_swap_cache(pagevec, npages); + + if (atomic_read(&mm->ptepages) != 0) + BUG(); + + flush_tlb_mm(mm); +} + /* * Do a quick page-table lookup for a single page. * mm->page_table_lock must be held. @@ -854,11 +1019,14 @@ static inline int remap_pmd_range(struct end = PGDIR_SIZE; phys_addr -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, base + address); + pte_t *pte; + pte = pte_alloc_unshare(mm, pmd, base + address); if (!pte) return -ENOMEM; + remap_pte_range(pte, base + address, end - address, address + phys_addr, prot); pte_unmap(pte); + pte_page_unlock(pmd_ptpage(*pmd)); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -948,6 +1116,7 @@ static int do_wp_page(struct mm_struct * unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) { struct page *old_page, *new_page; + struct ptpage *ptepage = pmd_ptpage(*pmd); unsigned long pfn = pte_pfn(pte); struct pte_chain *pte_chain = NULL; int ret; @@ -983,7 +1152,7 @@ static int do_wp_page(struct mm_struct * * Ok, we need to copy. Oh, well.. */ page_cache_get(old_page); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) @@ -996,13 +1165,14 @@ static int do_wp_page(struct mm_struct * /* * Re-check the pte - we dropped the lock */ - spin_lock(&mm->page_table_lock); + ptepage = pte_page_lock_pmd(mm, pmd); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, pte)) { - if (PageReserved(old_page)) - ++mm->rss; + if (PageReserved(old_page)) + increment_rss(ptepage); page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + SetPageAnon(new_page); pte_chain = page_add_rmap(new_page, page_table, pte_chain); lru_cache_add_active(new_page); @@ -1020,7 +1190,7 @@ no_mem: oom: ret = VM_FAULT_OOM; out: - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); pte_chain_free(pte_chain); return ret; } @@ -1142,13 +1312,14 @@ static int do_swap_page(struct mm_struct pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) { struct page *page; + struct ptpage *ptepage = pmd_ptpage(*pmd); swp_entry_t entry = pte_to_swp_entry(orig_pte); pte_t pte; int ret = VM_FAULT_MINOR; struct pte_chain *pte_chain = NULL; pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry); @@ -1158,14 +1329,14 @@ static int do_swap_page(struct mm_struct * Back out if somebody else faulted in this pte while * we released the page table lock. */ - spin_lock(&mm->page_table_lock); + ptepage = pte_page_lock_pmd(mm, pmd); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, orig_pte)) ret = VM_FAULT_OOM; else ret = VM_FAULT_MINOR; pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); goto out; } @@ -1186,11 +1357,11 @@ static int do_swap_page(struct mm_struct * Back out if somebody else faulted in this pte while we * released the page table lock. 
*/ - spin_lock(&mm->page_table_lock); + ptepage = pte_page_lock_pmd(mm, pmd); page_table = pte_offset_map(pmd, address); if (!pte_same(*page_table, orig_pte)) { pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); unlock_page(page); page_cache_release(page); ret = VM_FAULT_MINOR; @@ -1203,7 +1374,6 @@ static int do_swap_page(struct mm_struct if (vm_swap_full()) remove_exclusive_swap_page(page); - mm->rss++; pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) pte = pte_mkdirty(pte_mkwrite(pte)); @@ -1212,12 +1382,15 @@ static int do_swap_page(struct mm_struct flush_page_to_ram(page); flush_icache_page(vma, page); set_pte(page_table, pte); + SetPageAnon(page); pte_chain = page_add_rmap(page, page_table, pte_chain); + increment_rss(ptepage); + decrement_swapcount(ptepage); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); out: pte_chain_free(pte_chain); return ret; @@ -1235,20 +1408,10 @@ do_anonymous_page(struct mm_struct *mm, { pte_t entry; struct page * page = ZERO_PAGE(addr); - struct pte_chain *pte_chain; + struct ptpage *ptepage = pmd_ptpage(*pmd); + struct pte_chain *pte_chain = NULL; int ret; - pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (!pte_chain) { - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_mem; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); - } - /* Read-only mapping of ZERO_PAGE. */ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); @@ -1256,43 +1419,48 @@ do_anonymous_page(struct mm_struct *mm, if (write_access) { /* Allocate our own private page. 
*/ pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); + + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + ret = VM_FAULT_OOM; + goto out; + } page = alloc_page(GFP_HIGHUSER); - if (!page) - goto no_mem; + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } clear_user_highpage(page, addr); - spin_lock(&mm->page_table_lock); + ptepage = pte_page_lock_pmd(mm, pmd); page_table = pte_offset_map(pmd, addr); if (!pte_none(*page_table)) { pte_unmap(page_table); page_cache_release(page); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); ret = VM_FAULT_MINOR; goto out; } - mm->rss++; flush_page_to_ram(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add_active(page); mark_page_accessed(page); + SetPageAnon(page); + pte_chain = page_add_rmap(page, page_table, pte_chain); + increment_rss(ptepage); } set_pte(page_table, entry); - /* ignores ZERO_PAGE */ - pte_chain = page_add_rmap(page, page_table, pte_chain); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); ret = VM_FAULT_MINOR; - goto out; -no_mem: - ret = VM_FAULT_OOM; out: pte_chain_free(pte_chain); return ret; @@ -1315,6 +1483,7 @@ do_no_page(struct mm_struct *mm, struct unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) { struct page * new_page; + struct ptpage *ptepage = pmd_ptpage(*pmd); pte_t entry; struct pte_chain *pte_chain; int ret; @@ -1323,7 +1492,7 @@ do_no_page(struct mm_struct *mm, struct return do_anonymous_page(mm, vma, page_table, pmd, write_access, address); pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); @@ -1349,10 +1518,11 @@ do_no_page(struct mm_struct *mm, struct copy_user_highpage(page, new_page, address); page_cache_release(new_page); lru_cache_add_active(page); + SetPageAnon(page); new_page = page; } - spin_lock(&mm->page_table_lock); + pte_page_lock_pmd(mm, pmd); page_table = pte_offset_map(pmd, address); /* @@ -1367,7 +1537,6 @@ do_no_page(struct mm_struct *mm, struct */ /* Only go through if we didn't race with anybody else... */ if (pte_none(*page_table)) { - ++mm->rss; flush_page_to_ram(new_page); flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); @@ -1376,18 +1545,19 @@ do_no_page(struct mm_struct *mm, struct set_pte(page_table, entry); pte_chain = page_add_rmap(new_page, page_table, pte_chain); pte_unmap(page_table); + increment_rss(ptepage); } else { /* One of our sibling threads was faster, back out. 
*/ pte_unmap(page_table); page_cache_release(new_page); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); ret = VM_FAULT_MINOR; goto out; } /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); ret = VM_FAULT_MAJOR; goto out; oom: @@ -1445,7 +1615,7 @@ static inline int handle_pte_fault(struc entry = pte_mkyoung(entry); establish_pte(vma, address, pte, entry); pte_unmap(pte); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(pmd_ptpage(*pmd)); return VM_FAULT_MINOR; } @@ -1474,9 +1644,13 @@ int handle_mm_fault(struct mm_struct *mm pmd = pmd_alloc(mm, pgd, address); if (pmd) { - pte_t * pte = pte_alloc_map(mm, pmd, address); - if (pte) + pte_t * pte; + + pte = pte_fault_alloc(mm, vma, pmd, address, write_access); + if (pte) { + spin_unlock(&mm->page_table_lock); return handle_pte_fault(mm, vma, address, write_access, pte, pmd); + } } spin_unlock(&mm->page_table_lock); return VM_FAULT_OOM; diff -urpN -X /home/fletch/.diff.exclude 001-bk10/mm/mmap.c 900-mjb5/mm/mmap.c --- 001-bk10/mm/mmap.c Sun Mar 16 13:38:21 2003 +++ 900-mjb5/mm/mmap.c Sun Mar 16 13:39:06 2003 @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include @@ -611,6 +613,7 @@ munmap_back: return -ENOMEM; goto munmap_back; } + unshare_page_range(mm, addr, len); /* Check against address space limit. */ if ((mm->total_vm << PAGE_SHIFT) + len @@ -1020,69 +1023,6 @@ find_extend_vma(struct mm_struct * mm, u } #endif -/* - * Try to free as many page directory entries as we can, - * without having to work very hard at actually scanning - * the page tables themselves. - * - * Right now we try to free page tables if we have a nice - * PGDIR-aligned area that got free'd up. We could be more - * granular if we want to, but this is fast and simple, - * and covers the bad cases. - * - * "prev", if it exists, points to a vma before the one - * we just free'd - but there's no telling how much before. - */ -static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev, - unsigned long start, unsigned long end) -{ - unsigned long first = start & PGDIR_MASK; - unsigned long last = end + PGDIR_SIZE - 1; - unsigned long start_index, end_index; - struct mm_struct *mm = tlb->mm; - - if (!prev) { - prev = mm->mmap; - if (!prev) - goto no_mmaps; - if (prev->vm_end > start) { - if (last > prev->vm_start) - last = prev->vm_start; - goto no_mmaps; - } - } - for (;;) { - struct vm_area_struct *next = prev->vm_next; - - if (next) { - if (next->vm_start < start) { - prev = next; - continue; - } - if (last > next->vm_start) - last = next->vm_start; - } - if (prev->vm_end > first) - first = prev->vm_end + PGDIR_SIZE - 1; - break; - } -no_mmaps: - if (last < first) /* for arches with discontiguous pgd indices */ - return; - /* - * If the PGD bits are not consecutive in the virtual address, the - * old method of shifting the VA >> by PGDIR_SHIFT doesn't work. - */ - start_index = pgd_index(first); - if (start_index < FIRST_USER_PGD_NR) - start_index = FIRST_USER_PGD_NR; - end_index = pgd_index(last); - if (end_index > start_index) { - clear_page_tables(tlb, start_index, end_index - start_index); - flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK); - } -} - /* Normal function to fix up a mapping * This function is the default for when an area has no specific * function. This may be used as part of a more specific routine. 
@@ -1148,7 +1088,6 @@ static void unmap_region(struct mm_struc tlb = tlb_gather_mmu(mm, 0); unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted); vm_unacct_memory(nr_accounted); - free_pgtables(tlb, prev, start, end); tlb_finish_mmu(tlb, start, end); } @@ -1407,25 +1346,16 @@ void build_mmap_rb(struct mm_struct * mm /* Release all mmaps. */ void exit_mmap(struct mm_struct *mm) { - struct mmu_gather *tlb; struct vm_area_struct *vma; - unsigned long nr_accounted = 0; profile_exit_mmap(mm); lru_add_drain(); - spin_lock(&mm->page_table_lock); - - tlb = tlb_gather_mmu(mm, 1); flush_cache_mm(mm); - /* Use ~0UL here to ensure all VMAs in the mm are unmapped */ - mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0, - ~0UL, &nr_accounted); - vm_unacct_memory(nr_accounted); - BUG_ON(mm->map_count); /* This is just debugging */ - clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD); - tlb_finish_mmu(tlb, 0, TASK_SIZE); + unmap_all_pages(mm); + + BUG_ON(mm->map_count); /* This is just debugging */ vma = mm->mmap; mm->mmap = mm->mmap_cache = NULL; @@ -1434,14 +1364,20 @@ void exit_mmap(struct mm_struct *mm) mm->total_vm = 0; mm->locked_vm = 0; - spin_unlock(&mm->page_table_lock); - /* * Walk the list again, actually closing and freeing it * without holding any MM locks. */ while (vma) { struct vm_area_struct *next = vma->vm_next; + + /* + * If the VMA has been charged for, account for its + * removal + */ + if (vma->vm_flags & VM_ACCOUNT) + vm_unacct_memory((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); + remove_shared_vm_struct(vma); if (vma->vm_ops) { if (vma->vm_ops->close) diff -urpN -X /home/fletch/.diff.exclude 001-bk10/mm/mprotect.c 900-mjb5/mm/mprotect.c --- 001-bk10/mm/mprotect.c Fri Dec 13 23:18:15 2002 +++ 900-mjb5/mm/mprotect.c Sun Mar 16 13:39:06 2003 @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include @@ -24,7 +26,7 @@ #include static inline void -change_pte_range(pmd_t *pmd, unsigned long address, +change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, unsigned long size, pgprot_t newprot) { pte_t * pte; @@ -37,11 +39,14 @@ change_pte_range(pmd_t *pmd, unsigned lo pmd_clear(pmd); return; } - pte = pte_offset_map(pmd, address); - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; + end = (address + PMD_SIZE) & PMD_MASK; + if (end > (address + size)) + end = address + size; + + pte = mprotect_shared_range(vma, pmd, address, end); + if (pte == NULL) + return; + do { if (pte_present(*pte)) { pte_t entry; @@ -56,11 +61,12 @@ change_pte_range(pmd_t *pmd, unsigned lo address += PAGE_SIZE; pte++; } while (address && (address < end)); + pte_page_unlock(pmd_ptpage(*pmd)); pte_unmap(pte - 1); } static inline void -change_pmd_range(pgd_t *pgd, unsigned long address, +change_pmd_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long address, unsigned long size, pgprot_t newprot) { pmd_t * pmd; @@ -74,12 +80,12 @@ change_pmd_range(pgd_t *pgd, unsigned lo return; } pmd = pmd_offset(pgd, address); - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; + end = (address + PGDIR_SIZE) & PGDIR_MASK; + if (end > (address + size)) + end = address + size; + do { - change_pte_range(pmd, address, end - address, newprot); + change_pte_range(vma, pmd, address, end - address, newprot); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -98,7 +104,7 @@ change_protection(struct vm_area_struct BUG(); spin_lock(¤t->mm->page_table_lock); do { 
- change_pmd_range(dir, start, end - start, newprot); + change_pmd_range(vma, dir, start, end - start, newprot); start = (start + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (start && (start < end)); diff -urpN -X /home/fletch/.diff.exclude 001-bk10/mm/mremap.c 900-mjb5/mm/mremap.c --- 001-bk10/mm/mremap.c Sun Mar 16 13:38:21 2003 +++ 900-mjb5/mm/mremap.c Sun Mar 16 13:39:06 2003 @@ -16,106 +16,23 @@ #include #include #include +#include #include #include #include #include -static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr) -{ - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte = NULL; - - pgd = pgd_offset(mm, addr); - if (pgd_none(*pgd)) - goto end; - if (pgd_bad(*pgd)) { - pgd_ERROR(*pgd); - pgd_clear(pgd); - goto end; - } - - pmd = pmd_offset(pgd, addr); - if (pmd_none(*pmd)) - goto end; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - goto end; - } - - pte = pte_offset_map_nested(pmd, addr); - if (pte_none(*pte)) { - pte_unmap_nested(pte); - pte = NULL; - } -end: - return pte; -} - -#ifdef CONFIG_HIGHPTE /* Save a few cycles on the sane machines */ -static inline int page_table_present(struct mm_struct *mm, unsigned long addr) -{ - pgd_t *pgd; - pmd_t *pmd; - - pgd = pgd_offset(mm, addr); - if (pgd_none(*pgd)) - return 0; - pmd = pmd_offset(pgd, addr); - return pmd_present(*pmd); -} -#else -#define page_table_present(mm, addr) (1) -#endif - -static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) -{ - pmd_t *pmd; - pte_t *pte = NULL; - - pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); - if (pmd) - pte = pte_alloc_map(mm, pmd, addr); - return pte; -} - -static int -copy_one_pte(struct mm_struct *mm, pte_t *src, pte_t *dst, - struct pte_chain **pte_chainp) -{ - int error = 0; - pte_t pte; - struct page *page = NULL; - - if (pte_present(*src)) - page = pte_page(*src); - - if (!pte_none(*src)) { - if (page) - page_remove_rmap(page, src); - pte = ptep_get_and_clear(src); - if (!dst) { - /* No dest? We must put it back. */ - dst = src; - error++; - } - set_pte(dst, pte); - if (page) - *pte_chainp = page_add_rmap(page, dst, *pte_chainp); - } - return error; -} - static int move_one_page(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr) { struct mm_struct *mm = vma->vm_mm; int error = 0; - pte_t *src, *dst; + struct ptpage *src_page, *dst_page; + pgd_t *src_pgd, *dst_pgd; + pmd_t *src_pmd, *dst_pmd; + pte_t *src_pte, *dst_pte; struct pte_chain *pte_chain; pte_chain = pte_chain_alloc(GFP_KERNEL); @@ -124,28 +41,61 @@ move_one_page(struct vm_area_struct *vma goto out; } spin_lock(&mm->page_table_lock); - src = get_one_pte_map_nested(mm, old_addr); - if (src) { - /* - * Look to see whether alloc_one_pte_map needs to perform a - * memory allocation. 
If it does then we need to drop the - * atomic kmap - */ - if (!page_table_present(mm, new_addr)) { - pte_unmap_nested(src); - src = NULL; + src_pgd = pgd_offset(mm, old_addr); + dst_pgd = pgd_offset(mm, new_addr); + src_pmd = pmd_offset(src_pgd, old_addr); + + /* If there isn't a pmd to copy from, we're done */ + if (!src_pmd) + goto out_unlock; + if (!pmd_present(*src_pmd)) + goto out_unlock; + + dst_pmd = pmd_alloc(mm, dst_pgd, new_addr); + if (!dst_pmd) { + error++; + goto out_unlock; + } + + mremap_unshare(vma->vm_mm, src_pmd, dst_pmd, old_addr, new_addr); + + dst_pte = pte_alloc_map(mm, dst_pmd, new_addr); + if (!dst_pte) { + error++; + goto out_unlock; + } + dst_page = pmd_ptpage(*dst_pmd); + pte_page_lock(dst_page); + + src_page = pmd_ptpage(*src_pmd); + if (src_page != dst_page) + pte_page_lock(src_page); + src_pte = pte_offset_map_nested(src_pmd, old_addr); + + if (!pte_none(*src_pte)) { + pte_t pte = ptep_get_and_clear(src_pte); + set_pte(dst_pte, pte); + if (pte_present(pte)) { + struct page *page = pte_page(pte); + page_remove_rmap(page, src_pte); + if (src_page != dst_page) { + decrement_rss(src_page); + increment_rss(dst_page); + } + pte_chain = page_add_rmap(page, dst_pte, pte_chain); } - dst = alloc_one_pte_map(mm, new_addr); - if (src == NULL) - src = get_one_pte_map_nested(mm, old_addr); - error = copy_one_pte(mm, src, dst, &pte_chain); - pte_unmap_nested(src); - pte_unmap(dst); } + pte_unmap_nested(src_pte); + pte_unmap(dst_pte); + pte_page_unlock(dst_page); + if (src_page != dst_page) + pte_page_unlock(src_page); flush_tlb_page(vma, old_addr); + +out_unlock: spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); out: + pte_chain_free(pte_chain); return error; } diff -urpN -X /home/fletch/.diff.exclude 001-bk10/mm/msync.c 900-mjb5/mm/msync.c --- 001-bk10/mm/msync.c Sun Nov 17 20:29:31 2002 +++ 900-mjb5/mm/msync.c Sun Mar 16 13:39:06 2003 @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include @@ -43,6 +45,7 @@ static int filemap_sync_pte_range(pmd_t unsigned long address, unsigned long end, struct vm_area_struct *vma, unsigned int flags) { + struct ptpage *ptepage; pte_t *pte; int error; @@ -53,6 +56,8 @@ static int filemap_sync_pte_range(pmd_t pmd_clear(pmd); return 0; } + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); pte = pte_offset_map(pmd, address); if ((address & PMD_MASK) != (end & PMD_MASK)) end = (address & PMD_MASK) + PMD_SIZE; @@ -64,6 +69,7 @@ static int filemap_sync_pte_range(pmd_t } while (address && (address < end)); pte_unmap(pte - 1); + pte_page_unlock(ptepage); return error; } diff -urpN -X /home/fletch/.diff.exclude 001-bk10/mm/page_alloc.c 900-mjb5/mm/page_alloc.c --- 001-bk10/mm/page_alloc.c Thu Feb 13 11:08:15 2003 +++ 900-mjb5/mm/page_alloc.c Sun Mar 16 13:39:02 2003 @@ -85,7 +85,8 @@ static void bad_page(const char *functio page->mapping = NULL; } -#ifndef CONFIG_HUGETLB_PAGE +#if !defined(CONFIG_HUGETLB_PAGE) && !defined(CONFIG_CRASH_DUMP) \ + && !defined(CONFIG_CRASH_DUMP_MODULE) #define prep_compound_page(page, order) do { } while (0) #define destroy_compound_page(page, order) do { } while (0) #else @@ -220,6 +221,8 @@ static inline void free_pages_check(cons bad_page(function, page); if (PageDirty(page)) ClearPageDirty(page); + if (PageAnon(page)) + ClearPageAnon(page); } /* @@ -265,6 +268,7 @@ void __free_pages_ok(struct page *page, mod_page_state(pgfree, 1 << order); free_pages_check(__FUNCTION__, page); + list_add(&page->list, &list); free_pages_bulk(page_zone(page), 1, &list, order); } @@ -447,6 
+451,7 @@ static void free_hot_cold_page(struct pa inc_page_state(pgfree); free_pages_check(__FUNCTION__, page); + pcp = &zone->pageset[get_cpu()].pcp[cold]; local_irq_save(flags); if (pcp->count >= pcp->high) diff -urpN -X /home/fletch/.diff.exclude 001-bk10/mm/ptshare.c 900-mjb5/mm/ptshare.c --- 001-bk10/mm/ptshare.c Wed Dec 31 16:00:00 1969 +++ 900-mjb5/mm/ptshare.c Sun Mar 16 13:39:06 2003 @@ -0,0 +1,841 @@ +/* + * mm/ptshare.c + * + * Shared page table support. + * + * Created 2002 by Dave McCracken (dmccr@us.ibm.com) + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * Protections that can be set on the pmd entry (see discussion in mmap.c). + */ +static pgprot_t protection_pmd[8] = { + __PMD000, __PMD001, __PMD010, __PMD011, __PMD100, __PMD101, __PMD110, __PMD111 +}; + +/** + * is_pte_shared - Basic test for whether a pte page is shared + * @ptepage - the struct page of the pte page to test + * + * The count field in the page struct counts how many page tables are using this pte + * page. The share test simply tests for more then one reference. + */ +static inline int is_pte_shared(struct ptpage *ptepage) +{ + return page_count(ptepage) > 1; +} + +/** + * pte_needs_unshare - Test whether a pte page needs to be unshared at fault time + * @mm - The mm_struct being faulted + * @vma - The vma describing the range the faulting address is in + * @pmd - The pmd entry of the faulting address + * @address - The faulting address itself + * @write_access - True if it was a write fault + * + * This function makes the decision whether a pte page needs to be + * unshared or not. Note that page_count() == 1 isn't even tested + * here. The assumption is that if the pmd entry is marked writeable, + * then the page is either already unshared or doesn't need to be + * unshared. This catches the situation where task B unshares the pte + * page, then task A faults and needs to unprotect the pmd entry. + * This is actually done in pte_unshare. + * + * This function should be called with the page_table_lock held. + */ +static int pte_needs_unshare(struct mm_struct *mm, + struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address, + int write_access) +{ + struct ptpage *ptepage; + + /* It's not even there, nothing to unshare. */ + if (!pmd_present(*pmd)) + return 0; + + /* + * If it's already writable, then it doesn't need to be unshared. + * It's either already not shared or it's part of a large shared + * region that will never need to be unshared. + */ + if (pmd_write(*pmd)) + return 0; + + /* If this isn't a write fault we don't need to unshare. */ + if (!write_access) + return 0; + + /* + * If this page fits entirely inside a shared region, don't unshare it. + */ + ptepage = pmd_ptpage(*pmd); + if ((vma->vm_flags & VM_SHARED) && + (vma->vm_start <= ptepage->virtual) && + (vma->vm_end >= (ptepage->virtual + PMD_SIZE))) { + return 0; + } + /* + * Ok, we have to unshare. + */ + return 1; +} + +/** + * pte_unshare - Unshare a pte page + * @mm: the mm_struct that gets an unshared pte page + * @pmd: a pointer to the pmd entry that needs unsharing + * @address: the virtual address that triggered the unshare + * + * Here is where a pte page is actually unshared. It actually covers + * a couple of possible conditions. If the page_count() is already 1, + * then that means it just needs to be set writeable. Otherwise, a + * new page needs to be allocated. 
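As a rough illustration of the decision just described (leave the pmd alone unless this is a write fault on a read-only, genuinely shared pte page; then either mark it writable when we are the last user or copy it), the following self-contained userspace sketch models the same logic. It is not part of the patch; MODEL_PMD_SIZE, struct ptpage_model and the other names are invented stand-ins for the kernel structures.

/*
 * Illustrative userspace model of the unshare decision -- not patch code.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define MODEL_PMD_SIZE (512UL * 4096UL)	/* one pte page maps 2MB in this model */

struct ptpage_model {
	int count;		/* how many page tables use this pte page */
	int pmd_writable;	/* is the pmd entry mapping it writable? */
	unsigned long virtual;	/* first address the pte page maps */
};

struct vma_model {
	unsigned long start, end;
	bool shared;		/* stands in for VM_SHARED */
};

/* Mirrors the logic described for pte_needs_unshare() above. */
static bool needs_unshare(struct ptpage_model *pt, struct vma_model *vma,
			  bool present, bool write_fault)
{
	if (!present)
		return false;		/* nothing mapped, nothing to unshare */
	if (pt->pmd_writable)
		return false;		/* already private (or safely shared) */
	if (!write_fault)
		return false;		/* reads can keep using the shared copy */
	/* a shared mapping covering the whole pte page never needs unsharing */
	if (vma->shared && vma->start <= pt->virtual &&
	    vma->end >= pt->virtual + MODEL_PMD_SIZE)
		return false;
	return true;
}

/* Mirrors the two cases in the pte_unshare() comment above. */
static struct ptpage_model *unshare(struct ptpage_model *pt)
{
	if (pt->count == 1) {		/* sole user: just make it writable */
		pt->pmd_writable = 1;
		return pt;
	}
	/* otherwise copy: new private page, old one loses a reference */
	struct ptpage_model *copy = malloc(sizeof(*copy));
	if (!copy)
		abort();
	copy->count = 1;
	copy->pmd_writable = 1;
	copy->virtual = pt->virtual;
	pt->count--;
	return copy;
}

int main(void)
{
	struct ptpage_model pt = { .count = 2, .pmd_writable = 0, .virtual = 0 };
	struct vma_model vma = { .start = 0, .end = MODEL_PMD_SIZE / 2, .shared = false };

	if (needs_unshare(&pt, &vma, true, true)) {
		struct ptpage_model *mine = unshare(&pt);

		printf("old users: %d, private copy writable: %d\n",
		       pt.count, mine->pmd_writable);
		if (mine != &pt)
			free(mine);
	}
	return 0;
}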
+ * + * When each pte entry is copied, it is evaluated for COW protection, + * as well as checking whether the swap count needs to be incremented. + * + * This function must be called with the page_table_lock held. It + * will release and reacquire the lock when it allocates a new page. + * + * The function must also be called with the pte_page_lock held on the + * old page. This lock will also be dropped, then reacquired when we + * allocate a new page. The pte_page_lock will be taken on the new + * page. Whichever pte page is returned will have its pte_page_lock + * held. + */ + +static pte_t *pte_unshare(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + pte_t *src_ptb, *dst_ptb; + struct ptpage *oldpage, *newpage, *tmppage; + struct vm_area_struct *vma; + struct pte_chain *pte_chain = NULL; + int base, addr; + int end, page_end; + int src_unshare; + +retry: + tmppage = oldpage = pmd_ptpage(*pmd); + + /* If it's already unshared, we just need to set it writeable */ + if (!is_pte_shared(oldpage)) + goto is_unshared; + + pte_page_unlock(oldpage); + spin_unlock(&mm->page_table_lock); + newpage = pte_alloc_one(mm, address); + if (newpage) { + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (pte_chain) { + down(&oldpage->sem); + } + } + spin_lock(&mm->page_table_lock); + if (unlikely(!newpage)) + return NULL; + if (!pte_chain) { + put_page((struct page *)newpage); + return NULL; + } + + /* + * Fetch the ptepage pointer again in case it changed while + * the lock was dropped. + */ + oldpage = pmd_ptpage(*pmd); + pte_page_lock(oldpage); + if (tmppage != oldpage) { + up(&tmppage->sem); + pte_free(newpage); + pte_chain_free(pte_chain); + goto retry; + } + + /* See if it got unshared while we dropped the lock */ + if (!is_pte_shared(oldpage)) { + pte_free(newpage); + up(&oldpage->sem); + goto is_unshared; + } + + pte_page_lock(newpage); + + init_MUTEX(&newpage->sem); + newpage->mapcount = newpage->swapcount = 0; + + base = addr = oldpage->virtual; + page_end = base + PMD_SIZE; + vma = find_vma(mm, base); + src_unshare = page_count(oldpage) == 2; + dst_ptb = pte_page_map((struct page *)newpage, base); + + if (!vma || (page_end <= vma->vm_start)) { + goto no_vma; + } + + if (vma->vm_start > addr) + addr = vma->vm_start; + + if (vma->vm_end < page_end) + end = vma->vm_end; + else + end = page_end; + + src_ptb = pte_page_map_nested((struct page *)oldpage, base); + + do { + unsigned int cow = 0; + pte_t *src_pte = src_ptb + pte_index(addr); + pte_t *dst_pte = dst_ptb + pte_index(addr); + + cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; + + do { + pte_t pte = *src_pte; + struct page *page; + + if (pte_none(pte)) + goto unshare_skip_set; + + if (!pte_present(pte)) { + swap_duplicate(pte_to_swp_entry(pte)); + set_pte(dst_pte, pte); + newpage->swapcount++; + goto unshare_skip_set; + } + page = pte_page(pte); + if (!PageReserved(page)) { + /* COW mappings require write protecting both sides */ + if (cow) { + pte = pte_wrprotect(pte); + if (src_unshare) + set_pte(src_pte, pte); + } + /* If it's a shared mapping, + * mark it clean in the new mapping + */ + if (vma->vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + get_page(page); + newpage->mapcount++; + } + set_pte(dst_pte, pte); + pte_chain = page_add_rmap(page, dst_pte, pte_chain); + if (!pte_chain) + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (!pte_chain) { + pte_unmap_nested(src_ptb); + pte_unmap(dst_ptb); + pte_page_unlock(newpage); + pte_page_unlock(oldpage); + spin_unlock(&mm->page_table_lock); + 
pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + spin_lock(&mm->page_table_lock); + return NULL; + } + spin_lock(&mm->page_table_lock); + oldpage = pmd_ptpage(*pmd); + pte_page_lock(oldpage); + pte_page_lock(newpage); + dst_ptb = pte_page_map((struct page *)newpage, addr); + src_ptb = pte_page_map_nested((struct page *)oldpage, addr); + } +unshare_skip_set: + src_pte++; + dst_pte++; + addr += PAGE_SIZE; + } while (addr < end); + + if (addr >= page_end) + break; + + vma = vma->vm_next; + if (!vma) + break; + + if (page_end <= vma->vm_start) + break; + + addr = vma->vm_start; + if (vma->vm_end < page_end) + end = vma->vm_end; + else + end = page_end; + } while (1); + + pte_unmap_nested(src_ptb); + +no_vma: + up(&oldpage->sem); + SetPagePtepage(newpage); + pgtable_remove_rmap_locked(oldpage, mm); + pgtable_add_rmap_locked(newpage, mm, base); + pmd_populate(mm, pmd, newpage); + inc_page_state(nr_page_table_pages); + + flush_tlb_mm(mm); + + put_page((struct page *)oldpage); + + pte_page_unlock(oldpage); + pte_chain_free(pte_chain); + return dst_ptb + pte_index(address); + +is_unshared: + pmd_populate(mm, pmd, oldpage); + flush_tlb_mm(mm); + pte_chain_free(pte_chain); + return pte_offset_map(pmd, address); +} + +/** + * pte_try_to_share - Attempt to find a pte page that can be shared + * @mm: the mm_struct that needs a pte page + * @vma: the vm_area the address is in + * @pmd: a pointer to the pmd entry that needs filling + * @address: the address that caused the fault + * + * This function is called during a page fault. If there is no pte + * page for this address, it checks the vma to see if it is shared, + * and if it spans the pte page. If so, it goes to the address_space + * structure and looks through for matching vmas from other tasks that + * already have a pte page that can be shared. If it finds one, it + * attaches it and makes it a shared page. + */ + +static pte_t *pte_try_to_share(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address) +{ + struct address_space *as; + struct vm_area_struct *lvma; + struct ptpage *ptepage; + unsigned long base; + pte_t *pte = NULL; + + /* It's not even shared memory. We definitely can't share the page. */ + if (!(vma->vm_flags & VM_SHARED)) + return NULL; + + /* Areas with nonlinear mappings can't be shared */ + if (vma->vm_flags & VM_NONLINEAR) + return NULL; + + /* We can only share if the entire pte page fits inside the vma */ + base = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); + if ((base < vma->vm_start) || (vma->vm_end < (base + PMD_SIZE))) + return NULL; + + as = vma->vm_file->f_dentry->d_inode->i_mapping; + + down(&as->i_shared_sem); + + list_for_each_entry(lvma, &as->i_mmap_shared, shared) { + pgd_t *lpgd; + pmd_t *lpmd; + pmd_t pmdval; + + /* Skip the one we're working on */ + if (lvma == vma) + continue; + + /* We can't share with a nonlinear vma */ + if (lvma->vm_flags & VM_NONLINEAR) + return NULL; + + /* It has to be mapping to the same address */ + if ((lvma->vm_start != vma->vm_start) || + (lvma->vm_end != vma->vm_end) || + (lvma->vm_pgoff != vma->vm_pgoff)) + continue; + + lpgd = pgd_offset(lvma->vm_mm, address); + lpmd = pmd_offset(lpgd, address); + + /* This page table doesn't have a pte page either, so skip it. */ + if (!pmd_present(*lpmd)) + continue; + + /* Ok, we can share it. 
*/ + + ptepage = pmd_ptpage(*lpmd); + pte_page_lock(ptepage); + get_page(ptepage); + pgtable_add_rmap_locked(ptepage, mm, address); + /* + * If this vma is only mapping it read-only, set the + * pmd entry read-only to protect it from writes. + * Otherwise set it writeable. + */ + pmdval = *lpmd; + pmdval = pmd_modify(pmdval, protection_pmd[vma->vm_flags & 0x7]); + set_pmd(pmd, pmdval); + pte = pte_page_map((struct page *)ptepage, address); + break; + } + up(&as->i_shared_sem); + return pte; +} + +#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) + +/** + * share_page_range - share a range of pages at the pte page level at fork time + * @dst: the mm_struct of the forked child + * @src: the mm_struct of the forked parent + * @vma: the vm_area to be shared + * @prev_pmd: A pointer to the pmd entry we did at last invocation + * + * This function shares pte pages between parent and child at fork. + * If the vm_area is shared and spans the page, it sets it + * writeable. Otherwise it sets it read-only. The prev_pmd parameter + * is used to keep track of pte pages we've already shared, since this + * function can be called with multiple vmas that point to the same + * pte page. + */ +int share_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma, pmd_t **prev_pmd) +{ + pgd_t *src_pgd, *dst_pgd; + unsigned long address = vma->vm_start; + unsigned long end = vma->vm_end; + + if (is_vm_hugetlb_page(vma)) + return copy_hugetlb_page_range(dst, src, vma); + + src_pgd = pgd_offset(src, address)-1; + dst_pgd = pgd_offset(dst, address)-1; + + for (;;) { + pmd_t * src_pmd, * dst_pmd; + + src_pgd++; dst_pgd++; + + if (pgd_none(*src_pgd)) + goto skip_share_pmd_range; + if (pgd_bad(*src_pgd)) { + pgd_ERROR(*src_pgd); + pgd_clear(src_pgd); +skip_share_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; + if (!address || (address >= end)) + goto out; + continue; + } + + src_pmd = pmd_offset(src_pgd, address); + dst_pmd = pmd_alloc(dst, dst_pgd, address); + if (!dst_pmd) + goto nomem; + + spin_lock(&src->page_table_lock); + + do { + pmd_t pmdval = *src_pmd; + struct ptpage *ptepage = pmd_ptpage(pmdval); + + if (pmd_none(pmdval)) + goto skip_share_pte_range; + if (pmd_bad(pmdval)) { + pmd_ERROR(*src_pmd); + pmd_clear(src_pmd); + goto skip_share_pte_range; + } + + /* + * We set the pmd read-only in both the parent and the + * child unless it's a writeable shared region that + * spans the entire pte page. 
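The write-protect rule above boils down to one predicate: the pmd entry may stay writable across fork only for a writable shared mapping that covers the whole pte page. A minimal standalone sketch of that predicate, with invented flag values and a made-up EX_PMD_SIZE, looks like this (illustrative only, not patch code):

#include <stdbool.h>
#include <stdio.h>

#define EX_PMD_SIZE  (512UL * 4096UL)	/* span of one pte page in this model */
#define EX_VM_SHARED 0x1
#define EX_VM_WRITE  0x2

/*
 * The pmd entry stays writable only for a writable shared mapping that
 * covers the whole pte page; every other case is downgraded to read-only
 * in both parent and child so a later write fault can unshare it.
 */
static bool keep_pmd_writable(unsigned long vm_flags,
			      unsigned long vm_start, unsigned long vm_end,
			      unsigned long ptepage_virtual)
{
	if ((vm_flags & (EX_VM_SHARED | EX_VM_WRITE)) != (EX_VM_SHARED | EX_VM_WRITE))
		return false;
	if (ptepage_virtual < vm_start)
		return false;
	if (ptepage_virtual + EX_PMD_SIZE > vm_end)
		return false;
	return true;
}

int main(void)
{
	/* private writable mapping: must be write-protected at fork */
	printf("%d\n", keep_pmd_writable(EX_VM_WRITE, 0, EX_PMD_SIZE, 0));
	/* writable shared mapping spanning the pte page: stays writable */
	printf("%d\n", keep_pmd_writable(EX_VM_SHARED | EX_VM_WRITE,
					 0, EX_PMD_SIZE, 0));
	return 0;
}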
+ */ + if ((((vma->vm_flags & (VM_SHARED|VM_WRITE)) != + (VM_SHARED|VM_WRITE)) || + (ptepage->virtual < vma->vm_start) || + ((ptepage->virtual + PMD_SIZE) > vma->vm_end)) && + pmd_write(pmdval)) { + pmdval = pmd_wrprotect(pmdval); + set_pmd(src_pmd, pmdval); + } + set_pmd(dst_pmd, pmdval); + + /* Only do this if we haven't seen this pte page before */ + if (src_pmd != *prev_pmd) { + get_page(ptepage); + pgtable_add_rmap(ptepage, dst, address); + atomic_inc(&dst->ptepages); + *prev_pmd = src_pmd; + dst->rss += ptepage->mapcount; + } + +skip_share_pte_range: address = (address + PMD_SIZE) & PMD_MASK; + if (address >= end) + goto out_unlock; + + src_pmd++; + dst_pmd++; + } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + spin_unlock(&src->page_table_lock); + } + +out_unlock: + spin_unlock(&src->page_table_lock); + +out: + return 0; +nomem: + return -ENOMEM; +} + +/** + * fork_page_range - Either copy or share a page range at fork time + * @dst: the mm_struct of the forked child + * @src: the mm_struct of the forked parent + * @vma: the vm_area to be shared + * @prev_pmd: A pointer to the pmd entry we did at last invocation + * + * This wrapper decides whether to share page tables on fork or just make + * a copy. The current criterion is whether a page table has more than 3 + * pte pages, since all forked processes will unshare 3 pte pages after fork, + * even the ones doing an immediate exec. Tests indicate that if a page + * table has more than 3 pte pages, it's a performance win to share. + */ +int fork_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma, pmd_t **prev_pmd) +{ + if (atomic_read(&src->ptepages) > 3) + return share_page_range(dst, src, vma, prev_pmd); + + return copy_page_range(dst, src, vma); +} + +/** + * unshare_page_range - Make sure no pte pages are shared in a given range + * @mm: the mm_struct whose page table we unshare from + * @address: the base address of the range + * @len: the size of the range + * + * This function is called when a memory region is mapped. It makes sure there + * are no shared pte pages in the region. This is necessary to make sure the + * parent and child don't try to map competing regions into the same shared + * pte page. 
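A simplified way to picture unshare_page_range() is a walk over the affected range in pte-page-sized steps, replacing any shared pte page with a private copy. The toy userspace model below captures that shape; the array-based page table and the m_* names are invented for the example, and it elides the locking and rmap bookkeeping the real code does.

#include <stdio.h>
#include <stdlib.h>

#define M_PMD_SHIFT 21
#define M_PMD_SIZE  (1UL << M_PMD_SHIFT)
#define M_NPMDS     64			/* toy address space: 64 pte pages */

struct m_ptpage {
	int count;			/* >1 means shared between page tables */
};

static struct m_ptpage *pagetable[M_NPMDS];	/* one slot per pmd entry */

/* Replace a shared pte page with a private copy (details elided). */
static void m_unshare(int idx)
{
	struct m_ptpage *priv = calloc(1, sizeof(*priv));

	if (!priv)
		abort();
	priv->count = 1;
	pagetable[idx]->count--;	/* old page loses this user */
	pagetable[idx] = priv;
}

/* Make sure no pte page backing [addr, addr + len) is shared. */
static void m_unshare_range(unsigned long addr, unsigned long len)
{
	unsigned long end = addr + len;

	while (addr < end) {
		int idx = addr >> M_PMD_SHIFT;

		if (pagetable[idx] && pagetable[idx]->count > 1)
			m_unshare(idx);
		addr = (addr + M_PMD_SIZE) & ~(M_PMD_SIZE - 1);
	}
}

int main(void)
{
	struct m_ptpage shared = { .count = 2 };

	pagetable[0] = &shared;
	m_unshare_range(0x1000, 2 * M_PMD_SIZE);
	printf("private copy users: %d, old page users: %d\n",
	       pagetable[0]->count, shared.count);
	return 0;
}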
+ */ +void unshare_page_range(struct mm_struct *mm, unsigned long address, unsigned long len) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + struct ptpage *ptepage; + unsigned long end = address + len; + unsigned long pmd_end; + + spin_lock(&mm->page_table_lock); + + do { + pmd_end = (address + PGDIR_SIZE) & PGDIR_MASK; + if (pmd_end > end) + pmd_end = end; + + pgd = pgd_offset(mm, address); + if (pgd_present(*pgd)) do { + pmd = pmd_offset(pgd, address); + if (pmd_present(*pmd)) { + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); + if (is_pte_shared(ptepage)) { + pte = pte_unshare(mm, pmd, address); + pte_unmap(pte); + ptepage = pmd_ptpage(*pmd); + } + pte_page_unlock(ptepage); + } + address = (address + PMD_SIZE) & PMD_MASK; + } while (address < pmd_end); + /* The end of the last time around is the start of the next one */ + address = pmd_end; + } while (address < end); + spin_unlock(&mm->page_table_lock); +} + +/** + * pte_alloc_unshare - Map and return an unshared pte page, allocating one if necessary + * @mm - The current mm_struct + * @pmd - The pmd entry that needs to be mapped and/or allocated + * @address - The current address, needed if a new pte page is allocated + * + * For a given pmd entry, make sure a pte page exists and is not shared, then map + * it and return it locked. + * + * This function must be called with the page_table_lock held. It takes the + * pte_page_lock for the pte page being returned and returns with it locked. + * It is up to the caller to unlock it. If the pte_alloc_map fails, NULL is + * returned and no lock is taken. + */ +pte_t *pte_alloc_unshare(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + pte_t *pte; + + if (pmd_present(*pmd)) { + struct ptpage *ptepage; + + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); + if (is_pte_shared(ptepage)) { + pte = pte_unshare(mm, pmd, address); + } else + pte = pte_offset_map(pmd, address); + } else { + pte = pte_alloc_map(mm, pmd, address); + if (pte) + pte_page_lock(pmd_ptpage(*pmd)); + } + return pte; +} + +/** + * pte_map_unshare - if a pmd_entry exists, make sure it is unshared and map it + * @mm - The current mm_struct + * @pmd - The pmd entry that needs to be mapped + * @address - The current address, needed if it's unshared. + * + * If a pmd entry is valid, make sure the pte page is unshared, then map it + * and return it locked. If none exists, return NULL. + * + * This function must be called with the page_table_lock held. It takes the + * pte_page_lock for the pte page being returned and returns with it locked + * if one exists. It is up to the caller to unlock it. if no pte page exists + * no lock is taken. + */ +pte_t *pte_map_unshare(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + pte_t *pte; + + if (pmd_present(*pmd)) { + struct ptpage *ptepage; + + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); + if (is_pte_shared(ptepage)) { + pte = pte_unshare(mm, pmd, address); + } else + pte = pte_offset_map(pmd, address); + } else + pte = NULL; + + return pte; +} + +/** + * zap_shared_range - helper function for zap_pmd_range in mm/memory.c + * @tlb - The mmu_gather_t being used to coalesce deleted pages + * @pmd - The pmd entry currently being worked on + * @address - The start of the current range + * @end - The end of the current range + * + * Returns false if the pte page was shared and the count decremented, + * true if the page wasn't shared or was unshared. + * + * This function is called as part of deleting a range of pages from a page + * table. 
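The pte_alloc_unshare() contract described above (lock the pte page, unshare it if it is shared, or allocate a fresh one when none exists, and hand it back locked) can be sketched in a few lines of standalone C. The x_* names and the boolean "locked" field are invented; this illustrates the control flow, not the kernel implementation.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct x_ptpage {
	int count;		/* number of page tables using this pte page */
	bool locked;		/* stands in for pte_page_lock() */
};

static struct x_ptpage *x_unshare(struct x_ptpage *old)
{
	struct x_ptpage *new = calloc(1, sizeof(*new));

	if (!new)
		return NULL;
	new->count = 1;
	new->locked = true;	/* the caller gets the replacement locked */
	old->count--;
	old->locked = false;
	return new;
}

/* Returns a locked, unshared pte page for the slot, or NULL on failure. */
static struct x_ptpage *x_alloc_unshare(struct x_ptpage **slot)
{
	struct x_ptpage *pt = *slot;

	if (pt) {
		pt->locked = true;		/* pte_page_lock() */
		if (pt->count > 1) {		/* shared: give the caller a copy */
			struct x_ptpage *new = x_unshare(pt);

			if (!new) {
				pt->locked = false;
				return NULL;
			}
			*slot = new;
			return new;
		}
		return pt;			/* already private */
	}
	pt = calloc(1, sizeof(*pt));		/* no pte page yet: allocate one */
	if (pt) {
		pt->count = 1;
		pt->locked = true;
		*slot = pt;
	}
	return pt;
}

int main(void)
{
	struct x_ptpage shared = { .count = 2 };
	struct x_ptpage *slot = &shared;
	struct x_ptpage *mine = x_alloc_unshare(&slot);

	printf("old users: %d, my users: %d, locked: %d\n",
	       shared.count, mine->count, mine->locked);
	return 0;
}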
It takes care of detecting when a pmd entry points to a shared pte + * page. + * + * If the pte page is shared and the range covers the entire pte page, + * the share count is decremented and the function returns false. If + * the range does not cover the entire range, the pte page is unshared. + * If the pte page is not shared or was unshared, the pte_page_lock is taken + * and the function returns true. It is the responsibility of the caller + * to unlock it. + */ +int zap_shared_range(struct mmu_gather **tlb, pmd_t *pmd, + unsigned long address, unsigned long end) +{ + struct mm_struct *mm = (*tlb)->mm; + struct ptpage *ptepage; + int ret = 1; + + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); + if (is_pte_shared(ptepage)) { + if ((address <= ptepage->virtual) && + (end >= (ptepage->virtual + PMD_SIZE))) { + pmd_clear(pmd); + pgtable_remove_rmap_locked(ptepage, mm); + mm->rss -= ptepage->mapcount; + atomic_dec(&mm->ptepages); + put_page((struct page *)ptepage); + pte_page_unlock(ptepage); + ret = 0; + } else { + pte_t *pte; + + tlb_finish_mmu(*tlb, address, end); + pte = pte_unshare(mm, pmd, address); + pte_unmap(pte); + *tlb = tlb_gather_mmu(mm, 0); + } + + } + return ret; +} + +/** + * zap_shared_pmd - helper function for unmap_all_pages in mm/memory.c + * @mm - The mm_struct this page table is associated with + * @pmd - The pmd entry currently being worked on + * + * Returns false if the pte page was shared and the count decremented, + * true if the page wasn't shared. + * + * This function is called when an entire page table is being removed. It + * detects when a pte page is shared and takes care of decrementing the count. + */ +int zap_shared_pmd(struct mm_struct *mm, pmd_t *pmd) +{ + struct ptpage *ptepage; + int ret = 1; + + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); + if (is_pte_shared(ptepage)) { + pmd_clear(pmd); + pgtable_remove_rmap_locked(ptepage, mm); + mm->rss -= ptepage->mapcount; + atomic_dec(&mm->ptepages); + put_page((struct page *)ptepage); + ret = 0; + } + pte_page_unlock(ptepage); + return ret; +} + +/** + * mprotect_shared_range - Helper function for change_pte_range in mm/mprotect.c + * @vma - The memory area being changed + * @pmd - The current pmd entry + * @address - The base of the current range + * @end - The end of the current range + * + * If the current range spans the entire pte page, set protections at the pmd entry + * level and return NULL to show nothing else needs to be done. Otherwise lock and + * map the pte page to be worked on. It is up to the caller to unmap the pte pointer + * and unlock the pte_page_lock if the pte page is returned. 
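The key shortcut in zap_shared_range() is the coverage test: when the range being unmapped spans the entire shared pte page, this page table can simply drop its reference instead of visiting every pte. Below is a self-contained sketch of that test with invented Z_* names; it leaves out the locking, rmap and TLB handling of the real code.

#include <stdbool.h>
#include <stdio.h>

#define Z_PMD_SIZE (512UL * 4096UL)

struct z_ptpage {
	int count;
	unsigned long virtual;		/* first address this pte page maps */
};

/* Returns true when the caller still has to zap individual ptes. */
static bool z_zap_shared_range(struct z_ptpage *pt,
			       unsigned long address, unsigned long end)
{
	if (pt->count > 1 &&
	    address <= pt->virtual && end >= pt->virtual + Z_PMD_SIZE) {
		pt->count--;	/* whole page covered: just drop our reference */
		return false;
	}
	/*
	 * Partially covered (or not shared at all): the real code unshares
	 * here so the per-pte teardown only touches this table's copy.
	 */
	return true;
}

int main(void)
{
	struct z_ptpage pt = { .count = 2, .virtual = Z_PMD_SIZE };

	/* unmap exactly the range this pte page covers */
	bool zap = z_zap_shared_range(&pt, Z_PMD_SIZE, 2 * Z_PMD_SIZE);

	printf("zap ptes: %d, remaining users: %d\n", zap, pt.count);
	return 0;
}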
+ */ +pte_t *mprotect_shared_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, unsigned long end) +{ + struct ptpage *ptepage; + pte_t *pte; + + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); + + if (is_pte_shared(ptepage)) { + if (((address & ~PMD_MASK) == 0) && ((end & ~PMD_MASK) == 0)) { + pmd_t pmdval = *pmd; + + pmdval = pmd_modify(pmdval, protection_pmd[vma->vm_flags & 0x7]); + set_pmd(pmd, pmdval); + pte_page_unlock(ptepage); + pte = NULL; + } else + pte = pte_unshare(vma->vm_mm, pmd, address); + } else + pte = pte_offset_map(pmd, address); + + return pte; +} + +/** + * mremap_unshare - Helper function for move_one_page in mm/mremap.c + * @mm - The current mm_struct + * @src_pmd - The originating pmd entry + * @dst_pmd - The target pmd entry + * @src_addr - The source address + * @dst_addr - The destination address + * + * Make sure both source and destination are unshared for mremap. Note that + * the existence of src_pmd is guaranteed by the caller, but dst_pmd may + * not exist. The mapping is discarded here since mremap needs them mapped + * differently. + * + * Both the page_table_lock and the mmap_sem are held when this function is called, + * so it is safe to not keep the pte_page_locks for these pages when it's finished. + */ + +void mremap_unshare(struct mm_struct *mm, pmd_t *src_pmd, pmd_t *dst_pmd, + unsigned long src_addr, unsigned long dst_addr) +{ + struct ptpage *ptepage; + pte_t *pte; + + ptepage = pmd_ptpage(*src_pmd); + pte_page_lock(ptepage); + if (is_pte_shared(ptepage)) { + pte = pte_unshare(mm, src_pmd, src_addr); + pte_unmap(pte); + ptepage = pmd_ptpage(*src_pmd); + } + pte_page_unlock(ptepage); + + if ((src_pmd != dst_pmd) && + (pmd_present(*dst_pmd))) { + ptepage = pmd_ptpage(*dst_pmd); + pte_page_lock(ptepage); + if (is_pte_shared(ptepage)) { + pte = pte_unshare(mm, dst_pmd, dst_addr); + pte_unmap(pte); + ptepage = pmd_ptpage(*dst_pmd); + } + pte_page_unlock(ptepage); + } +} + +/** + * pte_fault_alloc - Helper function for handle_mm_fault in mm/memory.c + * @mm - The faulting mm_struct + * @vma The area the fault is in + * @pmd - The pmd entry that needs handling + * @address - The faulting addresss + * @write_access - True if it's a write fault + * + * This function takes care of allocating and/or sharing/unsharing the pte + * page on a page fault. It determines the shareability of the pte page based + * on the type of fault and the flags in the vma. It then locks and maps + * the pte page before returning a pointer to the pte entry that needs to + * be filled in by the fault. + */ +pte_t *pte_fault_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address, int write_access) +{ + pte_t *pte; + + if (pmd_present(*pmd)) { + pte_page_lock(pmd_ptpage(*pmd)); + if (pte_needs_unshare(mm, vma, pmd, address, write_access)) + pte = pte_unshare(mm, pmd, address); + else + pte = pte_offset_map(pmd, address); + } else { + pte = pte_try_to_share(mm, vma, pmd, address); + if (!pte) { + pte = pte_alloc_map(mm, pmd, address); + if (pte) + pte_page_lock(pmd_ptpage(*pmd)); + } + } + return pte; +} diff -urpN -X /home/fletch/.diff.exclude 001-bk10/mm/rmap.c 900-mjb5/mm/rmap.c --- 001-bk10/mm/rmap.c Thu Jan 9 19:16:15 2003 +++ 900-mjb5/mm/rmap.c Sun Mar 16 13:39:06 2003 @@ -14,11 +14,11 @@ /* * Locking: * - the page->pte.chain is protected by the PG_chainlock bit, - * which nests within the zone->lru_lock, then the - * mm->page_table_lock, and then the page lock. 
+ * which nests within the zone->lru_lock, then the pte_page_lock, + * and then the page lock. * - because swapout locking is opposite to the locking order * in the page fault path, the swapout path uses trylocks - * on the mm->page_table_lock + * on the pte_page_lock. */ #include #include @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -47,11 +48,17 @@ */ #define NRPTE ((L1_CACHE_BYTES - sizeof(void *))/sizeof(pte_addr_t)) +struct mm_chain { + struct mm_chain *next; + struct mm_struct *mm; +}; + struct pte_chain { struct pte_chain *next; pte_addr_t ptes[NRPTE]; } ____cacheline_aligned; +kmem_cache_t *mm_chain_cache; kmem_cache_t *pte_chain_cache; /* @@ -75,6 +82,25 @@ kmem_cache_t *pte_chain_cache; ** VM stuff below this comment **/ +static inline struct mm_chain *mm_chain_alloc(void) +{ + struct mm_chain *ret; + + ret = kmem_cache_alloc(mm_chain_cache, GFP_ATOMIC); + return ret; +} + +static void mm_chain_free(struct mm_chain *mc, + struct mm_chain *prev_mc, struct ptpage *ptepage) +{ + if (prev_mc) + prev_mc->next = mc->next; + else if (ptepage) + ptepage->pte.mmchain = mc->next; + + kmem_cache_free(mm_chain_cache, mc); +} + /** * page_referenced - test if the page was referenced * @page: the page to test @@ -86,6 +112,89 @@ kmem_cache_t *pte_chain_cache; * If the page has a single-entry pte_chain, collapse that back to a PageDirect * representation. This way, it's only done under memory pressure. */ +static inline int +page_referenced_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long loffset; + unsigned long address; + int referenced = 0; + + loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + if (loffset < vma->vm_pgoff) + goto out; + + address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT); + + if (address >= vma->vm_end) + goto out; + + if (!spin_trylock(&mm->page_table_lock)) { + referenced = 1; + goto out; + } + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) { + goto out_unlock; + } + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) { + goto out_unlock; + } + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) { + goto out_unmap; + } + if (page_to_pfn(page) != pte_pfn(*pte)) { + goto out_unmap; + } + if (ptep_test_and_clear_young(pte)) + referenced++; +out_unmap: + pte_unmap(pte); + +out_unlock: + spin_unlock(&mm->page_table_lock); + +out: + return referenced; +} + +static int +page_referenced_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int referenced = 0; + + if (!page->pte.mapcount) + return 0; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return 1; + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + referenced += page_referenced_obj_one(vma, page); + } + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + referenced += page_referenced_obj_one(vma, page); + } + + up(&mapping->i_shared_sem); + + return referenced; +} + int page_referenced(struct page * page) { struct pte_chain * pc; @@ -94,6 +203,10 @@ int page_referenced(struct page * page) if (TestClearPageReferenced(page)) referenced++; + if (!PageAnon(page)) { + referenced += page_referenced_obj(page); + goto out; + } if (PageDirect(page)) { pte_t *pte = rmap_ptep_map(page->pte.direct); if (ptep_test_and_clear_young(pte)) @@ -127,16 +240,144 @@ int page_referenced(struct page * page) 
__pte_chain_free(pc); } } +out: return referenced; } +/* + * pgtable_add_rmap_locked - Add an mm_struct to the chain for a pte page. + * @ptepage: The pte page to add the mm_struct to + * @mm: The mm_struct to add + * @address: The address of the page we're mapping + * + * Pte pages maintain a chain of mm_structs that use it. This adds a new + * mm_struct to the chain. + * + * This function must be called with the pte_page_lock held for the page + */ +void pgtable_add_rmap_locked(struct ptpage * ptepage, struct mm_struct * mm, + unsigned long address) +{ + struct mm_chain *mc; + +#ifdef BROKEN_PPC_PTE_ALLOC_ONE + /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */ + extern int mem_init_done; + + if (!mem_init_done) + return; +#endif +#ifdef RMAP_DEBUG + BUG_ON(mm == NULL); + BUG_ON(!PagePtepage(ptepage)); +#endif + + if (PageDirect(ptepage)) { + mc = mm_chain_alloc(); + mc->mm = ptepage->pte.mmdirect; + mc->next = NULL; + ptepage->pte.mmchain = mc; + ClearPageDirect(ptepage); + } + if (ptepage->pte.mmchain) { + /* Hook up the mm_chain to the page. */ + mc = mm_chain_alloc(); + mc->mm = mm; + mc->next = ptepage->pte.mmchain; + ptepage->pte.mmchain = mc; + } else { + ptepage->pte.mmdirect = mm; + SetPageDirect(ptepage); + ptepage->virtual = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); + } +} + +/* + * pgtable_remove_rmap_locked - Remove an mm_struct from the chain for a pte page. + * @ptepage: The pte page to remove the mm_struct from + * @mm: The mm_struct to remove + * + * Pte pages maintain a chain of mm_structs that use it. This removes an + * mm_struct from the chain. + * + * This function must be called with the pte_page_lock held for the page + */ +void pgtable_remove_rmap_locked(struct ptpage *ptepage, struct mm_struct *mm) +{ + struct mm_chain * mc, * prev_mc = NULL; + +#ifdef DEBUG_RMAP + BUG_ON(mm == NULL); + BUG_ON(!PagePtepage(ptepage)); +#endif + + if (PageDirect(ptepage)) { + if (ptepage->pte.mmdirect == mm) { + ptepage->pte.mmdirect = NULL; + ClearPageDirect(ptepage); + ptepage->virtual = 0; + goto out; + } + } else { +#ifdef DEBUG_RMAP + BUG_ON(ptepage->pte.mmchain->next == NULL); +#endif + for (mc = ptepage->pte.mmchain; mc; prev_mc = mc, mc = mc->next) { + if (mc->mm == mm) { + mm_chain_free(mc, prev_mc, ptepage); + /* Check whether we can convert to direct */ + mc = ptepage->pte.mmchain; + if (!mc->next) { + ptepage->pte.mmdirect = mc->mm; + SetPageDirect(ptepage); + mm_chain_free(mc, NULL, NULL); + } + goto out; + } + } + } + BUG(); +out: + return; +} + +/* + * pgtable_add_rmap - Add an mm_struct to the chain for a pte page. + * @ptepage: The pte page to add the mm_struct to + * @mm: The mm_struct to add + * @address: The address of the page we're mapping + * + * This is a wrapper for pgtable_add_rmap_locked that takes the lock + */ +void pgtable_add_rmap(struct ptpage *ptepage, struct mm_struct *mm, + unsigned long address) +{ + pte_page_lock(ptepage); + pgtable_add_rmap_locked(ptepage, mm, address); + pte_page_unlock(ptepage); +} + +/* + * pgtable_remove_rmap_locked - Remove an mm_struct from the chain for a pte page. 
+ * @ptepage: The pte page to remove the mm_struct from + * @mm: The mm_struct to remove + * + * This is a wrapper for pgtable_remove_rmap_locked that takes the lock + */ +void pgtable_remove_rmap(struct ptpage *ptepage, struct mm_struct *mm) +{ + pte_page_lock(ptepage); + pgtable_remove_rmap_locked(ptepage, mm); + pte_page_unlock(ptepage); +} + /** * page_add_rmap - add reverse mapping entry to a page * @page: the page to add the mapping to * @ptep: the page table entry mapping this page * * Add a new pte reverse mapping to a page. - * The caller needs to hold the mm->page_table_lock. + * The caller needs to hold the pte_page_lock. */ struct pte_chain * page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain) @@ -150,8 +391,7 @@ page_add_rmap(struct page *page, pte_t * BUG(); if (!pte_present(*ptep)) BUG(); - if (!ptep_to_mm(ptep)) - BUG(); + BUG_ON(PagePtepage(page)); #endif if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) @@ -159,6 +399,18 @@ page_add_rmap(struct page *page, pte_t * pte_chain_lock(page); + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + inc_page_state(nr_mapped); + page->pte.mapcount++; + pte_chain_unlock(page); + return pte_chain; + } + #ifdef DEBUG_RMAP /* * This stuff needs help to get up to highmem speed. @@ -169,12 +421,15 @@ page_add_rmap(struct page *page, pte_t * if (page->pte.direct == pte_paddr) BUG(); } else { + int count = 0; for (pc = page->pte.chain; pc; pc = pc->next) { - for (i = 0; i < NRPTE; i++) { + for (i = 0; i < NRPTE; i++, count++) { pte_addr_t p = pc->ptes[i]; - if (p && p == pte_paddr) + if (p && p == pte_paddr) { + printk(KERN_ERR "page_add_rmap: page %08lx (count %d), ptep %08lx, rmap count %d\n", page, page_count(page), ptep, count); BUG(); + } } } } @@ -231,7 +486,7 @@ out: * Removes the reverse mapping from the pte_chain of the page, * after that the caller can clear the page table entry and free * the page. - * Caller needs to hold the mm->page_table_lock. + * Caller needs to hold the pte_page_lock. */ void page_remove_rmap(struct page * page, pte_t * ptep) { @@ -245,8 +500,26 @@ void page_remove_rmap(struct page * page if (!page_mapped(page)) return; /* remap_page_range() from a driver? */ +#ifdef DEBUG_RMAP + BUG_ON(PagePtepage(page)); +#endif + pte_chain_lock(page); + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + BUG(); + page->pte.mapcount--; + if (!page->pte.mapcount) + dec_page_state(nr_mapped); + pte_chain_unlock(page); + return; + } + if (PageDirect(page)) { if (page->pte.direct == pte_paddr) { page->pte.direct = 0; @@ -310,6 +583,234 @@ out: return; } +static int pgtable_check_mlocked_mm(struct mm_struct *mm, unsigned long address) +{ + struct vm_area_struct *vma; + int ret = SWAP_SUCCESS; + + /* + * If this mm is in the process of exiting, skip this page + * for now to let the exit finish. + */ + if (atomic_read(&mm->mm_users) == 0) { + ret = SWAP_AGAIN; + goto out; + } + + /* During mremap, it's possible pages are not in a VMA. */ + vma = find_vma(mm, address); + if (!vma) { + ret = SWAP_FAIL; + goto out; + } + + /* The page is mlock()d, we cannot swap it out. 
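The pgtable_add_rmap_locked()/pgtable_remove_rmap_locked() helpers above keep, for each pte page, either a single direct mm pointer or an mm_chain list once there is more than one user, folding back to the direct form when only one user remains. The standalone model below mimics that bookkeeping; it is illustrative only, its names only loosely mirror the patch, and it drops the PagePtepage and locking checks of the real code.

#include <stdio.h>
#include <stdlib.h>

struct mm;				/* opaque stand-in for struct mm_struct */

struct chain {
	struct chain *next;
	struct mm *mm;
};

struct pt_rmap {
	int direct;			/* models the PageDirect flag */
	union {
		struct mm *mmdirect;
		struct chain *mmchain;
	} u;
};

static void rmap_add(struct pt_rmap *p, struct mm *mm)
{
	struct chain *mc;

	if (p->direct) {		/* second user: convert direct -> chain */
		mc = malloc(sizeof(*mc));
		if (!mc)
			abort();
		mc->mm = p->u.mmdirect;
		mc->next = NULL;
		p->u.mmchain = mc;
		p->direct = 0;
	}
	if (p->u.mmchain) {		/* push the new user onto the chain */
		mc = malloc(sizeof(*mc));
		if (!mc)
			abort();
		mc->mm = mm;
		mc->next = p->u.mmchain;
		p->u.mmchain = mc;
	} else {			/* first user: keep it direct */
		p->u.mmdirect = mm;
		p->direct = 1;
	}
}

static void rmap_remove(struct pt_rmap *p, struct mm *mm)
{
	struct chain *mc, *prev = NULL;

	if (p->direct && p->u.mmdirect == mm) {
		p->u.mmdirect = NULL;
		p->direct = 0;
		return;
	}
	for (mc = p->u.mmchain; mc; prev = mc, mc = mc->next) {
		if (mc->mm != mm)
			continue;
		if (prev)
			prev->next = mc->next;
		else
			p->u.mmchain = mc->next;
		free(mc);
		/* fold a single remaining entry back to the direct form */
		mc = p->u.mmchain;
		if (mc && !mc->next) {
			p->u.mmdirect = mc->mm;
			p->direct = 1;
			free(mc);
		}
		return;
	}
}

int main(void)
{
	struct pt_rmap p = { 0 };
	struct mm *a = (struct mm *)0x1, *b = (struct mm *)0x2;

	rmap_add(&p, a);
	rmap_add(&p, b);
	rmap_remove(&p, b);
	printf("direct=%d\n", p.direct);	/* back to one direct user */
	return 0;
}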
*/ + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + } +out: + return ret; +} + +static int pgtable_check_mlocked(struct ptpage *ptepage, unsigned long address) +{ + struct mm_chain *mc; + int ret = SWAP_SUCCESS; + +#ifdef DEBUG_RMAP + BUG_ON(!PagePtepage(ptepage)); +#endif + if (PageDirect(ptepage)) { + ret = pgtable_check_mlocked_mm(ptepage->pte.mmdirect, address); + goto out; + } + + for (mc = ptepage->pte.mmchain; mc; mc = mc->next) { +#ifdef DEBUG_RMAP + BUG_ON(mc->mm == NULL); +#endif + ret = pgtable_check_mlocked_mm(mc->mm, address); + if (ret != SWAP_SUCCESS) + goto out; + } +out: + return ret; +} + +/** + * pgtable_unmap_one_mm - Decrement the rss count and flush for an mm_struct + * @mm: - the mm_struct to decrement + * @address: - The address of the page we're removing + * + * All pte pages keep a chain of mm_struct that are using it. This does a flush + * of the address for that mm_struct and decrements the rss count. + */ +static int pgtable_unmap_one_mm(struct mm_struct *mm, unsigned long address) +{ + struct vm_area_struct *vma; + int ret = SWAP_SUCCESS; + + /* During mremap, it's possible pages are not in a VMA. */ + vma = find_vma(mm, address); + if (!vma) { + ret = SWAP_FAIL; + goto out; + } + flush_tlb_page(vma, address); + flush_cache_page(vma, address); + mm->rss--; + +out: + return ret; +} + +/** + * pgtable_unmap_one - Decrement all rss counts and flush caches for a pte page + * @ptepage: the pte page to decrement the count for + * @address: the address of the page we're removing + * + * This decrements the rss counts of all mm_structs that map this pte page + * and flushes the tlb and cache for these mm_structs and address + */ +static int pgtable_unmap_one(struct ptpage *ptepage, unsigned long address) +{ + struct mm_chain *mc; + int ret = SWAP_SUCCESS; + +#ifdef DEBUG_RMAP + BUG_ON(!PagePtepage(ptepage)); +#endif + + if (PageDirect(ptepage)) { + ret = pgtable_unmap_one_mm(ptepage->pte.mmdirect, address); + if (ret != SWAP_SUCCESS) + goto out; + } else for (mc = ptepage->pte.mmchain; mc; mc = mc->next) { + ret = pgtable_unmap_one_mm(mc->mm, address); + if (ret != SWAP_SUCCESS) + goto out; + } +out: + return ret; +} + +static inline int +try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + pte_t pteval; + struct ptpage *ptepage; + unsigned long loffset; + unsigned long address; + int ret = SWAP_SUCCESS; + + loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + if (loffset < vma->vm_pgoff) + goto out; + + address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT); + + if (address >= vma->vm_end) + goto out; + + if (!spin_trylock(&mm->page_table_lock)) { + ret = SWAP_AGAIN; + goto out; + } + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) { + goto out_unlock; + } + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) { + goto out_unlock; + } + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) { + goto out_unmap; + } + if (page_to_pfn(page) != pte_pfn(*pte)) { + goto out_unmap; + } + ptepage = pmd_ptpage(*pmd); + if (!pte_page_trylock(ptepage)) { + ret = SWAP_AGAIN; + goto out_unmap; + } + + ret = pgtable_check_mlocked(ptepage, address); + if (ret != SWAP_SUCCESS) + goto out_unlock_pt; + + pteval = ptep_get_and_clear(pte); + ret = pgtable_unmap_one(ptepage, address); + if (ret != SWAP_SUCCESS) { + set_pte(pte, pteval); + goto out_unlock_pt; + } + + if (pte_dirty(pteval)) + set_page_dirty(page); + + if (!page->pte.mapcount) 
+ BUG(); + + ptepage->mapcount--; + page->pte.mapcount--; + page_cache_release(page); + +out_unlock_pt: + pte_page_unlock(ptepage); + +out_unmap: + pte_unmap(pte); + +out_unlock: + spin_unlock(&mm->page_table_lock); + +out: + return ret; +} + +static int +try_to_unmap_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int ret = SWAP_SUCCESS; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return SWAP_AGAIN; + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + ret = try_to_unmap_obj_one(vma, page); + if (ret != SWAP_SUCCESS) + goto out; + } + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + ret = try_to_unmap_obj_one(vma, page); + if (ret != SWAP_SUCCESS) + goto out; + } + + if (page->pte.mapcount) + BUG(); + +out: + up(&mapping->i_shared_sem); + return ret; +} + /** * try_to_unmap_one - worker function for try_to_unmap * @page: page to unmap @@ -322,67 +823,59 @@ out: * zone->lru_lock page_launder() * page lock page_launder(), trylock * pte_chain_lock page_launder() - * mm->page_table_lock try_to_unmap_one(), trylock + * pte_page_lock try_to_unmap_one(), trylock */ static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t)); static int try_to_unmap_one(struct page * page, pte_addr_t paddr) { pte_t *ptep = rmap_ptep_map(paddr); - unsigned long address = ptep_to_address(ptep); - struct mm_struct * mm = ptep_to_mm(ptep); - struct vm_area_struct * vma; pte_t pte; + struct ptpage *ptepage = (struct ptpage *)kmap_atomic_to_page(ptep); + unsigned long address = ptep_to_address(ptep); int ret; - if (!mm) - BUG(); - - /* - * We need the page_table_lock to protect us from page faults, - * munmap, fork, etc... - */ - if (!spin_trylock(&mm->page_table_lock)) { +#ifdef DEBUG_RMAP + BUG_ON(!PagePtepage(ptepage)); +#endif + if (!pte_page_trylock(ptepage)) { rmap_ptep_unmap(ptep); return SWAP_AGAIN; } - - /* During mremap, it's possible pages are not in a VMA. */ - vma = find_vma(mm, address); - if (!vma) { - ret = SWAP_FAIL; - goto out_unlock; - } - - /* The page is mlock()d, we cannot swap it out. */ - if (vma->vm_flags & VM_LOCKED) { - ret = SWAP_FAIL; + ret = pgtable_check_mlocked(ptepage, address); + if (ret != SWAP_SUCCESS) goto out_unlock; - } - /* Nuke the page table entry. */ - flush_cache_page(vma, address); pte = ptep_get_and_clear(ptep); - flush_tlb_page(vma, address); + ret = pgtable_unmap_one(ptepage, address); + if (ret != SWAP_SUCCESS) { + set_pte(ptep, pte); + goto out_unlock; + } /* Store the swap location in the pte. See handle_pte_fault() ... */ if (PageSwapCache(page)) { swp_entry_t entry = { .val = page->index }; swap_duplicate(entry); set_pte(ptep, swp_entry_to_pte(entry)); + increment_swapcount(ptepage); } + ptepage->mapcount--; + pte_page_unlock(ptepage); /* Move the dirty bit to the physical page now the pte is gone. 
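
Unlike the pte_chain path, try_to_unmap_obj_one() above locates the user address of a file page purely from the vma and the page's index: the page's file offset in PAGE_SIZE units is page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT), and the vma is skipped when that offset falls outside the range it maps. Restating that arithmetic as a stand-alone helper (vma_address() is hypothetical and shown only for illustration; the patch open-codes the calculation inside try_to_unmap_obj_one()):

/* Return the user virtual address at which @page appears in @vma,
 * or -EFAULT when the vma does not cover the page's file offset.
 * Same arithmetic as try_to_unmap_obj_one() in this patch. */
static unsigned long vma_address(struct vm_area_struct *vma, struct page *page)
{
	unsigned long loffset = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	unsigned long address;

	if (loffset < vma->vm_pgoff)
		return -EFAULT;
	address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT);
	if (address >= vma->vm_end)
		return -EFAULT;
	return address;
}
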
*/ if (pte_dirty(pte)) set_page_dirty(page); - mm->rss--; page_cache_release(page); ret = SWAP_SUCCESS; + goto out; out_unlock: + pte_page_unlock(ptepage); + +out: rmap_ptep_unmap(ptep); - spin_unlock(&mm->page_table_lock); return ret; } @@ -414,6 +907,11 @@ int try_to_unmap(struct page * page) if (!page->mapping) BUG(); + if (!PageAnon(page)) { + ret = try_to_unmap_obj(page); + goto out; + } + if (PageDirect(page)) { ret = try_to_unmap_one(page, page->pte.direct); if (ret == SWAP_SUCCESS) { @@ -478,6 +976,58 @@ out: } /** + * increment_rss - increment the rss count by one + * @ptepage: The pte page that's getting a new paged mapped + * + * Since mapping a page into a pte page can increment the rss + * for multiple mm_structs, this function iterates through all + * the mms and increments them. It also keeps an rss count + * per pte page. + */ +void increment_rss(struct ptpage *ptepage) +{ + struct mm_chain *mc; + + if (PageDirect(ptepage)) + ptepage->pte.mmdirect->rss++; + else for (mc = ptepage->pte.mmchain; mc; mc = mc->next) + mc->mm->rss++; + + ptepage->mapcount++; +} + +/** + * decrement_rss - decrement the rss count by one + * @ptepage: The pte page that's unmapping a page + * + * Since unmapping a page can decrement the rss + * for multiple mm_structs, this function iterates through all + * the mms and decrements them. It also keeps an rss count + * per pte page. + */ +void decrement_rss(struct ptpage *ptepage) +{ + struct mm_chain *mc; + + if (PageDirect(ptepage)) + ptepage->pte.mmdirect->rss--; + else for (mc = ptepage->pte.mmchain; mc; mc = mc->next) + mc->mm->rss--; + + ptepage->mapcount--; +} + +void increment_swapcount(struct ptpage *ptepage) +{ + ptepage->swapcount++; +} + +void decrement_swapcount(struct ptpage *ptepage) +{ + ptepage->swapcount--; +} + +/** ** No more VM stuff below this comment, only pte_chain helper ** functions. 
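
The pte_chain_init() hunk below creates an mm_chain_cache slab next to the existing pte_chain_cache. The matching mm_chain_alloc()/mm_chain_free() helpers called by pgtable_add_rmap_locked() and pgtable_remove_rmap_locked() are not visible in the hunks quoted here; a minimal sketch consistent with how they are called might look as follows (the GFP_ATOMIC flag and the exact unlinking behaviour are assumptions; the three-argument form of the free matches the mm_chain_free(mc, prev_mc, ptepage) call sites above):

static kmem_cache_t *mm_chain_cache;	/* created in pte_chain_init() below */

static inline struct mm_chain *mm_chain_alloc(void)
{
	/* Call sites hold the pte_page_lock, so no sleeping allocation. */
	return kmem_cache_alloc(mm_chain_cache, GFP_ATOMIC);
}

static void mm_chain_free(struct mm_chain *mc, struct mm_chain *prev_mc,
			  struct ptpage *ptepage)
{
	/* Unlink from whichever place the caller says this entry hangs off. */
	if (prev_mc)
		prev_mc->next = mc->next;
	else if (ptepage)
		ptepage->pte.mmchain = mc->next;
	kmem_cache_free(mm_chain_cache, mc);
}
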
**/ @@ -543,6 +1093,17 @@ struct pte_chain *pte_chain_alloc(int gf void __init pte_chain_init(void) { + + mm_chain_cache = kmem_cache_create( "mm_chain", + sizeof(struct mm_chain), + 0, + 0, + NULL, + NULL); + + if (!mm_chain_cache) + panic("failed to create mm_chain cache!\n"); + pte_chain_cache = kmem_cache_create( "pte_chain", sizeof(struct pte_chain), 0, diff -urpN -X /home/fletch/.diff.exclude 001-bk10/mm/swapfile.c 900-mjb5/mm/swapfile.c --- 001-bk10/mm/swapfile.c Thu Jan 9 19:16:15 2003 +++ 900-mjb5/mm/swapfile.c Sun Mar 16 13:39:06 2003 @@ -21,8 +21,10 @@ #include #include #include +#include #include +#include #include spinlock_t swaplock = SPIN_LOCK_UNLOCKED; @@ -379,7 +381,7 @@ void free_swap_and_cache(swp_entry_t ent */ /* mmlist_lock and vma->vm_mm->page_table_lock are held */ static void -unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, +unuse_pte(struct vm_area_struct *vma, struct ptpage *ptepage, pte_t *dir, swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) { pte_t pte = *dir; @@ -390,9 +392,11 @@ unuse_pte(struct vm_area_struct *vma, un return; get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + SetPageAnon(page); *pte_chainp = page_add_rmap(page, dir, *pte_chainp); + increment_rss(ptepage); + decrement_swapcount(ptepage); swap_free(entry); - ++vma->vm_mm->rss; } /* mmlist_lock and vma->vm_mm->page_table_lock are held */ @@ -400,6 +404,7 @@ static void unuse_pmd(struct vm_area_str unsigned long address, unsigned long size, unsigned long offset, swp_entry_t entry, struct page* page) { + struct ptpage *ptepage; pte_t * pte; unsigned long end; struct pte_chain *pte_chain = NULL; @@ -411,6 +416,8 @@ static void unuse_pmd(struct vm_area_str pmd_clear(dir); return; } + ptepage = pmd_ptpage(*dir); + pte_page_lock(ptepage); pte = pte_offset_map(dir, address); offset += address & PMD_MASK; address &= ~PMD_MASK; @@ -423,11 +430,11 @@ static void unuse_pmd(struct vm_area_str */ if (pte_chain == NULL) pte_chain = pte_chain_alloc(GFP_ATOMIC); - unuse_pte(vma, offset+address-vma->vm_start, - pte, entry, page, &pte_chain); + unuse_pte(vma, ptepage, pte, entry, page, &pte_chain); address += PAGE_SIZE; pte++; } while (address && (address < end)); + pte_page_unlock(ptepage); pte_unmap(pte - 1); pte_chain_free(pte_chain); } diff -urpN -X /home/fletch/.diff.exclude 001-bk10/net/core/dev.c 900-mjb5/net/core/dev.c --- 001-bk10/net/core/dev.c Wed Mar 5 07:37:08 2003 +++ 900-mjb5/net/core/dev.c Sun Mar 16 13:39:02 2003 @@ -1250,8 +1250,6 @@ int netif_rx(struct sk_buff *skb) struct softnet_data *queue; unsigned long flags; - if (!skb->stamp.tv_sec) - do_gettimeofday(&skb->stamp); /* * The code is rearranged so that the path is the most @@ -1261,6 +1259,13 @@ int netif_rx(struct sk_buff *skb) this_cpu = smp_processor_id(); queue = &softnet_data[this_cpu]; + if (skb->dev->rx_hook) + goto rx_hook; +rx_hook_continue: + + if (!skb->stamp.tv_sec ) + do_gettimeofday(&skb->stamp); + netdev_rx_stat[this_cpu].total++; if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { if (queue->input_pkt_queue.qlen) { @@ -1303,6 +1308,15 @@ drop: kfree_skb(skb); return NET_RX_DROP; +rx_hook: + { + int ret; + + ret = skb->dev->rx_hook(skb); + if (ret == NET_RX_DROP) + goto drop; + goto rx_hook_continue; + } } /* Deliver skb to an old protocol, which is not threaded well diff -urpN -X /home/fletch/.diff.exclude 001-bk10/scripts/schedcapture 900-mjb5/scripts/schedcapture --- 001-bk10/scripts/schedcapture Wed Dec 31 16:00:00 1969 +++ 
900-mjb5/scripts/schedcapture Sun Mar 16 13:38:53 2003 @@ -0,0 +1,6 @@ +while true +do + cat /proc/schedstat + echo + sleep 20 +done diff -urpN -X /home/fletch/.diff.exclude 001-bk10/scripts/schedstat 900-mjb5/scripts/schedstat --- 001-bk10/scripts/schedstat Wed Dec 31 16:00:00 1969 +++ 900-mjb5/scripts/schedstat Sun Mar 16 13:38:53 2003 @@ -0,0 +1,168 @@ +#!/usr/bin/perl + +$slice = 20; # seconds +while (<>) { + @curr = split; + if ($curr[0] =~ /cpu(\d)/) { + $per_cpu_curr[$1] = [ @curr ]; + $max_cpu = $1 if ($1 > $max_cpu); + next; + } + next if (/^$/); + if ($curr[0] eq "version") { + if ($curr[1] != 2) { + die "Version mismatch. Update this tool.\n"; + } + next; + } + # + # format of line in /proc/schedstat + # + # tag 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 + # + # tag is "cpuN" or "cpu". Right now, we ignore "cpuN" lines (this tool + # doesn't collate per-cpu statistics, although it would be trivial to + # do so.) + # + # version == 1 + # NOTE: the active queue is considered empty if it has only one process + # in it, since obviously the process calling sched_yield is that process. + # + # First four are sched_yield statistics: + # 1) # of times both the active and the expired queue were empty + # 2) # of times just the active queue was empty + # 3) # of times just the expired queue was empty + # 4) # of times sched_yield() was called + # + # Next two are schedule() statistics: + # 5) # of times the active queue had at least one other process on it. + # 6) # of times we switched to the expired queue and reused it + # 7) # of times schedule() was called + # + # Next seven are statistics dealing with load balancing: + # 8) # of times load_balance was called at an idle tick + # 9) # of times load_balance was called at an busy tick + # 10) # of times load_balance was called from schedule() + # 11) # of times load_balance was called + # 12) sum of imbalances discovered (if any) with each call to + # load_balance + # 13) # of times load_balance was called when we did not find a + # "busiest" queue + # 14) # of times load_balance was called from balance_node() + # + # Next four are statistics dealing with pull_task(): + # 15) # of times pull_task was called at an idle tick + # 16) # of times pull_task was called at an busy tick + # 17) # of times pull_task was called from schedule() + # 18) # of times pull_task was called + # + # Next two are statistics dealing with balance_node(): + # 19) # of times balance_node was called + # 20) # of times balance_node was called at an idle tick + # + #$curr[7] = $sched_cnt; + foreach $i (1..20) { + $diff[$i] = $curr[$i] - $prev[$i]; + } + + for ($cpu = 0; $cpu <= $max_cpu; $cpu++) { + @arr_curr = @{$per_cpu_curr[$cpu]}; + @arr_prev = @{$per_cpu_prev[$cpu]}; + foreach $i (1..20) { + $arr_diff[$i] = $arr_curr[$i] - $arr_prev[$i]; + } + $per_cpu_diff[$cpu] = [ @arr_diff ]; + } + + #for ($cpu = 0; $cpu <= $max_cpu; $cpu++) { +# print "@{$per_cpu_curr[$cpu]}\n"; +# } +# print "@curr\n"; + printf "%02d:%02d:%02d--------------------------------------------------------------\n", + $tick*$slice/3600, ($tick*$slice/60)%60, ($tick*$slice)%60; + + # + # sched_yield() stats + # + printf " %7d sys_sched_yield()\n", $diff[4]; + printf " %7d(%6.2f%%) found (only) active queue empty on current cpu\n", + $diff[2]-$diff[1], $diff[4] ? (100*($diff[2]-$diff[1])/$diff[4]) : 0; + printf " %7d(%6.2f%%) found (only) expired queue empty on current cpu\n", + $diff[3], $diff[4] ? 
(100*$diff[3]/$diff[4]) : 0; + printf " %7d(%6.2f%%) found both queues empty on current cpu\n", + $diff[1], $diff[4] ? (100*$diff[1]/$diff[4]) : 0; + printf " %7d(%6.2f%%) found neither queue empty on current cpu\n\n", + $diff[4]-($diff[3]+$diff[2]), + $diff[4] ? 100*($diff[4]-($diff[3]+$diff[2]))/$diff[4] : 0; + + # + # schedule() stats + # + printf " %7d schedule()\n", $diff[7]; + printf " %7d(%6.2f%%) switched active and expired queues\n", + $diff[6], $diff[7] ? (100*$diff[6]/$diff[7]) : 0; + printf " %7d(%6.2f%%) used existing active queue\n\n", + $diff[5]-$diff[6], $diff[7] ? (100*($diff[5]-$diff[6])/$diff[7]) : 0; + + # + # load_balance() stats + # + printf " %7d load_balance()\n", $diff[11]; + printf " %7d(%6.2f%%) called while idle\n", $diff[8], + 100*$diff[8]/$diff[11]; + printf " %7d(%6.2f%%) called while busy\n", $diff[9], + 100*($diff[9])/$diff[11]; + printf " %7d(%6.2f%%) called from schedule()\n", $diff[10], + 100*$diff[10]/$diff[11]; + printf " %7d(%6.2f%%) called from balance_node()\n", $diff[14], + 100*$diff[14]/$diff[11]; + printf " %7d no \"busiest\" queue found\n",$diff[13]; + if ($diff[11]-$diff[13]) { + $imbalance = $diff[12] / ($diff[11]-$diff[13]); + if ($imbalance < 10) { + printf " %7.3f average imbalance (over %d)\n", + $imbalance, $diff[11]-$diff[13]; + } elsif ($imbalance < 100) { + printf " %8.2f average imbalance (over %d)\n", + $imbalance, $diff[11]-$diff[13]; + } else { + printf " %9.1f average imbalance (over %d)\n", + $imbalance, $diff[11]-$diff[13]; + } + } + else { + printf " no imbalances\n"; + } + + # + # pull_task() stats + # + print "\n"; + printf " %7d pull_task()\n", $diff[15]; + for ($cpu = 0; $cpu <= $max_cpu; $cpu++) { + @arr = @{$per_cpu_diff[$cpu]}; + if ($arr[15] || $arr[16]) { + printf " %7d/%-7d cpu %d lost/gained task to/from another cpu\n", + $arr[15], $arr[16], $cpu; + } + if ($arr[17] || $arr[18]) { + printf " %7d/%-7d cpu %d lost/gained task to/from another node\n", + $arr[17], $arr[18], $cpu; + } + } + print "\n"; + + # + # balance_node() stats + # + printf " %7d balance_node()\n", $diff[19]; + printf " %7d(%6.2f%%) called while idle\n", $diff[20], + $diff[19] ? 100*$diff[20]/$diff[19] : 0; + printf " %7d(%6.2f%%) called while busy\n", $diff[19] - $diff[20], + $diff[19] ? 100*(($diff[19]-$diff[20]))/$diff[19] : 0; + + printf("\n"); + @prev = @curr; + @per_cpu_prev = @per_cpu_curr; + $tick++; +}
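
Returning to the net/core/dev.c hunk above: netif_rx() now diverts each skb through an optional per-device rx_hook before it is timestamped and queued, and frees the packet when the hook returns NET_RX_DROP; any other return value resumes normal processing. A driver-side sketch of using that hook (my_rx_hook() and my_install_hook() are invented names; the only interface taken from the patch is the dev->rx_hook function pointer and its NET_RX_DROP convention):

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/if_ether.h>

/* Runs in the netif_rx() path, normally interrupt context: no sleeping. */
static int my_rx_hook(struct sk_buff *skb)
{
	if (skb->len < ETH_HLEN)
		return NET_RX_DROP;	/* runt frame: have netif_rx() drop it */
	return NET_RX_SUCCESS;		/* continue normal input processing */
}

static void my_install_hook(struct net_device *dev)
{
	dev->rx_hook = my_rx_hook;	/* field added by this patch */
}

Note that the hook is consulted before the skb->stamp timestamping that netif_rx() performs for queued packets, so the hook itself sees an untimestamped skb.
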