diff -urpN -X /home/fletch/.diff.exclude 000-virgin/Documentation/filesystems/proc.txt 901-mjb1.1/Documentation/filesystems/proc.txt
--- 000-virgin/Documentation/filesystems/proc.txt	Sat Jun 14 18:37:23 2003
+++ 901-mjb1.1/Documentation/filesystems/proc.txt	Wed Aug 13 20:27:51 2003
@@ -37,6 +37,7 @@ Table of Contents
  2.8	/proc/sys/net/ipv4 - IPV4 settings
  2.9	Appletalk
  2.10	IPX
+ 2.11	/proc/sys/sched - scheduler tunables
 ------------------------------------------------------------------------------
 Preface
@@ -1751,6 +1752,104 @@ IPX.
 The /proc/net/ipx_route table holds a list of IPX routes. For each route it
 gives the destination network, the router node (or Directly) and the network
 address of the router (or Connected) for internal networks.
+
+2.11 /proc/sys/sched - scheduler tunables
+-----------------------------------------
+
+Useful knobs for tuning the scheduler live in /proc/sys/sched.
+
+child_penalty
+-------------
+
+Percentage of the parent's sleep_avg that children inherit. sleep_avg is
+a running average of the time a process spends sleeping. Tasks with high
+sleep_avg values are considered interactive and given a higher dynamic
+priority and a larger timeslice. You typically want to set this to some
+value just under 100.
+
+exit_weight
+-----------
+
+When a CPU hog task exits, its parent's sleep_avg is reduced by a factor of
+exit_weight against the exiting task's sleep_avg.
+
+interactive_delta
+-----------------
+
+If a task is "interactive" it is reinserted into the active array after it
+has expired its timeslice, instead of being inserted into the expired array.
+How "interactive" a task must be in order to be deemed interactive is a
+function of its nice value. This interactive limit is scaled linearly by nice
+value and is offset by the interactive_delta.
+
+max_sleep_avg
+-------------
+
+max_sleep_avg is the largest value (in ms) stored for a task's running sleep
+average. The larger this value, the longer a task needs to sleep to be
+considered interactive (maximum interactive bonus is a function of
+max_sleep_avg).
+
+max_timeslice
+-------------
+
+Maximum timeslice, in milliseconds. This is the value given to tasks of the
+highest dynamic priority.
+
+min_timeslice
+-------------
+
+Minimum timeslice, in milliseconds. This is the value given to tasks of the
+lowest dynamic priority. Every task gets at least this slice of the processor
+per array switch.
+
+parent_penalty
+--------------
+
+Percentage of the parent's sleep_avg that it retains across a fork().
+sleep_avg is a running average of the time a process spends sleeping. Tasks
+with high sleep_avg values are considered interactive and given a higher
+dynamic priority and a larger timeslice. Normally, this value is 100 and thus
+tasks retain their sleep_avg on fork. If you want to punish interactive
+tasks for forking, set this below 100.
+
+prio_bonus_ratio
+----------------
+
+Middle percentage of the priority range that tasks can receive as a dynamic
+priority. The default value of 25% ensures that nice values at the
+extremes are still enforced. For example, nice +19 interactive tasks will
+never be able to preempt a nice 0 CPU hog. Setting this higher will increase
+the size of the priority range the tasks can receive as a bonus. Setting
+this lower will decrease this range, making the interactivity bonus less
+apparent and user nice values more applicable.
+
+starvation_limit
+----------------
+
+Sufficiently interactive tasks are reinserted into the active array when they
+run out of timeslice. Normally, tasks are inserted into the expired array.
+Reinserting interactive tasks into the active array allows them to remain
+runnable, which is important for interactive performance. This could starve
+expired tasks, however, since the interactive task could prevent the array
+switch. To keep the tasks on the expired array from starving for too long,
+starvation_limit is the longest time (in milliseconds) we will let the
+expired array starve before we stop reinserting interactive tasks into the
+active array. Higher values give more preference to running interactive
+tasks, at the expense of expired tasks. Lower values provide fairer
+scheduling behavior, at the expense of interactivity.
+
+idle_node_rebalance_ratio
+-------------------------
+
+On NUMA machines, we normally rebalance within nodes, but we also rebalance
+globally every N idle rebalance ticks, where N = idle_node_rebalance_ratio.
+
+busy_node_rebalance_ratio
+-------------------------
+
+On NUMA machines, we normally rebalance within nodes, but we also rebalance
+globally every N busy rebalance ticks, where N = busy_node_rebalance_ratio.
 ------------------------------------------------------------------------------
 Summary
diff -urpN -X /home/fletch/.diff.exclude 000-virgin/Documentation/i386/gdb-serial.txt 901-mjb1.1/Documentation/i386/gdb-serial.txt
--- 000-virgin/Documentation/i386/gdb-serial.txt	Wed Dec 31 16:00:00 1969
+++ 901-mjb1.1/Documentation/i386/gdb-serial.txt	Wed Aug 13 20:29:29 2003
@@ -0,0 +1,386 @@
+Version
+=======
+
+This version of the gdbstub package was developed and tested on
+kernel version 2.3.48. It will not install on a 2.2 kernel. It may
+not work on earlier versions of 2.3 kernels. It is possible that
+it will continue to work on later versions of 2.3 and then
+versions of 2.4 (I hope).
+
+
+Debugging Setup
+===============
+
+Designate one machine as the "development" machine. This is the
+machine on which you run your compiles and which has your source
+code for the kernel. Designate a second machine as the "target"
+machine. This is the machine that will run your experimental
+kernel.
+
+The two machines will be connected together via a serial line out
+one or the other of the COM ports of the PC. You will need a modem
+eliminator and the appropriate cables.
+
+On the DEVELOPMENT machine you need to apply the patch for the gdb
+hooks. You have probably already done that if you are reading this
+file.
+
+On your DEVELOPMENT machine, go to your kernel source directory and
+do "make menuconfig". Go down to the kernel hacking menu item and
+open it up. Enable the kernel gdb stub code by selecting that item.
+
+Save and exit the menuconfig program. Then do "make clean" and
+"make bzImage" (or whatever target you want to make). This gets
+the kernel compiled with the "-g" option set -- necessary for
+debugging.
+
+You have just built the kernel on your DEVELOPMENT machine that you
+intend to run on your TARGET machine.
+
+To install this new kernel, use the following installation procedure.
+Remember, you are on the DEVELOPMENT machine patching the kernel source
+for the kernel that you intend to run on the TARGET machine.
+
+Copy this kernel to your target machine using your usual procedures.
+I usually arrange to copy development:/usr/src/linux/arch/i386/boot/zImage
+to /vmlinuz on the TARGET machine via a LAN based NFS access. That is,
+I run the cp command on the target and copy from the development machine
+via the LAN.
+Run Lilo on the new kernel on the target machine so that it will boot!
+Then boot the kernel on the target machine.
+
+There is a utility program named "gdbstart" in the
+development:/usr/src/linux/arch/i386/kernel directory.
+You should copy this program over to your target machine, probably into
+/sbin. This utility program is run on the target machine to
+activate the kernel hooks for the debugger. It is invoked as follows:
+
+    gdbstart [-s speed] [-t tty-dev]
+    defaults:  /dev/ttyS0 with speed unmodified by gdbstart
+
+Don't run the program just yet. We'll get to that in a bit.
+
+Decide on which tty port you want the machines to communicate, then
+cable them up back-to-back using the null modem. COM1 is /dev/ttyS0
+and COM2 is /dev/ttyS1.
+
+On the DEVELOPMENT machine, create a file called .gdbinit in the
+directory /usr/src/linux. An example .gdbinit file looks like this:
+
+define rmt
+set remotebaud 38400
+target remote /dev/ttyS0
+end
+
+Assuming that you added my gdbinit stuff to your .gdbinit, edit .gdbinit
+and find the section that looks like this:
+
+    define rmt
+    set remotebaud 38400
+    target remote /dev/ttyS0
+    end
+
+Change the "target" definition so that it specifies the tty port that
+you intend to use. Change the "remotebaud" definition to match the
+data rate that you are going to use for the com line.
+
+On the TARGET machine I find it helpful to create a shell script file
+named "debug" in the root home directory with the following contents:
+
+    gdbstart -s 38400 -t /dev/ttyS0 <<EOF
+EOF
+
+This runs the gdbstart program and gives it the carriage return that
+it prompts for. This sets the data rate from the target machine's side.
+
+You are now ready to try it out.
+
+On your TARGET machine, freshly rebooted with your gdbstub-equipped
+kernel, type "debug" in the root home directory. The system will appear
+to hang with some messages on the screen from the debug stub. What
+it is doing is waiting for contact from the development machine.
+
+On your DEVELOPMENT machine, cd /usr/src/linux and enter "gdb vmlinux".
+When gdb gets the symbols loaded and prompts you, enter "rmt" (that's
+the macro from the .gdbinit file that you just edited). If everything
+is working correctly you should see gdb print out a few lines indicating
+that a breakpoint has been taken. It will actually show a line of
+code in the target kernel inside the gdbstub activation code.
+
+The gdb interaction should look something like this:
+
+    linux-dev:/usr/src/linux# gdb vmlinux
+    GDB is free software and you are welcome to distribute copies of it
+    under certain conditions; type "show copying" to see the conditions.
+    There is absolutely no warranty for GDB; type "show warranty" for details.
+    GDB 4.15.1 (i486-slackware-linux),
+    Copyright 1995 Free Software Foundation, Inc...
+    (gdb) rmt
+    breakpoint () at i386-stub.c:750
+    750     }
+    (gdb)
+
+You can now use whatever gdb commands you like to set breakpoints.
+Enter "continue" to start your target machine executing again. At this
+point the target system will run at full speed until it encounters
+your breakpoint or gets a segment violation in the kernel, or whatever.
+
+
+Triggering gdbstub at Kernel Boot Time
+======================================
+
+The gdbstub patch now has the ability for gdb to connect to the kernel during
+bootup (as opposed to waiting for the system to come all the way up and then
+running the gdbstart program on the target machine). This new functionality was
+added by Scott Foehner at SGI.
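On the development machine nothing changes for boot-time debugging: once the
target pauses (using the boot parameters described in the next paragraph), you
connect with the same gdb session shown above. A minimal, illustrative sketch
only -- the breakpoint is just an example, and the "rmt" macro is assumed to
already name the port and speed the stub is using:

    linux-dev:/usr/src/linux# gdb vmlinux
    (gdb) rmt                  # macro from the .gdbinit shown above
    (gdb) break do_fork        # illustrative; set whatever breakpoints you need
    (gdb) continue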
+ +To force a kernel that has been compiled with gdbstub to pause during the boot +process and wait for a connection from gdb, the paramter "gdb" should be passed +to the kernel. This can be done by typing "gdb" after the name of the kernel +on the LILO command line. The patch defaults to use ttyS1 at a baud rate of +38400. These parameters can be changed by using "gdbttyS=" and +"gdbbaud=" on the command line. + +Example: + +LILO boot: linux gdb gdbttyS=1 gdbbaud=38400 + +Note that this command is entered on the TARGET machine as it is booting +the kernel that was compiled on the DEVELOPMENT machine. + +An alternate approach is to place a line in the /etc/lilo.conf file on +your TARGET machine. Under the heading for the kernel that you intend +to boot, place a line that looks like this: + + append = "gdb gdbttyS=1 gdbbaud=38400" + +This will cause the kernel to enter the gdbstub automatically at boot +time. + +BE SURE to run "lilo" after changing the /etc/lilo.conf file. + + +The "gdbstart" Program +===================== + +This utility program is used to set up the com port and data rate +for the connection from the target system to the development system. +Its usage has been described above. + +This version of the patch uses the same tty ioctl for kernel versions +2.0.30 onwards. Thus, the gdbstart utility does not need to be re-compiled +to install the patch in a later version of the kernel. The ioctl added +to the kernel for this purpose is far enough "off the end" of existing +ioctls (as of 2.1.120) that it should not interfere with any new kernel +tty ioctls for quite some time (famous last words). + +The source for the gdbstart program resides in the arch/i386/kernel directory. + + +Debugging hints +=============== + +You can break into the target machine at any time from the development +machine by typing ^C. If the target machine has interrupts enabled +this will stop it in the kernel and enter the debugger. + +There is unfortunately no way of breaking into the kernel if it is +in a loop with interrupts disabled, so if this happens to you then +you need to place exploratory breakpoints or printk's into the kernel +to find out where it is looping. + +There is a copy of an e-mail in the kgdb distribution directory which +describes how to create an NMI on an ISA bus machine using a paper +clip. I have a sophisticated version of this made by wiring a push +button switch into a PC104/ISA bus adapter card. The adapter card +nicely furnishes wire wrap pins for all the ISA bus signals. + +When you are done debugging the kernel on the target machine it is +a good idea to leave it in a running state. This makes reboots +faster, bypassing the fsck. So do a gdb "continue" as the last gdb +command if this is possible. To terminate gdb itself on the development +machine and leave the target machine running, type ^Z to suspend gdb +and then kill it with "kill %1" or something similar. + +If gdbstub Does Not Work +======================== + +If it doesn't work, you will have to troubleshoot it. Do the easy things +first like double checking your cabling and data rates. You might +try some non-kernel based programs to see if the back-to-back connection +works properly. Just something simple like cat /etc/hosts >/dev/ttyS0 +on one machine and cat /dev/ttyS0 on the other will tell you if you +can send data from one machine to the other. There is no point in tearing +out your hair in the kernel if the line doesn't work. 
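The back-to-back test described above is easier to interpret if both ends are
forced to the same settings first. A small sketch, assuming COM1 (/dev/ttyS0)
on both machines and the 38400 data rate used in the earlier examples --
adjust to your cabling and configuration:

    # on the receiving machine
    stty ispeed 38400 ospeed 38400 min 1 time 0 < /dev/ttyS0
    cat /dev/ttyS0

    # on the sending machine
    stty ispeed 38400 ospeed 38400 < /dev/ttyS0
    cat /etc/hosts > /dev/ttyS0

If the contents of /etc/hosts show up on the receiving machine, the cable and
data rates are fine and the problem is on the kernel side.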
+ +All of the real action takes place in the file +/usr/src/linux/arch/i386/kernel/gdbstub.c. That is the code on the target +machine that interacts with gdb on the development machine. In gdb you can +turn on a debug switch with the following command: + + set remotedebug + +This will print out the protocol messages that gdb is exchanging with +the target machine. + +Another place to look is /usr/src/linux/drivers/char/gdbserial.c +That is the code that talks to the serial port on the target side. +There might be a problem there. + +If you are really desperate you can use printk debugging in the +gdbstub code in the target kernel until you get it working. In particular, +there is a global variable in /usr/src/linux/arch/i386/kernel/gdbstub.c +named "remote_debug". Compile your kernel with this set to 1, rather +than 0 and the debug stub will print out lots of stuff as it does +what it does. + + +Debugging Loadable Modules +========================== + +This technique comes courtesy of Edouard Parmelan + + +When you run gdb, enter the command + +source gdbinit-modules + +This will read in a file of gdb macros that was installed in your +kernel source directory with kgdb was installed. This file implements +the following commands: + +mod-list + Lists the loaded modules in the form + +mod-print-symbols + Prints all the symbols in the indicated module. + +mod-add-symbols + Loads the symbols from the object file and associates them + with the indicated module. + +After you have loaded the module that you want to debug, use the command +mod-list to find the of your module. Then use that +address in the mod-add-symbols command to load your module's symbols. +From that point onward you can debug your module as if it were a part +of the kernel. + +The file gdbinit-modules also contains a command named mod-add-lis as +an example of how to construct a command of your own to load your +favorite module. The idea is to "can" the pathname of the module +in the command so you don't have to type so much. + +Threads +======= + +Each process in a target machine is seen as a gdb thread. gdb thread related +commands (info threads, thread n) can be used. + +ia-32 hardware breakpoints +========================== + +gdb stub contains support for hardware breakpoints using debugging features +of ia-32(x86) processors. These breakpoints do not need code modification. +They use debugging registers. 4 hardware breakpoints are available in ia-32 +processors. + +Each hardware breakpoint can be of one of the following three types. +1. Execution breakpoint - An Execution breakpoint is triggered when code at the + breakpoint address is executed. + + As limited number of hardware breakpoints are available, it is advisable + to use software breakpoints ( break command ) instead of execution + hardware breakpoints, unless modification of code is to be avoided. + +2. Write breakpoint - A write breakpoint is triggered when memory location at the + breakpoint address is written. + + A write or can be placed for data of variable length. Length of a write + breakpoint indicates length of the datatype to be watched. Length is 1 + for 1 byte data , 2 for 2 byte data, 3 for 4 byte data. + +3. Access breakpoint - An access breakpoint is triggered when memory location at + the breakpoint address is either read or written. + + Access breakpoints also have lengths similar to write breakpoints. + +IO breakpoints in ia-32 are not supported. 
+ +Since gdb stub at present does not use the protocol used by gdb for hardware +breakpoints, hardware breakpoints are accessed through gdb macros. gdb macros +for hardware breakpoints are described below. + +hwebrk - Places an execution breakpoint + hwebrk breakpointno address +hwwbrk - Places a write breakpoint + hwwbrk breakpointno length address +hwabrk - Places an access breakpoint + hwabrk breakpointno length address +hwrmbrk - Removes a breakpoint + hwrmbrk breakpointno +exinfo - Tells whether a software or hardware breakpoint has occured. + Prints number of the hardware breakpoint if a hardware breakpoint has + occured. + +Arguments required by these commands are as follows +breakpointno - 0 to 3 +length - 1 to 3 +address - Memory location in hex digits ( without 0x ) e.g c015e9bc + +MP support +========== + +When a breakpoint occurs or user issues a break ( Ctrl + C ) to gdb client, +all the processors are forced to enter the debugger. Current thread +corresponds to the thread running on the processor where breakpoint occured. +Threads running on other processor(s) appear similar to other non running +threads in the 'info threads' output. + +ia-32 hardware debugging registers on all processors are set to same values. +Hence any hardware breakpoints may occur on any processor. + +gdb troubleshooting +=================== + +1. gdb hangs +Kill it. restart gdb. Connect to target machine. + +2. gdb cannot connect to target machine (after killing a gdb and restarting +another) +If the target machine was not inside debugger when you killed gdb, gdb cannot +connect because the target machine won't respond. +In this case echo "Ctrl+C"(ascii 3) in the serial line. +e.g. echo -e "\003" > /dev/ttyS1 +This forces that target machine into debugger after which you can connect. + +3. gdb cannot connect even after echoing Ctrl+C into serial line +Try changing serial line settings min to 1 and time to 0 +e.g. stty min 1 time 0 < /dev/ttyS1 +Try echoing again + +check serial line speed and set it to correct value if required +e.g. stty ispeed 115200 ospeed 115200 < /dev/ttyS1 + +Final Items +=========== + +I picked up this code from Dave Grothe and enhanced it. + +If you make some really cool modification to this stuff, or if you +fix a bug, please let me know. + +Amit S. Kale + + +(First kgdb by David Grothe ) + +(modified by Tigran Aivazian ) + Putting gdbstub into the kernel config menu. + +(modified by Scott Foehner ) + Hooks for entering gdbstub at boot time. + +(modified by Amit S. Kale ) + Threads, ia-32 hw debugging, mp support, console support, + nmi watchdog handling. diff -urpN -X /home/fletch/.diff.exclude 000-virgin/Documentation/sysrq.txt 901-mjb1.1/Documentation/sysrq.txt --- 000-virgin/Documentation/sysrq.txt Wed Mar 26 22:54:28 2003 +++ 901-mjb1.1/Documentation/sysrq.txt Wed Aug 13 20:29:29 2003 @@ -77,6 +77,8 @@ On all - write a character to /proc/sys 'l' - Send a SIGKILL to all processes, INCLUDING init. (Your system will be non-functional after this.) +'g' - Enter the kernel debugger (if configured and supported). + 'h' - Will display help ( actually any other key than those listed above will display help. 
but 'h' is easy to remember :-) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/Makefile 901-mjb1.1/Makefile --- 000-virgin/Makefile Wed Aug 13 20:24:17 2003 +++ 901-mjb1.1/Makefile Wed Aug 13 21:09:46 2003 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 0 -EXTRAVERSION = -test3 +EXTRAVERSION = -test3-mjb1 # *DOCUMENTATION* # To see a list of typical targets execute "make help" @@ -76,6 +76,8 @@ HOSTCXX = g++ HOSTCFLAGS = -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer HOSTCXXFLAGS = -O2 +GCOV_FLAGS = -fprofile-arcs -ftest-coverage + # That's our default target when none is given on the command line # Note that 'modules' will be added as a prerequisite as well, @@ -235,6 +237,8 @@ export VERSION PATCHLEVEL SUBLEVEL EXTRA export CPPFLAGS NOSTDINC_FLAGS OBJCOPYFLAGS LDFLAGS export CFLAGS CFLAGS_KERNEL CFLAGS_MODULE export AFLAGS AFLAGS_KERNEL AFLAGS_MODULE +export CFLAGS_NOGCOV + export MODVERDIR := .tmp_versions @@ -326,6 +330,10 @@ ifdef CONFIG_DEBUG_INFO CFLAGS += -g endif +ifdef CONFIG_DEBUG_SYMBOLS +CFLAGS += -g +endif + # # INSTALL_PATH specifies where to place the updated kernel and system map # images. Uncomment if you want to place them anywhere other than root. @@ -534,6 +542,11 @@ depend dep: # --------------------------------------------------------------------------- # Modules +CFLAGS_NOGCOV := $(CFLAGS) +ifdef CONFIG_GCOV_ALL +CFLAGS += $(GCOV_FLAGS) +endif + ifdef CONFIG_MODULES # By default, build modules as well @@ -716,6 +729,7 @@ clean: archclean $(clean-dirs) $(call cmd,rmclean) @find . $(RCS_FIND_IGNORE) \ \( -name '*.[oas]' -o -name '*.ko' -o -name '.*.cmd' \ + -o -name '*.bb' -o -name '*.bbg' -o -name '*.da' \ -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \) \ -type f -print | xargs rm -f diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/Kconfig 901-mjb1.1/arch/i386/Kconfig --- 000-virgin/arch/i386/Kconfig Wed Aug 13 20:24:18 2003 +++ 901-mjb1.1/arch/i386/Kconfig Wed Aug 13 21:05:43 2003 @@ -449,17 +449,17 @@ config NR_CPUS This is purely to save memory - each supported CPU adds approximately eight kilobytes to the kernel image. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. +# config PREEMPT +# bool "Preemptible Kernel" +# help +# This option reduces the latency of the kernel when reacting to +# real-time or interactive events by allowing a low priority process to +# be preempted even if it is in kernel mode executing a system call. +# This allows applications to run more reliably even when the system is +# under load. +# +# Say Y here if you are building a kernel for a desktop, embedded +# or real-time system. Say N if you are unsure. config X86_UP_APIC bool "Local APIC support on uniprocessors" if !SMP @@ -680,6 +680,43 @@ config HIGHMEM64G endchoice +choice + help + On i386, a process can only virtually address 4GB of memory. This + lets you select how much of that virtual space you would like to + devoted to userspace, and how much to the kernel. + + Some userspace programs would like to address as much as possible and + have few demands of the kernel other than it get out of the way. 
These + users may opt to use the 3.5GB option to give their userspace program + as much room as possible. Due to alignment issues imposed by PAE, + the "3.5GB" option is unavailable if "64GB" high memory support is + enabled. + + Other users (especially those who use PAE) may be running out of + ZONE_NORMAL memory. Those users may benefit from increasing the + kernel's virtual address space size by taking it away from userspace, + which may not need all of its space. An indicator that this is + happening is when /proc/Meminfo's "LowFree:" is a small percentage of + "LowTotal:" while "HighFree:" is very large. + + If unsure, say "3GB" + prompt "User address space size" + default 1GB + +config 05GB + bool "3.5 GB" + +config 1GB + bool "3 GB" + +config 2GB + bool "2 GB" + +config 3GB + bool "1 GB" +endchoice + config HIGHMEM bool depends on HIGHMEM64G || HIGHMEM4G @@ -697,6 +734,11 @@ config NUMA default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT) +config NUMA_SCHED + bool "Numa Scheduling Support" + depends on NUMA + default y + # Need comments to help the hapless user trying to turn on NUMA support comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" depends on X86_NUMAQ && (!HIGHMEM64G || !SMP) @@ -782,6 +824,33 @@ config MTRR See for more information. +choice + help + This is unrelated to your processor's speed. This variable alters + how often the system is asked to generate timer interrupts. A larger + value can lead to a more responsive system, but also causes extra + overhead from the increased number of context switches. + + If in doubt, leave it at the default of 1000. + + prompt "Kernel HZ" + default 1000HZ + +config 100HZ + bool "100 Hz" + +config 1000HZ + bool "1000 Hz" +endchoice + +config IRQBALANCE + bool "Enable kernel irq balancing" + depends on SMP + default y + help + The defalut yes will allow the kernel to do irq load balancing. + Saying no will keep the kernel from doing irq load balancing. + config HAVE_DEC_LOCK bool depends on (SMP || PREEMPT) && X86_CMPXCHG @@ -1245,6 +1314,36 @@ source "net/bluetooth/Kconfig" source "arch/i386/oprofile/Kconfig" +menu "GCOV coverage profiling" + +config GCOV_PROFILE + bool "GCOV coverage profiling" + ---help--- + Provide infrastructure for coverage support for the kernel. This + will not compile the kernel by default with the necessary flags. + To obtain coverage information for the entire kernel, one should + enable the subsequent option (Profile entire kernel). If only + particular files or directories of the kernel are desired, then + one must provide the following compile options for such targets: + "-fprofile-arcs -ftest-coverage" in the CFLAGS. To obtain + access to the coverage data one must insmod the gcov-proc kernel + module. + +config GCOV_ALL + bool "GCOV_ALL" + depends on GCOV_PROFILE + ---help--- + If you say Y here, it will compile the entire kernel with coverage + option enabled. + +config GCOV_PROC + tristate "gcov-proc module" + depends on GCOV_PROFILE && PROC_FS + ---help--- + This is the gcov-proc module that exposes gcov data through the + /proc filesystem + +endmenu menu "Kernel hacking" @@ -1254,6 +1353,14 @@ config DEBUG_KERNEL Say Y here if you are developing drivers or trying to debug and identify kernel problems. 
+config DEBUG_SYMBOLS_PROMPT + bool "Get debug symbols (turns on -g)" + depends on DEBUG_KERNEL + +config DEBUG_SYMBOLS + bool + depends on DEBUG_SYMBOLS_PROMPT || X86_REMOTE_DEBUG + config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL @@ -1266,6 +1373,17 @@ config DEBUG_SLAB allocation as well as poisoning memory on free to catch use of freed memory. +config X86_REMOTE_DEBUG + bool "KGDB: Remote (serial) kernel debugging with gdb" + +config KGDB_THREAD + bool "KGDB: Thread analysis" + depends on X86_REMOTE_DEBUG + +config GDB_CONSOLE + bool "KGDB: Console messages through gdb" + depends on X86_REMOTE_DEBUG + config DEBUG_IOVIRT bool "Memory mapped I/O debugging" depends on DEBUG_KERNEL @@ -1291,6 +1409,26 @@ config MAGIC_SYSRQ keys are documented in . Don't say Y unless you really know what this hack does. +config X86_EARLY_PRINTK + bool "Early console support" + default n + depends on DEBUG_KERNEL + help + Write kernel log output directly into the VGA buffer or serial port. + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. For normal operation + it is not recommended because it looks ugly and doesn't cooperate + with klogd/syslogd or the X server.You should normally N here, + unless you want to debug such a crash. + + Syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. + config DEBUG_SPINLOCK bool "Spinlock debugging" depends on DEBUG_KERNEL @@ -1308,6 +1446,15 @@ config DEBUG_PAGEALLOC This results in a large slowdown, but helps to find certain types of memory corruptions. +config SPINLINE + bool "Spinlock inlining" + depends on DEBUG_KERNEL + help + This will change spinlocks from out of line to inline, making them + account cost to the callers in readprofile, rather than the lock + itself (as ".text.lock.filename"). This can be helpful for finding + the callers of locks. + config DEBUG_HIGHMEM bool "Highmem debugging" depends on DEBUG_KERNEL && HIGHMEM @@ -1330,8 +1477,18 @@ config DEBUG_SPINLOCK_SLEEP If you say Y here, various routines which may sleep will become very noisy if they are called with a spinlock held. +config LOCKMETER + bool "Kernel lock metering" + depends on SMP + help + Say Y to enable kernel lock metering, which adds overhead to SMP + locks, but allows you to see various statistics using the lockstat + command + config FRAME_POINTER - bool "Compile the kernel with frame pointers" + bool + default y if X86_REMOTE_DEBUG + default n if !X86_REMOTE_DEBUG help If you say Y here the resulting kernel image will be slightly larger and slower, but it will give very useful debugging information. 
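The GCOV options added in the Kconfig hunk above are used roughly as follows.
This is an illustrative sketch only: the module name, make targets and /proc
location are taken from the help text and from common gcov-kernel usage, not
from verified paths in this patch.

    # enable GCOV_PROFILE (plus GCOV_ALL for whole-kernel coverage) and
    # build GCOV_PROC as a module, then rebuild
    make menuconfig
    make bzImage modules modules_install

    # after booting the instrumented kernel, load the /proc interface
    modprobe gcov-proc
    ls /proc/gcov        # assumed location of the exported coverage data

For coverage of only selected files or directories, leave GCOV_ALL off and add
"-fprofile-arcs -ftest-coverage" to the CFLAGS of just those objects, as the
GCOV_PROFILE help text describes; the top-level Makefile change above removes
the generated *.bb, *.bbg and *.da files on "make clean".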
diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/Makefile 901-mjb1.1/arch/i386/Makefile --- 000-virgin/arch/i386/Makefile Tue Aug 5 20:01:48 2003 +++ 901-mjb1.1/arch/i386/Makefile Wed Aug 13 20:27:43 2003 @@ -97,6 +97,7 @@ drivers-$(CONFIG_OPROFILE) += arch/i386 CFLAGS += $(mflags-y) AFLAGS += $(mflags-y) +AFLAGS_vmlinux.lds.o += -imacros $(TOPDIR)/include/asm-i386/page.h boot := arch/i386/boot diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/boot/compressed/Makefile 901-mjb1.1/arch/i386/boot/compressed/Makefile --- 000-virgin/arch/i386/boot/compressed/Makefile Mon Mar 17 21:43:38 2003 +++ 901-mjb1.1/arch/i386/boot/compressed/Makefile Wed Aug 13 20:51:56 2003 @@ -7,6 +7,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o EXTRA_AFLAGS := -traditional +CFLAGS := $(CFLAGS_NOGCOV) LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 $(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/Makefile 901-mjb1.1/arch/i386/kernel/Makefile --- 000-virgin/arch/i386/kernel/Makefile Wed Aug 13 20:24:18 2003 +++ 901-mjb1.1/arch/i386/kernel/Makefile Wed Aug 13 20:29:29 2003 @@ -17,6 +17,7 @@ obj-$(CONFIG_MCA) += mca.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_X86_REMOTE_DEBUG) += gdbstub.o obj-$(CONFIG_PM) += suspend.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o @@ -31,6 +32,14 @@ obj-$(CONFIG_EDD) += edd.o obj-$(CONFIG_MODULES) += module.o obj-y += sysenter.o vsyscall.o obj-$(CONFIG_ACPI_SRAT) += srat.o + +ifdef CONFIG_X86_REMOTE_DEBUG +GDBSTART=gdbstart +GDBCLEAN= -rm -f gdbstart /sbin/gdbstart +else +GDBSTART= +GDBCLEAN= +endif EXTRA_AFLAGS := -traditional diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/apic.c 901-mjb1.1/arch/i386/kernel/apic.c --- 000-virgin/arch/i386/kernel/apic.c Wed Jul 2 21:59:03 2003 +++ 901-mjb1.1/arch/i386/kernel/apic.c Wed Aug 13 20:51:40 2003 @@ -985,7 +985,7 @@ int setup_profiling_timer(unsigned int m * multiplier is 1 and it can be changed by writing the new multiplier * value into /proc/profile. */ - +extern void calc_load_cpu(int cpu); inline void smp_local_timer_interrupt(struct pt_regs * regs) { int cpu = smp_processor_id(); @@ -1013,6 +1013,7 @@ inline void smp_local_timer_interrupt(st #ifdef CONFIG_SMP update_process_times(user_mode(regs)); + calc_load_cpu(cpu); #endif } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/cpu/common.c 901-mjb1.1/arch/i386/kernel/cpu/common.c --- 000-virgin/arch/i386/kernel/cpu/common.c Wed Aug 13 20:24:18 2003 +++ 901-mjb1.1/arch/i386/kernel/cpu/common.c Wed Aug 13 20:48:49 2003 @@ -454,9 +454,9 @@ void __init early_cpu_init(void) } /* * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. + * initialized (naturally) in the bootstrap process, such as the GDT. + * We reload them nevertheless, this function acts as a 'CPU state barrier', + * nothing should get across. 
*/ void __init cpu_init (void) { @@ -480,8 +480,8 @@ void __init cpu_init (void) } /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: + * Initialize the per-CPU GDTs with the boot equivalents, + * and set up the descriptors: */ if (cpu) { memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE); @@ -494,7 +494,6 @@ void __init cpu_init (void) memcpy(thread->tls_array, cpu_gdt_table[cpu], GDT_ENTRY_TLS_ENTRIES * 8); __asm__ __volatile__("lgdt %0": "=m" (cpu_gdt_descr[cpu])); - __asm__ __volatile__("lidt %0": "=m" (idt_descr)); /* * Delete NT @@ -538,3 +537,31 @@ void __init cpu_init (void) current->used_math = 0; stts(); } + +/* + * copy over the boot node idt across all nodes, we currently only have + * non-unique idt entries for device io interrupts. + */ +void __init setup_node_idts(void) +{ + int node = MAX_NUMNODES; + + /* we can skip setting up node0 since it's done in head.S */ + while (--node) { + memcpy(node_idt_table[node], node_idt_table[0], IDT_SIZE); + node_idt_descr[node].size = IDT_SIZE - 1; + node_idt_descr[node].address = (unsigned long)node_idt_table[node]; + } +} + +void __init setup_cpu_idt(void) +{ + int cpu = smp_processor_id(), node = cpu_to_node(cpu); + + printk(KERN_DEBUG "CPU%d IDT at 0x%08lx\n", + cpu, node_idt_descr[node].address); + + /* reload the idt on all processors as they come up */ + __asm__ __volatile__("lidt %0": "=m" (node_idt_descr[node])); +} + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/doublefault.c 901-mjb1.1/arch/i386/kernel/doublefault.c --- 000-virgin/arch/i386/kernel/doublefault.c Tue Feb 25 23:03:43 2003 +++ 901-mjb1.1/arch/i386/kernel/doublefault.c Wed Aug 13 20:48:49 2003 @@ -16,7 +16,7 @@ static unsigned long doublefault_stack[D static void doublefault_fn(void) { - struct Xgt_desc_struct gdt_desc = {0, 0}; + struct Xdt_desc_struct gdt_desc = {0, 0}; unsigned long gdt, tss; __asm__ __volatile__("sgdt %0": "=m" (gdt_desc): :"memory"); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/entry.S 901-mjb1.1/arch/i386/kernel/entry.S --- 000-virgin/arch/i386/kernel/entry.S Tue Aug 5 20:01:41 2003 +++ 901-mjb1.1/arch/i386/kernel/entry.S Wed Aug 13 20:51:50 2003 @@ -49,6 +49,10 @@ #include #include "irq_vectors.h" +#ifndef CONFIG_KGDB_THREAD +#define user_schedule schedule +#endif + EBX = 0x00 ECX = 0x04 EDX = 0x08 @@ -224,7 +228,7 @@ need_resched: jz restore_all movl $PREEMPT_ACTIVE,TI_PRE_COUNT(%ebp) sti - call schedule + call user_schedule movl $0,TI_PRE_COUNT(%ebp) cli jmp need_resched @@ -306,7 +310,7 @@ work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: - call schedule + call user_schedule cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret @@ -572,6 +576,31 @@ ENTRY(invalid_TSS) pushl $do_invalid_TSS jmp error_code +#ifdef CONFIG_KGDB_THREAD +ENTRY(kern_schedule) + pushl %ebp + movl %esp, %ebp + pushl %ss + pushl %ebp + pushfl + pushl %cs + pushl 4(%ebp) + pushl %eax + pushl %es + pushl %ds + pushl %eax + pushl (%ebp) + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + call kern_do_schedule + movl %ebp, %esp + pop %ebp + ret +#endif + ENTRY(segment_not_present) pushl $do_segment_not_present jmp error_code @@ -829,7 +858,7 @@ ENTRY(sys_call_table) .long sys_getdents64 /* 220 */ .long sys_fcntl64 .long sys_ni_syscall /* reserved for TUX */ - .long sys_ni_syscall + .long sys_mbind .long sys_gettid .long sys_readahead /* 225 */ .long sys_setxattr diff -urpN -X 
/home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/gdbstart.c 901-mjb1.1/arch/i386/kernel/gdbstart.c --- 000-virgin/arch/i386/kernel/gdbstart.c Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/arch/i386/kernel/gdbstart.c Wed Aug 13 20:29:29 2003 @@ -0,0 +1,147 @@ +/* + * This program opens a tty file and issues the GDB stub activating + * ioctl on it. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +char *tty_name = "/dev/ttyS0" ; /* COM1 port */ +int speed = 9600 ; /* default speed */ +struct termios save_ts ; /* original term struct */ + +void print_usage(void) +{ + printf("gdbstub [-s speed] [-t tty-dev]\n") ; + printf(" defaults: /dev/ttyS0 with speed unmodified by this program\n"); + +} /* print_usage */ + +void tty_err(char *msg) +{ + char buf[100] ; + + strcpy(buf, msg) ; + strcat(buf, ": ") ; + strcat(buf, tty_name) ; + perror(buf) ; + exit(1) ; + +} /* tty_err */ + + +void setup_term(int fd) +{ + struct termios ts ; + int speed_code ; + + if (tcgetattr(fd, &ts) < 0) tty_err("tcgetattr") ; + + save_ts = ts ; + switch (speed) + { + case 4800: + speed_code = B4800 ; + break ; + case 9600: + speed_code = B9600 ; + break ; + case 19200: + speed_code = B19200 ; + break ; + case 38400: + speed_code = B38400 ; + break ; + case 57600: + speed_code = B57600 ; + break ; + case 115200: + speed_code = B115200 ; + break ; + case 230400: + speed_code = B230400 ; + break ; + default: + printf("Invalid speed: %d\n", speed) ; + exit(1) ; + } + + ts.c_cflag = CS8 | CREAD | CLOCAL ; + if (cfsetospeed(&ts, speed_code) < 0) tty_err("cfsetospeed") ; + if (cfsetispeed(&ts, speed_code) < 0) tty_err("cfsetispeed") ; + + if (tcsetattr(fd, TCSANOW, &ts) < 0) tty_err("tcsetattr") ; + +} /* setup_term */ + +int main(int argc, char **argv) +{ + int opt ; + int fil ; + int rslt ; + + while ((opt = getopt(argc, argv, "hs:t:")) > 0) + { + switch (opt) + { + case 's': + speed = atol(optarg) ; + break ; + case 't': + tty_name = optarg ; + break ; + case ':': + printf("Invalid option\n") ; + break ; + case '?': + case 'h': + default: + print_usage() ; + return 1; + } + } + + fil = open(tty_name, O_RDWR) ; + if (fil < 0) + { + perror(tty_name) ; + return 1; + } + + + setup_term(fil) ; + + /* + * When we issue this ioctl, control will not return until + * the debugger running on the remote host machine says "go". + */ + printf("\nAbout to activate GDB stub in the kernel on %s\n", tty_name) ; + printf("Hit CR to continue, kill program to abort -- ") ; + getchar() ; + sync() ; + rslt = ioctl(fil, TIOCGDB, 0) ; + if (rslt < 0) + { + perror("TIOCGDB ioctl") ; + return 1; + } + + printf("\nGDB stub successfully activated\n") ; + + for (;;) + { + pause() ; + } + + if (tcsetattr(fil, TCSANOW, &save_ts) < 0) tty_err("tcsetattr") ; + + exit(0); +} /* main */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/gdbstub.c 901-mjb1.1/arch/i386/kernel/gdbstub.c --- 000-virgin/arch/i386/kernel/gdbstub.c Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/arch/i386/kernel/gdbstub.c Wed Aug 13 20:29:29 2003 @@ -0,0 +1,1208 @@ +/* + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + */ + +/* + * Copyright (C) 2000-2001 VERITAS Software Corporation. + */ +/**************************************************************************** + * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ + * + * Module name: remcom.c $ + * Revision: 1.34 $ + * Date: 91/03/09 12:29:49 $ + * Contributor: Lake Stevens Instrument Division$ + * + * Description: low level support for gdb debugger. $ + * + * Considerations: only works on target hardware $ + * + * Written by: Glenn Engel $ + * Updated by: Amit Kale + * ModuleState: Experimental $ + * + * NOTES: See Below $ + * + * Modified for 386 by Jim Kingdon, Cygnus Support. + * Origianl kgdb, compatibility with 2.1.xx kernel by David Grothe + * Integrated into 2.2.5 kernel by Tigran Aivazian + * thread support, + * support for multiple processors, + * support for ia-32(x86) hardware debugging, + * Console support, + * handling nmi watchdog + * Amit S. Kale ( akale@veritas.com ) + * + * + * To enable debugger support, two things need to happen. One, a + * call to set_debug_traps() is necessary in order to allow any breakpoints + * or error conditions to be properly intercepted and reported to gdb. + * Two, a breakpoint needs to be generated to begin communication. This + * is most easily accomplished by a call to breakpoint(). Breakpoint() + * simulates a breakpoint by executing an int 3. + * + ************* + * + * The following gdb commands are supported: + * + * command function Return value + * + * g return the value of the CPU registers hex data or ENN + * G set the value of the CPU registers OK or ENN + * + * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN + * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN + * + * c Resume at current address SNN ( signal NN) + * cAA..AA Continue at address AA..AA SNN + * + * s Step one instruction SNN + * sAA..AA Step one instruction from AA..AA SNN + * + * k kill + * + * ? What was the last sigval ? SNN (signal NN) + * + * All commands and responses are sent with a packet which includes a + * checksum. A packet consists of + * + * $#. + * + * where + * :: + * :: < two hex digits computed as modulo 256 sum of > + * + * When a packet is received, it is first acknowledged with either '+' or '-'. + * '+' indicates a successful transfer. '-' indicates a failed transfer. + * + * Example: + * + * Host: Reply: + * $m0,10#2a +$00010203040506070809101112131415#42 + * + ****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for linux pt_regs struct */ +#include +#ifdef CONFIG_GDB_CONSOLE +#include +#endif +#include + +/************************************************************************ + * + * external low-level support routines + */ +typedef void (*Function) (void); /* pointer to a function */ + +/* Thread reference */ +typedef unsigned char threadref[8]; + +extern int putDebugChar(int); /* write a single character */ +extern int getDebugChar(void); /* read and return a single char */ + +extern int pid_max; + +/************************************************************************/ +/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ +/* at least NUMREGBYTES*2 are needed for register packets */ +/* Longer buffer is needed to list all threads */ +#define BUFMAX 1024 + +static char initialized; /* boolean flag. 
!= 0 means we've been initialized */ + +static const char hexchars[] = "0123456789abcdef"; + +/* Number of bytes of registers. */ +#define NUMREGBYTES 64 +/* + * Note that this register image is in a different order than + * the register image that Linux produces at interrupt time. + * + * Linux's register image is defined by struct pt_regs in ptrace.h. + * Just why GDB uses a different order is a historical mystery. + */ +enum regnames { _EAX, /* 0 */ + _ECX, /* 1 */ + _EDX, /* 2 */ + _EBX, /* 3 */ + _ESP, /* 4 */ + _EBP, /* 5 */ + _ESI, /* 6 */ + _EDI, /* 7 */ + _PC /* 8 also known as eip */ , + _PS /* 9 also known as eflags */ , + _CS, /* 10 */ + _SS, /* 11 */ + _DS, /* 12 */ + _ES, /* 13 */ + _FS, /* 14 */ + _GS +}; /* 15 */ + +/*************************** ASSEMBLY CODE MACROS *************************/ +/* */ + +#define BREAKPOINT() asm(" int $3"); + +/* Put the error code here just in case the user cares. */ +int gdb_i386errcode; +/* Likewise, the vector number here (since GDB only gets the signal + number through the usual means, and that's not very specific). */ +int gdb_i386vector = -1; + +static spinlock_t slavecpulocks[KGDB_MAX_NO_CPUS]; +volatile int procindebug[KGDB_MAX_NO_CPUS]; + +#ifdef CONFIG_SMP +spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +spinlock_t kgdb_nmispinlock = SPIN_LOCK_UNLOCKED; +#else +unsigned kgdb_spinlock = 0; +unsigned kgdb_nmispinlock = 0; +#endif + +static void +kgdb_usercode(void) +{ +} + +int +hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return (ch - 'a' + 10); + if ((ch >= '0') && (ch <= '9')) + return (ch - '0'); + if ((ch >= 'A') && (ch <= 'F')) + return (ch - 'A' + 10); + return (-1); +} + +/* scan for the sequence $# */ +void +getpacket(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int i; + int count; + char ch; + + do { + /* wait around for the start character, ignore all other characters */ + while ((ch = (getDebugChar() & 0x7f)) != '$') ; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* now, read until a # or end of buffer is found */ + while (count < BUFMAX) { + ch = getDebugChar() & 0x7f; + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(getDebugChar() & 0x7f) << 4; + xmitcsum += hex(getDebugChar() & 0x7f); + + if (checksum != xmitcsum) + putDebugChar('-'); /* failed checksum */ + else { + putDebugChar('+'); /* successful transfer */ + /* if a sequence char is present, reply the sequence ID */ + if (buffer[2] == ':') { + putDebugChar(buffer[0]); + putDebugChar(buffer[1]); + /* remove sequence chars from buffer */ + count = strlen(buffer); + for (i = 3; i <= count; i++) + buffer[i - 3] = buffer[i]; + } + } + } + } while (checksum != xmitcsum); + +} + +/* send the packet in buffer. */ + +void +putpacket(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* $#. 
*/ + do { + putDebugChar('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + if (!putDebugChar(ch)) + return; + checksum += ch; + count += 1; + } + + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + + } while ((getDebugChar() & 0x7f) != '+'); + +} + +static char remcomInBuffer[BUFMAX]; +static char remcomOutBuffer[BUFMAX]; +static short error; + +static void +regs_to_gdb_regs(int *gdb_regs, struct pt_regs *regs) +{ + gdb_regs[_EAX] = regs->eax; + gdb_regs[_EBX] = regs->ebx; + gdb_regs[_ECX] = regs->ecx; + gdb_regs[_EDX] = regs->edx; + gdb_regs[_ESI] = regs->esi; + gdb_regs[_EDI] = regs->edi; + gdb_regs[_EBP] = regs->ebp; + gdb_regs[_DS] = regs->xds; + gdb_regs[_ES] = regs->xes; + gdb_regs[_PS] = regs->eflags; + gdb_regs[_CS] = regs->xcs; + gdb_regs[_PC] = regs->eip; + gdb_regs[_ESP] = (int) (®s->esp); + gdb_regs[_SS] = __KERNEL_DS; + gdb_regs[_FS] = 0xFFFF; + gdb_regs[_GS] = 0xFFFF; +} /* regs_to_gdb_regs */ + +static void +gdb_regs_to_regs(int *gdb_regs, struct pt_regs *regs) +{ + regs->eax = gdb_regs[_EAX]; + regs->ebx = gdb_regs[_EBX]; + regs->ecx = gdb_regs[_ECX]; + regs->edx = gdb_regs[_EDX]; + regs->esi = gdb_regs[_ESI]; + regs->edi = gdb_regs[_EDI]; + regs->ebp = gdb_regs[_EBP]; + regs->xds = gdb_regs[_DS]; + regs->xes = gdb_regs[_ES]; + regs->eflags = gdb_regs[_PS]; + regs->xcs = gdb_regs[_CS]; + regs->eip = gdb_regs[_PC]; +#if 0 /* can't change these */ + regs->esp = gdb_regs[_ESP]; + regs->xss = gdb_regs[_SS]; + regs->fs = gdb_regs[_FS]; + regs->gs = gdb_regs[_GS]; +#endif + +} /* gdb_regs_to_regs */ + +/* Indicate to caller of mem2hex or hex2mem that there has been an + error. */ +static volatile int kgdb_memerr = 0; +volatile int kgdb_memerr_expected = 0; +static volatile int kgdb_memerr_cnt = 0; +static int garbage_loc = -1; + +int +get_char(char *addr) +{ + return *addr; +} + +void +set_char(char *addr, int val) +{ + *addr = val; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* return a pointer to the last char put in buf (null) */ +/* If MAY_FAULT is non-zero, then we should set kgdb_memerr in response to + a fault; if zero treat a fault like any other fault in the stub. 
*/ +char * +mem2hex(char *mem, char *buf, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + } + for (i = 0; i < count; i++) { + + ch = get_char(mem++); + + if (may_fault && kgdb_memerr) { + *buf = 0; /* truncate buffer */ + return (buf); + } + *buf++ = hexchars[ch >> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (may_fault) + kgdb_memerr_expected = 0; + return (buf); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return a pointer to the character AFTER the last byte written */ +char * +hex2mem(char *buf, char *mem, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + } + for (i = 0; i < count; i++) { + ch = hex(*buf++) << 4; + ch = ch + hex(*buf++); + set_char(mem++, ch); + + if (may_fault && kgdb_memerr) { + return (mem); + } + } + if (may_fault) + kgdb_memerr_expected = 0; + return (mem); +} + +/**********************************************/ +/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ +/* RETURN NUMBER OF CHARS PROCESSED */ +/**********************************************/ +int +hexToInt(char **ptr, int *intValue) +{ + int numChars = 0; + int hexValue; + + *intValue = 0; + + while (**ptr) { + hexValue = hex(**ptr); + if (hexValue >= 0) { + *intValue = (*intValue << 4) | hexValue; + numChars++; + } else + break; + + (*ptr)++; + } + + return (numChars); +} + +#ifdef CONFIG_KGDB_THREAD +static int +stubhex(int ch) +{ + if (ch >= 'a' && ch <= 'f') + return ch - 'a' + 10; + if (ch >= '0' && ch <= '9') + return ch - '0'; + if (ch >= 'A' && ch <= 'F') + return ch - 'A' + 10; + return -1; +} + +static int +stub_unpack_int(char *buff, int fieldlength) +{ + int nibble; + int retval = 0; + + while (fieldlength) { + nibble = stubhex(*buff++); + retval |= nibble; + fieldlength--; + if (fieldlength) + retval = retval << 4; + } + return retval; +} +#endif + +static char * +pack_hex_byte(char *pkt, int byte) +{ + *pkt++ = hexchars[(byte >> 4) & 0xf]; + *pkt++ = hexchars[(byte & 0xf)]; + return pkt; +} + +#define BUF_THREAD_ID_SIZE 16 + +#ifdef CONFIG_KGDB_THREAD +static char * +pack_threadid(char *pkt, threadref * id) +{ + char *limit; + unsigned char *altid; + + altid = (unsigned char *) id; + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *altid++); + return pkt; +} + +static char * +unpack_byte(char *buf, int *value) +{ + *value = stub_unpack_int(buf, 2); + return buf + 2; +} + +static char * +unpack_threadid(char *inbuf, threadref * id) +{ + char *altref; + char *limit = inbuf + BUF_THREAD_ID_SIZE; + int x, y; + + altref = (char *) id; + + while (inbuf < limit) { + x = stubhex(*inbuf++); + y = stubhex(*inbuf++); + *altref++ = (x << 4) | y; + } + return inbuf; +} +#endif + +void +int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; + } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} + +#ifdef CONFIG_KGDB_THREAD +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + unsigned char *scan; + + scan = (char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} + +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } +#if 0 + 
thread = init_tasks[0]; + do { + if (thread->pid == pid) { + return thread; + } + thread = thread->next_task; + } while (thread != init_tasks[0]); +#endif + return NULL; +} +#endif + +struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned addr; +} breakinfo[4] = { { +enabled:0}, { +enabled:0}, { +enabled:0}, { +enabled:0}}; + +void +correct_hw_break(void) +{ + int breakno; + int correctit; + int breakbit; + unsigned dr7; + + asm volatile ("movl %%db7, %0\n":"=r" (dr7) + :); + do { + unsigned addr0, addr1, addr2, addr3; + asm volatile ("movl %%db0, %0\n" + "movl %%db1, %1\n" + "movl %%db2, %2\n" + "movl %%db3, %3\n":"=r" (addr0), "=r"(addr1), + "=r"(addr2), "=r"(addr3):); + } while (0); + correctit = 0; + for (breakno = 0; breakno < 3; breakno++) { + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + dr7 |= (((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << 16) << + (breakno << 2); + switch (breakno) { + case 0: + asm volatile ("movl %0, %%dr0\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 1: + asm volatile ("movl %0, %%dr1\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 2: + asm volatile ("movl %0, %%dr2\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 3: + asm volatile ("movl %0, %%dr3\n"::"r" + (breakinfo[breakno].addr)); + break; + } + } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + } + } + if (correctit) { + asm volatile ("movl %0, %%db7\n"::"r" (dr7)); + } +} + +int +remove_hw_break(unsigned breakno) +{ + if (!breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 0; + return 0; +} + +int +set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr) +{ + if (breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 1; + breakinfo[breakno].type = type; + breakinfo[breakno].len = len; + breakinfo[breakno].addr = addr; + return 0; +} + +void +gdb_wait(void *arg) +{ + unsigned long flags; + int processor; + + local_irq_save(flags); + processor = smp_processor_id(); + procindebug[processor] = 1; + current->thread.kgdbregs = arg; + spin_lock(slavecpulocks + processor); + correct_hw_break(); + procindebug[processor] = 0; + local_irq_restore(flags); +} + +void +printexceptioninfo(int exceptionNo, int errorcode, char *buffer) +{ + unsigned dr6; + int i; + switch (exceptionNo) { + case 1: /* debug exception */ + break; + case 3: /* breakpoint */ + sprintf(buffer, "Software breakpoint"); + return; + default: + sprintf(buffer, "Details not available"); + return; + } + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (dr6 & 0x4000) { + sprintf(buffer, "Single step"); + return; + } + for (i = 0; i < 4; ++i) { + if (dr6 & (1 << i)) { + sprintf(buffer, "Hardware breakpoint %d", i); + return; + } + } + sprintf(buffer, "Unknown trap"); + return; +} + +/* + * This function does all command procesing for interfacing to gdb. + * + * NOTE: The INT nn instruction leaves the state of the interrupt + * enable flag UNCHANGED. That means that when this routine + * is entered via a breakpoint (INT 3) instruction from code + * that has interrupts enabled, then interrupts will STILL BE + * enabled when this routine is entered. 
The first thing that + * we do here is disable interrupts so as to prevent recursive + * entries and bothersome serial interrupts while we are + * trying to run the serial port in polled mode. + * + * For kernel version 2.1.xx the cli() actually gets a spin lock so + * it is always necessary to do a restore_flags before returning + * so as to let go of that lock. + */ +int +handle_exception(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs) +{ + struct task_struct *usethread = NULL; + int addr, length; + int breakno, breaktype; + char *ptr; + int newPC; + unsigned long flags = ~0UL; + int gdb_regs[NUMREGBYTES / 4]; + int i; + int dr6; + int reboot = 0; +#ifdef CONFIG_KGDB_THREAD + int nothreads; + int maxthreads; + int threadid; + threadref thref; + struct task_struct *thread = NULL; +#endif +#define regs (*linux_regs) + + /* + * If the entry is not from the kernel then return to the Linux + * trap handler and let it process the interrupt normally. + */ + if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->xcs)) { + return (0); + } + + if (kgdb_memerr_expected) { + /* + * This fault occured because of the get_char or set_char + * routines. These two routines use either eax of edx to + * indirectly reference the location in memory that they + * are working with. For a page fault, when we return + * the instruction will be retried, so we have to make + * sure that these registers point to valid memory. + */ + kgdb_memerr = 1; /* set mem error flag */ + kgdb_memerr_expected = 0; + kgdb_memerr_cnt++; /* helps in debugging */ + regs.eax = (long) &garbage_loc; /* make valid address */ + regs.edx = (long) &garbage_loc; /* make valid address */ + return (0); + } +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_nmispinlock)) +#else + if (!kgdb_nmispinlock) +#endif + { + + /* Get kgdb spinlock */ +#ifdef CONFIG_SMP + _raw_spin_lock(&kgdb_spinlock); +#else + kgdb_spinlock = 1; +#endif + + local_irq_save(flags); + + /* Disable hardware debugging while we are in kgdb */ + __asm__("movl %0,%%db7": /* no output */ + :"r"(0)); + + for (i = 0; i < NR_CPUS; i++) { + spin_lock_init(&slavecpulocks[i]); + _raw_spin_lock(&slavecpulocks[i]); + } + + if (num_online_cpus() > 1) { + /* Force other cpus in debugger */ + if (smp_call_function(gdb_wait, NULL, 0, 99) != 0) { + return (1); + } + } + + procindebug[smp_processor_id()] = 1; + } + + gdb_i386vector = exceptionVector; + gdb_i386errcode = err_code; + + /* reply to host that an exception has occurred */ + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + + putpacket(remcomOutBuffer); + + while (1 == 1) { + error = 0; + remcomOutBuffer[0] = 0; + getpacket(remcomInBuffer); + switch (remcomInBuffer[0]) { + case '?': + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + break; + case 'g': /* return the value of the CPU registers */ + if (!usethread || usethread == current) { + regs_to_gdb_regs(gdb_regs, ®s); + } else { + memset(gdb_regs, 0, NUMREGBYTES); + if (usethread->thread.kgdbregs) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + get_char((char *) usethread->thread. + kgdbregs); + kgdb_memerr_expected = 0; + if (kgdb_memerr) { + gdb_regs[_PC] = + (int) kgdb_usercode; + } else { + regs_to_gdb_regs(gdb_regs, + usethread-> + thread. 
+ kgdbregs); + } + } else { + gdb_regs[_PC] = (int) kgdb_usercode; + } + } + mem2hex((char *) gdb_regs, remcomOutBuffer, NUMREGBYTES, + 0); + break; + case 'G': /* set the value of the CPU registers - return OK */ + hex2mem(&remcomInBuffer[1], (char *) gdb_regs, + NUMREGBYTES, 0); + if (!usethread || usethread == current) { + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "E00"); + } + break; + + /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + case 'm': + /* TRY TO READ %x,%x. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) + if (*(ptr++) == ',') + if (hexToInt(&ptr, &length)) { + ptr = 0; + mem2hex((char *) addr, + remcomOutBuffer, length, + 1); + if (kgdb_memerr) { + strcpy(remcomOutBuffer, + "E03"); + } + } + + if (ptr) { + strcpy(remcomOutBuffer, "E01"); + } + break; + + /* MAA..AA,LLLL: Write LLLL bytes at address AA.AA return OK */ + case 'M': + /* TRY TO READ '%x,%x:'. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) + if (*(ptr++) == ',') + if (hexToInt(&ptr, &length)) + if (*(ptr++) == ':') { + hex2mem(ptr, + (char *) addr, + length, 1); + + if (kgdb_memerr) { + strcpy + (remcomOutBuffer, + "E03"); + } else { + strcpy + (remcomOutBuffer, + "OK"); + } + + ptr = 0; + } + if (ptr) { + strcpy(remcomOutBuffer, "E02"); + } + break; + + /* cAA..AA Continue at address AA..AA(optional) */ + /* sAA..AA Step one instruction from AA..AA(optional) */ + case 'c': + case 's': +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_nmispinlock)) +#else + if (kgdb_nmispinlock) +#endif + { + strcpy(remcomOutBuffer, "E01"); + break; + } + + /* try to read optional parameter, pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) { + regs.eip = addr; + } + + newPC = regs.eip; + + /* clear the trace bit */ + regs.eflags &= 0xfffffeff; + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') + regs.eflags |= 0x100; + + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (!(dr6 & 0x4000)) { + for (breakno = 0; breakno < 4; ++breakno) { + if (dr6 & (1 << breakno)) { + if (breakinfo[breakno].type == + 0) { + /* Set restore flag */ + regs.eflags |= 0x10000; + break; + } + } + } + } + correct_hw_break(); + asm volatile ("movl %0, %%db6\n"::"r" (0)); + for (i = 0; i < NR_CPUS; i++) { + _raw_spin_unlock(&slavecpulocks[i]); + } + + procindebug[smp_processor_id()] = 0; + /* Release kgdb spinlock */ +#ifdef CONFIG_SMP + _raw_spin_unlock(&kgdb_spinlock); +#else + kgdb_spinlock = 0; +#endif + if (flags != ~0UL) + local_irq_restore(flags); + return (0); + + /* kill the program */ + case 'k': + break; + + /* query */ + case 'q': + switch (remcomInBuffer[1]) { +#ifdef CONFIG_KGDB_THREAD + case 'L': + /* List threads */ + unpack_byte(remcomInBuffer + 3, &maxthreads); + unpack_threadid(remcomInBuffer + 5, &thref); + + remcomOutBuffer[0] = 'q'; + remcomOutBuffer[1] = 'M'; + remcomOutBuffer[4] = '0'; + pack_threadid(remcomOutBuffer + 5, &thref); + + threadid = threadref_to_int(&thref); + for (nothreads = 0; + nothreads < maxthreads + && threadid < pid_max; threadid++) { + thread = getthread(threadid); + if (thread) { + int_to_threadref(&thref, + threadid); + pack_threadid(remcomOutBuffer + + 21 + + nothreads * 16, + &thref); + nothreads++; + } + } + if (threadid == pid_max) { + remcomOutBuffer[4] = '1'; + } + pack_hex_byte(remcomOutBuffer + 2, nothreads); + remcomOutBuffer[21 + nothreads * 16] = '\0'; + break; + + case 'C': + /* Current thread id 
*/ + remcomOutBuffer[0] = 'Q'; + remcomOutBuffer[1] = 'C'; + threadid = current->pid; + int_to_threadref(&thref, threadid); + pack_threadid(remcomOutBuffer + 2, &thref); + remcomOutBuffer[18] = '\0'; + break; +#endif + + case 'E': + /* Print exception info */ + printexceptioninfo(exceptionVector, err_code, + remcomOutBuffer); + break; + } + break; + +#ifdef CONFIG_KGDB_THREAD + /* task related */ + case 'H': + switch (remcomInBuffer[1]) { + case 'g': + ptr = &remcomInBuffer[2]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (!thread) { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + break; + } + usethread = thread; + /* follow through */ + case 'c': + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + break; + } + break; + + /* Query thread status */ + case 'T': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (thread) { + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + } else { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + } + break; +#endif + + case 'r': + reboot = 1; + strcpy(remcomOutBuffer, "OK"); + break; + case 'Y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + ptr++; + hexToInt(&ptr, &breaktype); + ptr++; + hexToInt(&ptr, &length); + ptr++; + hexToInt(&ptr, &addr); + if (set_hw_break + (breakno & 0x3, breaktype & 0x3, length & 0x3, addr) + == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + /* Remove hardware breakpoint */ + case 'y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + if (remove_hw_break(breakno & 0x3) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + } /* switch */ + + /* reply to the request */ + putpacket(remcomOutBuffer); + if (reboot == 1) { + static long no_idt[2]; + __asm__ __volatile__("lidt %0"::"m"(no_idt)); + __asm__ __volatile__("int3"); + } + } +} + +/* this function is used to set up exception handlers for tracing and + breakpoints */ +void +set_debug_traps(void) +{ + /* + * linux_debug_hook is defined in traps.c. We store a pointer + * to our own exception handler into it. + */ + linux_debug_hook = handle_exception; + + /* + * In case GDB is started before us, ack any packets (presumably + * "$?#xx") sitting there. */ + putDebugChar('+'); + + initialized = 1; +} + +/* This function will generate a breakpoint exception. It is used at the + beginning of a program to sync up with a debugger and can be used + otherwise as a quick means to stop program execution and "break" into + the debugger. 
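
breakpoint() just below only issues the stub's BREAKPOINT() macro once the stub has been initialised. The macro itself is not defined in this hunk; on i386 it is conventionally a software breakpoint instruction, i.e. something along the lines of the sketch below (an assumption -- the real definition lives in the kgdb headers), which is consistent with printexceptioninfo() above reporting trap 3 as "Software breakpoint".

	/* assumed definition, for illustration only */
	#define BREAKPOINT()	__asm__ __volatile__ ("int $3")
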
*/ + +void +breakpoint(void) +{ + if (initialized) + BREAKPOINT(); +} + +#ifdef CONFIG_GDB_CONSOLE +char gdbconbuf[BUFMAX]; + +void +gdb_console_write(struct console *co, const char *s, unsigned count) +{ + int i; + int wcount; + char *bufptr; + + if (!gdb_initialized) { + return; + } + gdbconbuf[0] = 'O'; + bufptr = gdbconbuf + 1; + while (count > 0) { + if ((count << 1) > (BUFMAX - 2)) { + wcount = (BUFMAX - 2) >> 1; + } else { + wcount = count; + } + count -= wcount; + for (i = 0; i < wcount; i++) { + bufptr = pack_hex_byte(bufptr, s[i]); + } + *bufptr = '\0'; + s += wcount; + + putpacket(gdbconbuf); + + } +} +#endif +static int __init +kgdb_opt_gdb(char *dummy) +{ + gdb_enter = 1; + return 1; +} +static int __init +kgdb_opt_gdbttyS(char *str) +{ + gdb_ttyS = simple_strtoul(str, NULL, 10); + return 1; +} +static int __init +kgdb_opt_gdbbaud(char *str) +{ + gdb_baud = simple_strtoul(str, NULL, 10); + return 1; +} + +/* + * Sequence of these lines has to be maintained because gdb option is a prefix + * of the other two options + */ + +__setup("gdbttyS=", kgdb_opt_gdbttyS); +__setup("gdbbaud=", kgdb_opt_gdbbaud); +__setup("gdb", kgdb_opt_gdb); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/head.S 901-mjb1.1/arch/i386/kernel/head.S --- 000-virgin/arch/i386/kernel/head.S Fri May 30 19:01:58 2003 +++ 901-mjb1.1/arch/i386/kernel/head.S Wed Aug 13 20:51:56 2003 @@ -249,7 +249,7 @@ is386: movl $2,%ecx # set MP call check_x87 incb ready lgdt cpu_gdt_descr - lidt idt_descr + lidt node_idt_descr # we switch to the per-node IDTs later ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers movl %eax,%ss # after changing gdt. @@ -314,7 +314,7 @@ setup_idt: movw %dx,%ax /* selector = 0x0010 = cs */ movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ - lea idt_table,%edi + lea node_idt_table,%edi mov $256,%ecx rp_sidt: movl %eax,(%edi) @@ -359,14 +359,16 @@ ignore_int: * segment size, and 32-bit linear address value: */ -.globl idt_descr +.globl node_idt_descr .globl cpu_gdt_descr ALIGN .word 0 # 32-bit align idt_desc.address -idt_descr: +node_idt_descr: .word IDT_ENTRIES*8-1 # idt contains 256 entries - .long idt_table + .long node_idt_table + + .fill MAX_NUMNODES-1,8,0 # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address @@ -485,5 +487,26 @@ ENTRY(cpu_gdt_table) #ifdef CONFIG_SMP .fill (NR_CPUS-1)*GDT_ENTRIES,8,0 /* other CPU's GDT */ +#endif + +#ifdef CONFIG_GCOV_PROFILE +/* + * The .ctors-section contains a list of pointers to constructor + * functions which are used to initialize gcov structures. + * + * Because there is no NULL at the end of the constructor list + * in the kernel we need the addresses of both the constructor + * as well as the destructor list which are supposed to be + * adjacent. 
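
Because the list laid out above has no NULL terminator, any consumer has to walk from __CTOR_LIST__ up to the adjacent __DTOR_LIST__ instead of stopping at a sentinel; the same block is added to the ppc, ppc64 and x86_64 head.S files later in this patch, and the gcov-proc module is the intended consumer. A sketch of such a walk (the symbol names are the ones declared above; the walking code itself is illustrative):

	typedef void (*ctor_t)(void);

	extern ctor_t __CTOR_LIST__[];	/* first constructor pointer        */
	extern ctor_t __DTOR_LIST__[];	/* start of .dtors == end of .ctors */

	static void run_kernel_ctors(void)
	{
		ctor_t *ctor;

		/* no NULL sentinel, so stop at the adjacent destructor list */
		for (ctor = __CTOR_LIST__; ctor < __DTOR_LIST__; ctor++)
			(*ctor)();
	}
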
+ */ + +.section ".ctors","aw" +.globl __CTOR_LIST__ +.type __CTOR_LIST__,@object +__CTOR_LIST__: +.section ".dtors","aw" +.globl __DTOR_LIST__ +.type __DTOR_LIST__,@object +__DTOR_LIST__: #endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/i386_ksyms.c 901-mjb1.1/arch/i386/kernel/i386_ksyms.c --- 000-virgin/arch/i386/kernel/i386_ksyms.c Wed Jul 2 21:59:03 2003 +++ 901-mjb1.1/arch/i386/kernel/i386_ksyms.c Wed Aug 13 20:29:29 2003 @@ -146,6 +146,20 @@ EXPORT_SYMBOL(smp_num_siblings); EXPORT_SYMBOL(cpu_sibling_map); #endif +#ifdef CONFIG_X86_REMOTE_DEBUG +void __this_fixmap_does_not_exist(void) +{ + BUG(); +} +EXPORT_SYMBOL(__this_fixmap_does_not_exist); + +void __br_lock_usage_bug(void) +{ + BUG(); +} +EXPORT_SYMBOL(__br_lock_usage_bug); +#endif + #ifdef CONFIG_SMP EXPORT_SYMBOL(cpu_data); EXPORT_SYMBOL(cpu_online_map); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/io_apic.c 901-mjb1.1/arch/i386/kernel/io_apic.c --- 000-virgin/arch/i386/kernel/io_apic.c Tue Aug 5 20:01:48 2003 +++ 901-mjb1.1/arch/i386/kernel/io_apic.c Wed Aug 13 20:48:59 2003 @@ -272,7 +272,7 @@ static void set_ioapic_affinity (unsigne spin_unlock_irqrestore(&ioapic_lock, flags); } -#if defined(CONFIG_SMP) +#if defined(CONFIG_IRQBALANCE) # include /* kernel_thread() */ # include /* kstat */ # include /* kmalloc() */ @@ -667,8 +667,6 @@ static int __init irqbalance_disable(cha __setup("noirqbalance", irqbalance_disable); -static void set_ioapic_affinity (unsigned int irq, unsigned long mask); - static inline void move_irq(int irq) { /* note - we hold the desc->lock */ @@ -680,9 +678,11 @@ static inline void move_irq(int irq) __initcall(balanced_irq_init); -#else /* !SMP */ +#else /* !CONFIG_IRQBALANCE */ static inline void move_irq(int irq) { } +#endif /* CONFIG_IRQBALANCE */ +#ifndef CONFIG_SMP void send_IPI_self(int vector) { unsigned int cfg; @@ -697,7 +697,7 @@ void send_IPI_self(int vector) */ apic_write_around(APIC_ICR, cfg); } -#endif /* defined(CONFIG_SMP) */ +#endif /* !CONFIG_SMP */ /* @@ -1136,24 +1136,59 @@ static inline int IO_APIC_irq_trigger(in } int irq_vector[NR_IRQS] = { FIRST_DEVICE_VECTOR , 0 }; +int __initdata vector_allocated[MAX_NUMNODES][FIRST_SYSTEM_VECTOR]; -static int __init assign_irq_vector(int irq) -{ - static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; - if (IO_APIC_VECTOR(irq) > 0) - return IO_APIC_VECTOR(irq); +/* + * This is the per node vector allocator, it will only work for systems which + * have ioapics which can only deliver vectors to cpus on the same node and + * thus have hardware enforced ioapic/irq node affinity. + * + * However currently the only i386 systems which have this interrupt + * dispatching/servicing architecture are NUMAQ and x440. We try and 'share' + * vectors where possible to simplify cases where an irq can be serviced on + * multiple nodes due to it being present on multiple busses/nodes. + * The first pass on node0 will ensure we catch these node 'shared' irqs. + */ +static int __init assign_irq_vector(int irq, int node) +{ + static int offset[MAX_NUMNODES]; + static int nr_assigned[MAX_NUMNODES] = {[0 ... MAX_NUMNODES-1] = 1}; + static int current_vector[MAX_NUMNODES] = + {[0 ... 
MAX_NUMNODES-1] = FIRST_DEVICE_VECTOR}; + + int vector; + + Dprintk("requesting vector for node%d/irq%d\n", node, irq); + vector = IO_APIC_VECTOR(irq); + if (vector > 0) { + Dprintk("returning previous allocation vector0x%x\n", vector); + vector_allocated[node][vector]++; + return vector; + } + + if (++nr_assigned[node] > NR_IRQ_VECTORS) + return -ENOSPC; + next: - current_vector += 8; - if (current_vector == SYSCALL_VECTOR) + current_vector[node] += 8; + if (current_vector[node] == SYSCALL_VECTOR) goto next; - if (current_vector >= FIRST_SYSTEM_VECTOR) { - offset = (offset + 1) & 7; - current_vector = FIRST_DEVICE_VECTOR + offset; + if (current_vector[node] > FIRST_SYSTEM_VECTOR) { + offset[node] = (offset[node]+1) & 7; + current_vector[node] = FIRST_DEVICE_VECTOR + offset[node]; } - IO_APIC_VECTOR(irq) = current_vector; - return current_vector; + vector = current_vector[node]; + if (vector_allocated[node][vector]) + goto next; + + vector_allocated[node][vector]++; + IO_APIC_VECTOR(irq) = vector; + Dprintk("returning new allocation node%d/irq%d -> vector0x%x\n", + node, irq, vector); + + return vector; } static struct hw_interrupt_type ioapic_level_irq_type; @@ -1162,7 +1197,7 @@ static struct hw_interrupt_type ioapic_e void __init setup_IO_APIC_irqs(void) { struct IO_APIC_route_entry entry; - int apic, pin, idx, irq, first_notcon = 1, vector; + int apic, pin, idx, irq, first_notcon = 1, vector, bus, node; unsigned long flags; printk(KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1194,12 +1229,21 @@ void __init setup_IO_APIC_irqs(void) entry.trigger = irq_trigger(idx); entry.polarity = irq_polarity(idx); + bus = mp_irqs[idx].mpc_srcbus; + node = mp_bus_id_to_node[bus]; + if (irq_trigger(idx)) { entry.trigger = 1; entry.mask = 1; } irq = pin_2_irq(idx, apic, pin); + if (irq >= NR_IRQS) { + printk("skipping irq%d on node%d/bus%d/ioapic%d out of IRQs!\n", + irq, node, bus, apic); + continue; + } + /* * skip adding the timer int on secondary nodes, which causes * a small but painful rift in the time-space continuum @@ -1213,7 +1257,10 @@ void __init setup_IO_APIC_irqs(void) continue; if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); + vector = assign_irq_vector(irq, node); + if (vector < 0) + continue; + entry.vector = vector; if (IO_APIC_irq_trigger(irq)) @@ -1221,11 +1268,15 @@ void __init setup_IO_APIC_irqs(void) else irq_desc[irq].handler = &ioapic_edge_irq_type; - set_intr_gate(vector, interrupt[irq]); - + Dprintk("irq_setup: node%d/bus%d/ioapic%d/vector0x%x - irq%d %p\n", + node, bus, apic, vector, irq, interrupt[irq]); + + node_set_intr_gate(node, vector, interrupt[irq]); + if (!apic && (irq < 16)) disable_8259A_irq(irq); } + spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); @@ -1938,6 +1989,7 @@ static inline void init_IO_APIC_traps(vo * so default to an old-fashioned 8259 * interrupt if we can.. 
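
assign_irq_vector() above advances each node's vector cursor in steps of eight, skips the syscall vector, and rotates through a small per-node offset once it runs past the last device vector, so successive allocations are spread across the IDT rather than packed together. A standalone simulation of that walk, with stand-in values for the irq_vectors.h constants (illustrative only; the real allocator also tracks vector_allocated[] per node):

	#include <stdio.h>

	#define FIRST_DEVICE_VECTOR	0x31	/* stand-in values */
	#define FIRST_SYSTEM_VECTOR	0xef
	#define SYSCALL_VECTOR		0x80

	int main(void)
	{
		int vector = FIRST_DEVICE_VECTOR, offset = 0, i;

		for (i = 0; i < 32; i++) {
	next:
			vector += 8;
			if (vector == SYSCALL_VECTOR)
				goto next;
			if (vector > FIRST_SYSTEM_VECTOR) {
				offset = (offset + 1) & 7;
				vector = FIRST_DEVICE_VECTOR + offset;
			}
			printf("allocation %2d -> vector 0x%02x\n", i, vector);
		}
		return 0;
	}
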
*/ + printk(KERN_DEBUG "irq%d not serviced by IOAPIC\n", irq); if (irq < 16) make_8259A_irq(irq); else @@ -2075,9 +2127,10 @@ static inline void check_timer(void) * get/set the timer IRQ vector: */ disable_8259A_irq(0); - vector = assign_irq_vector(0); + vector = assign_irq_vector(0, cpu_to_node(smp_processor_id())); + /* This gets reserved on all nodes as FIRST_DEVICE_VECTOR */ set_intr_gate(vector, interrupt[0]); - + /* * Subtle, code in do_timer_interrupt() expects an AEOI * mode for the 8259A whenever interrupts are routed @@ -2332,10 +2385,13 @@ int io_apic_set_pci_routing (int ioapic, { struct IO_APIC_route_entry entry; unsigned long flags; + int node, bus, vector; + + if (irq >= NR_IRQS) + return -ENOSPC; if (!IO_APIC_IRQ(irq)) { - printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0/n", - ioapic); + printk(KERN_ERR "ioapic%d invalid reference to IRQ0/n", ioapic); return -EINVAL; } @@ -2355,17 +2411,26 @@ int io_apic_set_pci_routing (int ioapic, entry.polarity = 1; /* Low active */ add_pin_to_irq(irq, ioapic, pin); + + /* XXX verify this with an x440 and plain ACPI/SMP -zwane */ + bus = mp_irqs[pin].mpc_srcbus; + node = mp_bus_id_to_node[bus]; + + vector = assign_irq_vector(irq, node); + if (vector < 0) + return -ENOSPC; - entry.vector = assign_irq_vector(irq); - - printk(KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " - "IRQ %d)\n", ioapic, + entry.vector = vector; + printk(KERN_DEBUG "NODE[%d] IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " + "IRQ %d)\n", node, ioapic, mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq); irq_desc[irq].handler = &ioapic_level_irq_type; - set_intr_gate(entry.vector, interrupt[irq]); - + printk(KERN_DEBUG "irq_route: node%d/bus%d/ioapic%d/vector0x%x - irq%d %p\n", + node, bus, ioapic, entry.vector, irq, interrupt[irq]); + node_set_intr_gate(node, entry.vector, interrupt[irq]); + if (!ioapic && (irq < 16)) disable_8259A_irq(irq); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/irq.c 901-mjb1.1/arch/i386/kernel/irq.c --- 000-virgin/arch/i386/kernel/irq.c Tue Aug 5 19:59:12 2003 +++ 901-mjb1.1/arch/i386/kernel/irq.c Wed Aug 13 20:29:22 2003 @@ -961,8 +961,9 @@ static int irq_affinity_write_proc (stru return -EINVAL; irq_affinity[irq] = new_value; +#ifndef CONFIG_X86_SUMMIT irq_desc[irq].handler->set_affinity(irq, new_value); - +#endif return full_count; } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/nmi.c 901-mjb1.1/arch/i386/kernel/nmi.c --- 000-virgin/arch/i386/kernel/nmi.c Wed Aug 13 20:24:18 2003 +++ 901-mjb1.1/arch/i386/kernel/nmi.c Wed Aug 13 20:29:29 2003 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -44,6 +45,20 @@ extern void show_registers(struct pt_reg */ static int nmi_active; +#ifdef CONFIG_X86_REMOTE_DEBUG +extern gdb_debug_hook * linux_debug_hook; +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ +{ \ + if (linux_debug_hook != (gdb_debug_hook *) NULL && !user_mode(regs)) \ + { \ + (*linux_debug_hook)(trapnr, signr, error_code, regs); \ + after; \ + } \ +} +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + #define K7_EVNTSEL_ENABLE (1 << 22) #define K7_EVNTSEL_INT (1 << 20) #define K7_EVNTSEL_OS (1 << 17) @@ -422,12 +437,59 @@ void nmi_watchdog_tick (struct pt_regs * sum = irq_stat[cpu].apic_timer_irqs; if (last_irq_sums[cpu] == sum) { +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_spinlock)) +#else + if (kgdb_spinlock) +#endif + { + /* We are inside kgdb, this 
isn't a stuck cpu */ + alert_counter[cpu] = 0; + } else { +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_nmispinlock)) +#else + if (kgdb_nmispinlock) +#endif + { + if (!procindebug[cpu]) { + procindebug[cpu] = 1; + current->thread.kgdbregs = regs; + while (1) { + /* nothing */ + } + } + return; + } + } +#endif /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ alert_counter[cpu]++; if (alert_counter[cpu] == 5*nmi_hz) { +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_SMP + if (spin_trylock(&kgdb_nmispinlock)) +#else + kgdb_nmispinlock = 1; +#endif + { + procindebug[cpu] = 1; + CHK_REMOTE_DEBUG(2,SIGBUS,0,regs,) + } +#ifdef CONFIG_SMP + else { + procindebug[cpu] = 1; + current->thread.kgdbregs = regs; + while (1) { + /* nothing */ + } + } +#endif +#endif spin_lock(&nmi_print_lock); /* * We are in trouble anyway, lets at least try diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/setup.c 901-mjb1.1/arch/i386/kernel/setup.c --- 000-virgin/arch/i386/kernel/setup.c Tue Aug 5 20:01:48 2003 +++ 901-mjb1.1/arch/i386/kernel/setup.c Wed Aug 13 20:55:51 2003 @@ -989,9 +989,6 @@ void __init setup_arch(char **cmdline_p) if (smp_found_config) get_smp_config(); #endif -#ifdef CONFIG_X86_SUMMIT - setup_summit(); -#endif register_memory(max_low_pfn); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/smp.c 901-mjb1.1/arch/i386/kernel/smp.c --- 000-virgin/arch/i386/kernel/smp.c Sat Jun 14 18:37:24 2003 +++ 901-mjb1.1/arch/i386/kernel/smp.c Wed Aug 13 20:29:29 2003 @@ -498,10 +498,17 @@ int smp_call_function (void (*func) (voi { struct call_data_struct data; int cpus = num_online_cpus()-1; + int count = 0; + int gdb; - if (!cpus) + if (cpus <= 0) return 0; + gdb = 0; + if (wait == 99) { + wait = 0; + gdb = 1; + } data.func = func; data.info = info; atomic_set(&data.started, 0); @@ -517,12 +524,27 @@ int smp_call_function (void (*func) (voi send_IPI_allbutself(CALL_FUNCTION_VECTOR); /* Wait for response */ - while (atomic_read(&data.started) != cpus) + while (atomic_read(&data.started) != cpus) { + if (gdb) { + if (count++ == 2000000) { + printk("%s: timeout\n", __FUNCTION__); + break; + } + if (count == 1000000) { + printk("looks bad\n"); + printk("cpus=%d, started=%d\n", cpus, + atomic_read(&data.started)); + } + if (count > 1000000) + udelay(1); + } barrier(); + } if (wait) while (atomic_read(&data.finished) != cpus) barrier(); + spin_unlock(&call_lock); return 0; @@ -564,9 +586,9 @@ asmlinkage void smp_reschedule_interrupt ack_APIC_irq(); } -asmlinkage void smp_call_function_interrupt(void) +asmlinkage void smp_call_function_interrupt(struct pt_regs regs) { - void (*func) (void *info) = call_data->func; + void (*func) (void *info, struct pt_regs *) = (void (*)(void *, struct pt_regs*))call_data->func; void *info = call_data->info; int wait = call_data->wait; @@ -581,7 +603,7 @@ asmlinkage void smp_call_function_interr * At this point the info structure may be out of scope unless wait==1 */ irq_enter(); - (*func)(info); + (*func)(info, ®s); irq_exit(); if (wait) { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/smpboot.c 901-mjb1.1/arch/i386/kernel/smpboot.c --- 000-virgin/arch/i386/kernel/smpboot.c Fri May 30 19:01:59 2003 +++ 901-mjb1.1/arch/i386/kernel/smpboot.c Wed Aug 13 20:48:49 2003 @@ -45,6 +45,7 @@ #include #include +#include #include #include #include @@ -62,7 +63,7 @@ int smp_num_siblings = 1; int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */ /* Bitmask of currently online 
CPUs */ -unsigned long cpu_online_map; +unsigned long cpu_online_map = 1; static volatile unsigned long cpu_callin_map; volatile unsigned long cpu_callout_map; @@ -442,6 +443,7 @@ int __init start_secondary(void *unused) */ cpu_init(); smp_callin(); + setup_cpu_idt(); while (!test_bit(smp_processor_id(), &smp_commenced_mask)) rep_nop(); setup_secondary_APIC_clock(); @@ -949,7 +951,7 @@ static void __init smp_boot_cpus(unsigne current_thread_info()->cpu = 0; smp_tune_scheduling(); - + /* * If we couldn't find an SMP configuration at boot time, * get out of here now! diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/summit.c 901-mjb1.1/arch/i386/kernel/summit.c --- 000-virgin/arch/i386/kernel/summit.c Sat Jun 14 18:37:24 2003 +++ 901-mjb1.1/arch/i386/kernel/summit.c Wed Aug 13 20:55:51 2003 @@ -31,6 +31,7 @@ #include #include +#ifdef CONFIG_NUMA static void __init setup_pci_node_map_for_wpeg(int wpeg_num, struct rio_table_hdr *rth, struct scal_detail **scal_nodes, struct rio_detail **rio_nodes){ int twst_num = 0, node = 0, first_bus = 0; @@ -93,15 +94,21 @@ static void __init setup_pci_node_map_fo mp_bus_id_to_node[bus] = node; } -static void __init build_detail_arrays(struct rio_table_hdr *rth, +static int __init build_detail_arrays(struct rio_table_hdr *rth, struct scal_detail **sd, struct rio_detail **rd){ unsigned long ptr; int i, scal_detail_size, rio_detail_size; + if ((rth->num_scal_dev > MAX_NUMNODES) || + (rth->num_rio_dev > MAX_NUMNODES * 2)){ + printk("%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __FUNCTION__, MAX_NUMNODES, rth->num_scal_dev); + return 1; + } + switch (rth->version){ default: printk("%s: Bad Rio Grande Table Version: %d\n", __FUNCTION__, rth->version); - /* Fall through to default to version 2 spec */ + return 1; case 2: scal_detail_size = 11; rio_detail_size = 13; @@ -119,6 +126,8 @@ static void __init build_detail_arrays(s ptr += scal_detail_size * rth->num_scal_dev; for(i = 0; i < rth->num_rio_dev; i++) rd[i] = (struct rio_detail *)(ptr + (rio_detail_size * i)); + + return 0; } void __init setup_summit(void) @@ -152,11 +161,12 @@ void __init setup_summit(void) return; } - /* Deal with the ugly version 2/3 pointer arithmetic */ - build_detail_arrays(rio_table_hdr, scal_devs, rio_devs); + if (build_detail_arrays(rio_table_hdr, scal_devs, rio_devs)) + return; for(i = 0; i < rio_table_hdr->num_rio_dev; i++) if (is_WPEG(rio_devs[i]->type)) /* It's a Winnipeg, it's got PCI Busses */ setup_pci_node_map_for_wpeg(i, rio_table_hdr, scal_devs, rio_devs); } +#endif /* CONFIG_NUMA */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/kernel/traps.c 901-mjb1.1/arch/i386/kernel/traps.c --- 000-virgin/arch/i386/kernel/traps.c Wed Jul 2 21:59:04 2003 +++ 901-mjb1.1/arch/i386/kernel/traps.c Wed Aug 13 20:48:49 2003 @@ -30,6 +30,7 @@ #include #endif +#include #ifdef CONFIG_MCA #include #endif @@ -53,6 +54,24 @@ #include "mach_traps.h" +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + +#ifdef CONFIG_X86_REMOTE_DEBUG +gdb_debug_hook * linux_debug_hook; +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ + { \ + if (linux_debug_hook != (gdb_debug_hook *) NULL && !user_mode(regs)) \ + { \ + (*linux_debug_hook)(trapnr, signr, error_code, regs) ; \ + after; \ + } \ + } +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + asmlinkage int system_call(void); asmlinkage void lcall7(void); asmlinkage void lcall27(void); @@ -68,7 +87,9 @@ char ignore_fpu_irq = 0; * F0 0F bug workaround.. 
We have a special link segment * for this. */ -struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; + +struct desc_struct node_idt_table[MAX_NUMNODES][IDT_ENTRIES] __attribute__((__section__(".data.idt"))) = + {[0 ... MAX_NUMNODES-1] = { {0, 0}, }}; asmlinkage void divide_error(void); asmlinkage void debug(void); @@ -258,6 +279,7 @@ void die(const char * str, struct pt_reg bust_spinlocks(1); handle_BUG(regs); printk("%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); + CHK_REMOTE_DEBUG(1,SIGTRAP,err,regs,); show_registers(regs); bust_spinlocks(0); spin_unlock_irq(&die_lock); @@ -327,6 +349,7 @@ static inline void do_trap(int trapnr, i #define DO_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,)\ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -344,7 +367,9 @@ asmlinkage void do_##name(struct pt_regs #define DO_VM86_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,return)\ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ + return; \ } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ @@ -387,8 +412,10 @@ gp_in_vm86: return; gp_in_kernel: - if (!fixup_exception(regs)) + if (!fixup_exception(regs)) { + CHK_REMOTE_DEBUG(13,SIGSEGV,error_code,regs,) die("general protection fault", regs, error_code); + } } static void mem_parity_error(unsigned char reason, struct pt_regs * regs) @@ -550,8 +577,10 @@ asmlinkage void do_debug(struct pt_regs * allowing programs to debug themselves without the ptrace() * interface. */ +#ifndef CONFIG_X86_REMOTE_DEBUG if ((regs->xcs & 3) == 0) goto clear_TF_reenable; +#endif if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) goto clear_TF; } @@ -563,11 +592,13 @@ asmlinkage void do_debug(struct pt_regs info.si_errno = 0; info.si_code = TRAP_BRKPT; - /* If this is a kernel mode trap, save the user PC on entry to - * the kernel, that's what the debugger can make sense of. - */ - info.si_addr = ((regs->xcs & 3) == 0) ? (void *)tsk->thread.eip : - (void *)regs->eip; + + /* If this is a kernel mode trap, we need to reset db7 to allow us + * to continue sanely */ + if ((regs->xcs & 3) == 0) + goto clear_dr7; + + info.si_addr = (void *)regs->eip; force_sig_info(SIGTRAP, &info, tsk); /* Disable additional traps. They'll be re-enabled when @@ -577,13 +608,16 @@ clear_dr7: __asm__("movl %0,%%db7" : /* no output */ : "r" (0)); + CHK_REMOTE_DEBUG(1,SIGTRAP,error_code,regs,) return; debug_vm86: handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); return; +#ifndef CONFIG_X86_REMOTE_DEBUG clear_TF_reenable: +#endif set_tsk_thread_flag(tsk, TIF_SINGLESTEP); clear_TF: regs->eflags &= ~TF_MASK; @@ -773,14 +807,16 @@ asmlinkage void math_emulate(long arg) #ifdef CONFIG_X86_F00F_BUG void __init trap_init_f00f_bug(void) { - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + int node = cpu_to_node(smp_processor_id()); + + __set_fixmap(FIX_F00F_IDT, __pa(&node_idt_table[node]), PAGE_KERNEL_RO); /* * Update the IDT descriptor and reload the IDT so that * it uses the read-only mapped virtual address. 
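
The per-node descriptor array reloaded just below, node_idt_descr, is laid out in the head.S hunk earlier in this patch as one 8-byte slot per node: a 16-bit limit (IDT_ENTRIES*8-1) followed by the 32-bit base of that node's node_idt_table. A rough C-side sketch of that layout and of what the setup_cpu_idt() call added to start_secondary() presumably does -- the structure tag and the function body are assumptions, only the .address member and the lidt reload pattern are shown by the patch:

	struct node_idt_desc {
		unsigned short size;		/* IDT_ENTRIES * 8 - 1          */
		unsigned long  address;		/* base of node_idt_table[node] */
	} __attribute__((packed));

	extern struct node_idt_desc node_idt_descr[];	/* laid out in head.S */

	/* illustrative guess at the per-CPU switch to the node's IDT */
	static void setup_cpu_idt(void)
	{
		int node = cpu_to_node(smp_processor_id());

		__asm__ __volatile__("lidt %0" : : "m" (node_idt_descr[node]));
	}
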
*/ - idt_descr.address = fix_to_virt(FIX_F00F_IDT); - __asm__ __volatile__("lidt %0": "=m" (idt_descr)); + node_idt_descr[node].address = fix_to_virt(FIX_F00F_IDT); + __asm__ __volatile__("lidt %0": "=m" (node_idt_descr[node])); } #endif @@ -799,24 +835,36 @@ do { \ /* - * This needs to use 'idt_table' rather than 'idt', and + * This needs to use 'node_idt_table' rather than 'idt', and * thus use the _nonmapped_ version of the IDT, as the * Pentium F0 0F bugfix can have resulted in the mapped * IDT being write-protected. */ + +void node_set_intr_gate(unsigned int node, unsigned int n, void *addr) +{ + _set_gate(&node_idt_table[node][n],14,0,addr,__KERNEL_CS); +} + void set_intr_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,14,0,addr,__KERNEL_CS); + int node; + for (node = 0; node < MAX_NUMNODES; node++) + node_set_intr_gate(node, n, addr); } static void __init set_trap_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,15,0,addr,__KERNEL_CS); + int node; + for (node = 0; node < MAX_NUMNODES; node++) + _set_gate(&node_idt_table[node][n],15,0,addr,__KERNEL_CS); } static void __init set_system_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,15,3,addr,__KERNEL_CS); + int node; + for (node = 0; node < MAX_NUMNODES; node++) + _set_gate(&node_idt_table[node][n],15,3,addr,__KERNEL_CS); } static void __init set_call_gate(void *a, void *addr) @@ -826,7 +874,9 @@ static void __init set_call_gate(void *a static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) { - _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); + int node; + for (node = 0; node < MAX_NUMNODES; node++) + _set_gate(&node_idt_table[node][n],5,0,0,(gdt_entry<<3)); } @@ -877,6 +927,9 @@ void __init trap_init(void) */ set_call_gate(&default_ldt[0],lcall7); set_call_gate(&default_ldt[4],lcall27); + + /* setup the pernode idt tables */ + setup_node_idts(); /* * Should be a barrier for any external CPU state. diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/lib/dec_and_lock.c 901-mjb1.1/arch/i386/lib/dec_and_lock.c --- 000-virgin/arch/i386/lib/dec_and_lock.c Sun Nov 17 20:29:28 2002 +++ 901-mjb1.1/arch/i386/lib/dec_and_lock.c Wed Aug 13 20:29:36 2003 @@ -10,6 +10,7 @@ #include #include +#ifndef ATOMIC_DEC_AND_LOCK int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { int counter; @@ -38,3 +39,5 @@ slow_path: spin_unlock(lock); return 0; } +#endif + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/mm/fault.c 901-mjb1.1/arch/i386/mm/fault.c --- 000-virgin/arch/i386/mm/fault.c Fri May 30 19:01:59 2003 +++ 901-mjb1.1/arch/i386/mm/fault.c Wed Aug 13 20:48:49 2003 @@ -2,6 +2,11 @@ * linux/arch/i386/mm/fault.c * * Copyright (C) 1995 Linus Torvalds + * + * Change History + * + * Tigran Aivazian Remote debugging support. 
+ * */ #include @@ -20,6 +25,9 @@ #include #include /* For unblank_screen() */ #include +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif #include #include @@ -112,6 +120,15 @@ asmlinkage void do_page_fault(struct pt_ if (in_atomic() || !mm) goto no_context; +#ifdef CONFIG_X86_REMOTE_DEBUG + if (kgdb_memerr_expected) { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs) ; + return; /* return w/modified regs */ + } + } +#endif + down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -211,14 +228,27 @@ bad_area: return; } +#ifdef CONFIG_X86_REMOTE_DEBUG + if (kgdb_memerr_expected) { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs); + return; /* Return with modified registers */ + } + } else { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs); + } + } +#endif + #ifdef CONFIG_X86_F00F_BUG /* * Pentium F0 0F C7 C8 bug workaround. */ if (boot_cpu_data.f00f_bug) { - unsigned long nr; - - nr = (address - idt_descr.address) >> 3; + unsigned long nr, node; + node = cpu_to_node(smp_processor_id()); + nr = (address - node_idt_descr[node].address) >> 3; if (nr == 6) { do_invalid_op(regs, 0); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/mm/hugetlbpage.c 901-mjb1.1/arch/i386/mm/hugetlbpage.c --- 000-virgin/arch/i386/mm/hugetlbpage.c Tue Jun 24 21:29:16 2003 +++ 901-mjb1.1/arch/i386/mm/hugetlbpage.c Wed Aug 13 20:51:52 2003 @@ -61,6 +61,27 @@ static struct page *alloc_fresh_huge_pag void free_huge_page(struct page *page); +#ifdef CONFIG_NUMA + +static inline void huge_inc_rss(struct mm_struct *mm, struct page *page) +{ + mm->rss += (HPAGE_SIZE / PAGE_SIZE); + mm->pernode_rss[page_to_nid(page)] += (HPAGE_SIZE / PAGE_SIZE); +} + +static inline void huge_dec_rss(struct mm_struct *mm, struct page *page) +{ + mm->rss -= (HPAGE_SIZE / PAGE_SIZE); + mm->pernode_rss[page_to_nid(page)] -= (HPAGE_SIZE / PAGE_SIZE); +} + +#else /* !CONFIG_NUMA */ + +#define huge_inc_rss(mm, page) ((mm)->rss += (HPAGE_SIZE / PAGE_SIZE)) +#define huge_dec_rss(mm, page) ((mm)->rss -= (HPAGE_SIZE / PAGE_SIZE)) + +#endif /* CONFIG_NUMA */ + static struct page *alloc_hugetlb_page(void) { int i; @@ -105,7 +126,7 @@ static void set_huge_pte(struct mm_struc { pte_t entry; - mm->rss += (HPAGE_SIZE / PAGE_SIZE); + huge_inc_rss(mm, page); if (write_access) { entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); @@ -145,7 +166,7 @@ int copy_hugetlb_page_range(struct mm_st ptepage = pte_page(entry); get_page(ptepage); set_pte(dst_pte, entry); - dst->rss += (HPAGE_SIZE / PAGE_SIZE); + huge_inc_rss(dst, ptepage); addr += HPAGE_SIZE; } return 0; @@ -314,8 +335,8 @@ void unmap_hugepage_range(struct vm_area page = pte_page(*pte); huge_page_release(page); pte_clear(pte); + huge_dec_rss(mm, page); } - mm->rss -= (end - start) >> PAGE_SHIFT; flush_tlb_range(vma, start, end); } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/mm/init.c 901-mjb1.1/arch/i386/mm/init.c --- 000-virgin/arch/i386/mm/init.c Wed Aug 13 20:24:18 2003 +++ 901-mjb1.1/arch/i386/mm/init.c Wed Aug 13 20:51:03 2003 @@ -121,6 +121,24 @@ static void __init page_table_range_init } } + +/* + * Abstract out using large pages when mapping KVA, or the SMP identity + * mapping + */ +void pmd_map_pfn_range(pmd_t* pmd_entry, unsigned long pfn, unsigned long max_pfn) +{ + int pte_ofs; + /* Map with big pages if possible, otherwise create normal page tables. 
*/ + if (cpu_has_pse) { + set_pmd(pmd_entry, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); + pfn += PTRS_PER_PTE; + } else { + pte_t* pte = one_page_table_init(pmd_entry); + for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_pfn; pte++, pfn++, pte_ofs++) + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); + } +} /* * This maps the physical memory to kernel virtual address space, a total * of max_low_pfn pages, by creating page tables starting from address @@ -131,8 +149,7 @@ static void __init kernel_physical_mappi unsigned long pfn; pgd_t *pgd; pmd_t *pmd; - pte_t *pte; - int pgd_idx, pmd_idx, pte_ofs; + int pgd_idx, pmd_idx; pgd_idx = pgd_index(PAGE_OFFSET); pgd = pgd_base + pgd_idx; @@ -142,21 +159,48 @@ static void __init kernel_physical_mappi pmd = one_md_table_init(pgd); if (pfn >= max_low_pfn) continue; - for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { - /* Map with big pages if possible, otherwise create normal page tables. */ - if (cpu_has_pse) { - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); - pfn += PTRS_PER_PTE; - } else { - pte = one_page_table_init(pmd); - - for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); - } + + /* beware of starting KVA in the middle of a pmd. */ + if( pgd_idx == pgd_index(PAGE_OFFSET) ) { + pmd_idx = pmd_index(PAGE_OFFSET); + pmd = &pmd[pmd_idx]; + } else + pmd_idx = 0; + + for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { + pmd_map_pfn_range(pmd, pfn, max_low_pfn); + pfn += PTRS_PER_PTE; } } } +/* + * Add low memory identity-mappings - SMP needs it when + * starting up on an AP from real-mode. In the non-PAE + * case we already have these mappings through head.S. + * All user-space mappings are explicitly cleared after + * SMP startup in zap_low_mappings(). + */ +static void __init low_physical_mapping_init(pgd_t *pgd_base) +{ +#if CONFIG_X86_PAE + unsigned long pfn = 0; + int pmd_ofs = 0; + pmd_t *pmd = one_md_table_init(pgd_base); + + if(!cpu_has_pse) { + printk("PAE enabled, but no support for PSE (large pages)!\n"); + printk("this is likely to waste some RAM."); + } + + for (; pmd_ofs < PTRS_PER_PMD && pfn <= max_low_pfn; pmd++, pmd_ofs++) { + pmd_map_pfn_range(pmd, pfn, max_low_pfn); + pfn += PTRS_PER_PTE; + } +#endif +} + + static inline int page_kills_ppro(unsigned long pagenr) { if (pagenr >= 0x70000 && pagenr <= 0x7003F) @@ -217,7 +261,7 @@ void __init permanent_kmaps_init(pgd_t * pgd = swapper_pg_dir + pgd_index(vaddr); pmd = pmd_offset(pgd, vaddr); pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; + pkmap_page_table = pte; } void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) @@ -282,6 +326,7 @@ static void __init pagetable_init (void) } kernel_physical_mapping_init(pgd_base); + low_physical_mapping_init(pgd_base); remap_numa_kva(); /* @@ -290,19 +335,7 @@ static void __init pagetable_init (void) */ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; page_table_range_init(vaddr, 0, pgd_base); - permanent_kmaps_init(pgd_base); - -#ifdef CONFIG_X86_PAE - /* - * Add low memory identity-mappings - SMP needs it when - * starting up on an AP from real-mode. In the non-PAE - * case we already have these mappings through head.S. - * All user-space mappings are explicitly cleared after - * SMP startup. 
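
Whether or not PSE is available, pmd_map_pfn_range() above consumes exactly PTRS_PER_PTE page frames per pmd slot (one large page, or one fully populated page table), which is why its callers advance pfn by PTRS_PER_PTE after each call. A quick back-of-the-envelope check of what one pmd entry covers, assuming 4K pages (PTRS_PER_PTE is 1024 without PAE, 512 with PAE):

	#include <stdio.h>

	int main(void)
	{
		unsigned long page_size = 4096;
		unsigned long ptrs_per_pte[] = { 1024, 512 };	/* !PAE, PAE */
		int i;

		for (i = 0; i < 2; i++)
			printf("%4s: one pmd entry maps %lu MB\n",
			       i ? "PAE" : "!PAE",
			       ptrs_per_pte[i] * page_size >> 20);
		return 0;
	}
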
- */ - pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; -#endif } void zap_low_mappings (void) @@ -314,7 +347,7 @@ void zap_low_mappings (void) * Note that "pgd_clear()" doesn't do it for * us, because pgd_clear() is a no-op on i386. */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) + for (i = 0; i < FIRST_KERNEL_PGD_PTR; i++) #ifdef CONFIG_X86_PAE set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); #else @@ -511,6 +544,7 @@ void __init mem_init(void) kmem_cache_t *pgd_cache; kmem_cache_t *pmd_cache; +kmem_cache_t *kernel_pmd_cache; void __init pgtable_cache_init(void) { @@ -523,6 +557,15 @@ void __init pgtable_cache_init(void) NULL); if (!pmd_cache) panic("pgtable_cache_init(): cannot create pmd cache"); + + kernel_pmd_cache = kmem_cache_create("pae_kernel_pmd", + (PTRS_PER_PMD*sizeof(pmd_t))*KERNEL_PGD_PTRS, + 0, + SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, + kernel_pmd_ctor, + NULL); + if (!kernel_pmd_cache) + panic("pgtable_cache_init(): cannot create kernel pmd cache"); } pgd_cache = kmem_cache_create("pgd", PTRS_PER_PGD*sizeof(pgd_t), diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/mm/pgtable.c 901-mjb1.1/arch/i386/mm/pgtable.c --- 000-virgin/arch/i386/mm/pgtable.c Wed Aug 13 20:24:18 2003 +++ 901-mjb1.1/arch/i386/mm/pgtable.c Wed Aug 13 20:51:03 2003 @@ -157,6 +157,28 @@ void pmd_ctor(void *pmd, kmem_cache_t *c memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); } +void kernel_pmd_ctor(void *__pmd, kmem_cache_t *kernel_pmd_cache, unsigned long flags) +{ + pmd_t *pmd = __pmd; + int i; + + /* + * you only need to memset the portion which isn't used by + * the kernel + */ + clear_page(__pmd); + + for (i=FIRST_KERNEL_PGD_PTR; i= 0; i--) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + for (i--; i >= 0; i--) { + pmd_t *pmd = pmd_offset(&pgd[i],0); + kmem_cache_free(pmd_cache, pmd); + } kmem_cache_free(pgd_cache, pgd); return NULL; } @@ -231,9 +264,18 @@ void pgd_free(pgd_t *pgd) int i; /* in the PAE case user pgd entries are overwritten before usage */ - if (PTRS_PER_PMD > 1) - for (i = 0; i < USER_PTRS_PER_PGD; ++i) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + if (PTRS_PER_PMD > 1) { + for (i = 0; i < PTRS_PER_PGD; i++) { + pmd_t *pmd_to_free = pmd_offset(&pgd[i],0); + + set_pgd(&pgd[i], __pgd(0)); + + if (i < FIRST_KERNEL_PGD_PTR) + kmem_cache_free(pmd_cache, pmd_to_free); + else if (i == FIRST_KERNEL_PGD_PTR) + kmem_cache_free(kernel_pmd_cache, pmd_to_free); + } + } /* in the non-PAE case, clear_page_tables() clears user pgd entries */ kmem_cache_free(pgd_cache, pgd); } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/i386/vmlinux.lds.S 901-mjb1.1/arch/i386/vmlinux.lds.S --- 000-virgin/arch/i386/vmlinux.lds.S Sat Jun 14 18:37:24 2003 +++ 901-mjb1.1/arch/i386/vmlinux.lds.S Wed Aug 13 20:27:43 2003 @@ -10,7 +10,7 @@ ENTRY(startup_32) jiffies = jiffies_64; SECTIONS { - . = 0xC0000000 + 0x100000; + . = __PAGE_OFFSET + 0x100000; /* read-only */ _text = .; /* Text and read-only data */ .text : { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/ppc/Kconfig 901-mjb1.1/arch/ppc/Kconfig --- 000-virgin/arch/ppc/Kconfig Wed Aug 13 20:24:20 2003 +++ 901-mjb1.1/arch/ppc/Kconfig Wed Aug 13 20:51:56 2003 @@ -1378,6 +1378,36 @@ source "net/bluetooth/Kconfig" source "lib/Kconfig" +menu "GCOV coverage profiling" + +config GCOV_PROFILE + bool "GCOV coverage profiling" + ---help--- + Provide infrastructure for coverage support for the kernel. This + will not compile the kernel by default with the necessary flags. 
+ To obtain coverage information for the entire kernel, one should + enable the subsequent option (Profile entire kernel). If only + particular files or directories of the kernel are desired, then + one must provide the following compile options for such targets: + "-fprofile-arcs -ftest-coverage" in the CFLAGS. To obtain + access to the coverage data one must insmod the gcov-prof kernel + module. + +config GCOV_ALL + bool "GCOV_ALL" + depends on GCOV_PROFILE + ---help--- + If you say Y here, it will compile the entire kernel with coverage + option enabled. + +config GCOV_PROC + tristate "gcov-proc module" + depends on GCOV_PROFILE && PROC_FS + ---help--- + This is the gcov-proc module that exposes gcov data through the + /proc filesystem + +endmenu menu "Kernel hacking" diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/ppc/boot/openfirmware/common.c 901-mjb1.1/arch/ppc/boot/openfirmware/common.c --- 000-virgin/arch/ppc/boot/openfirmware/common.c Sun Nov 17 20:29:52 2002 +++ 901-mjb1.1/arch/ppc/boot/openfirmware/common.c Wed Aug 13 20:51:56 2003 @@ -30,6 +30,10 @@ struct memchunk { static struct memchunk *freechunks; +#ifdef CONFIG_GCOV_PROFILE +void __bb_init_func (void *ptr /* struct bb *blocks */) { } +#endif + static void *zalloc(void *x, unsigned items, unsigned size) { void *p; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/ppc/boot/prep/misc.c 901-mjb1.1/arch/ppc/boot/prep/misc.c --- 000-virgin/arch/ppc/boot/prep/misc.c Thu Jan 9 19:15:57 2003 +++ 901-mjb1.1/arch/ppc/boot/prep/misc.c Wed Aug 13 20:51:56 2003 @@ -71,6 +71,10 @@ extern unsigned long serial_init(int cha extern void serial_fixups(void); extern unsigned long get_mem_size(void); +#ifdef CONFIG_GCOV_PROFILE +void __bb_init_func (void *ptr /* struct bb *blocks */) { } +#endif + void writel(unsigned int val, unsigned int address) { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/ppc/kernel/Makefile 901-mjb1.1/arch/ppc/kernel/Makefile --- 000-virgin/arch/ppc/kernel/Makefile Fri May 30 19:02:00 2003 +++ 901-mjb1.1/arch/ppc/kernel/Makefile Wed Aug 13 20:51:56 2003 @@ -15,8 +15,8 @@ extra-$(CONFIG_40x) := head_4xx.o extra-$(CONFIG_8xx) := head_8xx.o extra-$(CONFIG_6xx) += idle_6xx.o -obj-y := entry.o traps.o irq.o idle.o time.o misc.o \ - process.o signal.o ptrace.o align.o \ +obj-y := entry.o ptrace.o traps.o irq.o idle.o time.o misc.o \ + process.o signal.o align.o \ semaphore.o syscalls.o setup.o \ cputable.o ppc_htab.o obj-$(CONFIG_6xx) += l2cr.o cpu_setup_6xx.o diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/ppc/kernel/entry.S 901-mjb1.1/arch/ppc/kernel/entry.S --- 000-virgin/arch/ppc/kernel/entry.S Wed Jul 2 21:59:07 2003 +++ 901-mjb1.1/arch/ppc/kernel/entry.S Wed Aug 13 20:51:56 2003 @@ -106,10 +106,26 @@ transfer_to_handler: mfspr r11,SPRN_HID0 mtcr r11 BEGIN_FTR_SECTION +#ifdef CONFIG_GCOV_PROFILE + bt- 8,near1_power_save_6xx_restore /* Check DOZE */ + b skip1_power_save_6xx_restore +near1_power_save_6xx_restore: + b power_save_6xx_restore +skip1_power_save_6xx_restore: +#else bt- 8,power_save_6xx_restore /* Check DOZE */ +#endif END_FTR_SECTION_IFSET(CPU_FTR_CAN_DOZE) BEGIN_FTR_SECTION +#ifdef CONFIG_GCOV_PROFILE + bt- 9,near2_power_save_6xx_restore /* Check NAP */ + b skip2_power_save_6xx_restore +near2_power_save_6xx_restore: + b power_save_6xx_restore +skip2_power_save_6xx_restore: +#else bt- 9,power_save_6xx_restore /* Check NAP */ +#endif END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) #endif /* CONFIG_6xx */ .globl transfer_to_handler_cont diff -urpN -X /home/fletch/.diff.exclude 
000-virgin/arch/ppc/kernel/head.S 901-mjb1.1/arch/ppc/kernel/head.S --- 000-virgin/arch/ppc/kernel/head.S Sat Jun 14 18:37:25 2003 +++ 901-mjb1.1/arch/ppc/kernel/head.S Wed Aug 13 20:51:56 2003 @@ -1643,3 +1643,25 @@ intercept_table: */ abatron_pteptrs: .space 8 + +#ifdef CONFIG_GCOV_PROFILE +/* + * The .ctors-section contains a list of pointers to constructor + * functions which are used to initialize gcov structures. + * + * Because there is no NULL at the end of the constructor list + * in the kernel we need the addresses of both the constructor + * as well as the destructor list which are supposed to be + * adjacent. + */ + +.section ".ctors","aw" +.globl __CTOR_LIST__ +.type __CTOR_LIST__,@object +__CTOR_LIST__: +.section ".dtors","aw" +.globl __DTOR_LIST__ +.type __DTOR_LIST__,@object +__DTOR_LIST__: +#endif + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/ppc/syslib/prom_init.c 901-mjb1.1/arch/ppc/syslib/prom_init.c --- 000-virgin/arch/ppc/syslib/prom_init.c Tue Aug 5 20:01:48 2003 +++ 901-mjb1.1/arch/ppc/syslib/prom_init.c Wed Aug 13 20:51:56 2003 @@ -667,7 +667,11 @@ prom_instantiate_rtas(void) * Actually OF has bugs so we just arbitrarily * use memory at the 6MB point. */ +#ifdef CONFIG_GCOV_PROFILE + rtas_data = 0x990000; +#else rtas_data = 6 << 20; +#endif prom_print(" at "); prom_print_hex(rtas_data); } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/ppc64/Kconfig 901-mjb1.1/arch/ppc64/Kconfig --- 000-virgin/arch/ppc64/Kconfig Wed Aug 13 20:24:20 2003 +++ 901-mjb1.1/arch/ppc64/Kconfig Wed Aug 13 20:51:56 2003 @@ -342,6 +342,37 @@ config VIOPATH source "arch/ppc64/oprofile/Kconfig" +menu "GCOV coverage profiling" + +config GCOV_PROFILE + bool "GCOV coverage profiling" + ---help--- + Provide infrastructure for coverage support for the kernel. This + will not compile the kernel by default with the necessary flags. + To obtain coverage information for the entire kernel, one should + enable the subsequent option (Profile entire kernel). If only + particular files or directories of the kernel are desired, then + one must provide the following compile options for such targets: + "-fprofile-arcs -ftest-coverage" in the CFLAGS. To obtain + access to the coverage data one must insmod the gcov-prof kernel + module. + +config GCOV_ALL + bool "GCOV_ALL" + depends on GCOV_PROFILE + ---help--- + If you say Y here, it will compile the entire kernel with coverage + option enabled. + +config GCOV_PROC + tristate "gcov-proc module" + depends on GCOV_PROFILE && PROC_FS + ---help--- + This is the gcov-proc module that exposes gcov data through the + /proc filesystem + +endmenu + menu "Kernel hacking" config DEBUG_KERNEL diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/ppc64/kernel/head.S 901-mjb1.1/arch/ppc64/kernel/head.S --- 000-virgin/arch/ppc64/kernel/head.S Sat Jun 14 18:37:25 2003 +++ 901-mjb1.1/arch/ppc64/kernel/head.S Wed Aug 13 20:51:56 2003 @@ -2015,3 +2015,24 @@ stab_array: .globl cmd_line cmd_line: .space 512 + +#ifdef CONFIG_GCOV_PROFILE +/* + * The .ctors-section contains a list of pointers to constructor + * functions which are used to initialize gcov structures. + * + * Because there is no NULL at the end of the constructor list + * in the kernel we need the addresses of both the constructor + * as well as the destructor list which are supposed to be + * adjacent. 
+ */ + +.section ".ctors","aw" +.globl __CTOR_LIST__ +.type __CTOR_LIST__,@object +__CTOR_LIST__: +.section ".dtors","aw" +.globl __DTOR_LIST__ +.type __DTOR_LIST__,@object +__DTOR_LIST__: +#endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/sparc64/kernel/devices.c 901-mjb1.1/arch/sparc64/kernel/devices.c --- 000-virgin/arch/sparc64/kernel/devices.c Sat May 10 18:34:35 2003 +++ 901-mjb1.1/arch/sparc64/kernel/devices.c Wed Aug 13 20:29:36 2003 @@ -31,6 +31,8 @@ int linux_num_cpus = 0; extern void cpu_probe(void); extern void central_probe(void); +unsigned long cpu_hz; + void __init device_scan(void) { char node_str[128]; @@ -68,6 +70,8 @@ void __init device_scan(void) prom_getproperty(scan, "portid", (char *) &thismid, sizeof(thismid)); } + if (!cpu_hz) + cpu_hz = prom_getint(scan, "clock-frequency"); linux_cpus[cpu_ctr].mid = thismid; printk("Found CPU %d (node=%08x,mid=%d)\n", cpu_ctr, (unsigned) scan, thismid); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/sparc64/kernel/rtrap.S 901-mjb1.1/arch/sparc64/kernel/rtrap.S --- 000-virgin/arch/sparc64/kernel/rtrap.S Sat May 10 18:34:35 2003 +++ 901-mjb1.1/arch/sparc64/kernel/rtrap.S Wed Aug 13 20:29:41 2003 @@ -15,6 +15,10 @@ #include #include +#ifndef CONFIG_KGDB_THREAD +#define user_schedule schedule +#endif + #define RTRAP_PSTATE (PSTATE_RMO|PSTATE_PEF|PSTATE_PRIV|PSTATE_IE) #define RTRAP_PSTATE_IRQOFF (PSTATE_RMO|PSTATE_PEF|PSTATE_PRIV) #define RTRAP_PSTATE_AG_IRQOFF (PSTATE_RMO|PSTATE_PEF|PSTATE_PRIV|PSTATE_AG) @@ -33,7 +37,7 @@ __handle_softirq: ba,a,pt %xcc, __handle_softirq_continue nop __handle_preemption: - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate ba,pt %xcc, __handle_preemption_continue wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate @@ -48,7 +52,7 @@ __handle_user_windows: be,pt %xcc, 1f nop - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate ldx [%g6 + TI_FLAGS], %l0 @@ -92,7 +96,7 @@ __handle_perfctrs: be,pt %xcc, 1f nop - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate ldx [%g6 + TI_FLAGS], %l0 @@ -273,7 +277,7 @@ to_kernel: sethi %hi(PREEMPT_ACTIVE), %l6 stw %l6, [%g6 + TI_PRE_COUNT] wrpr 0, %pil - call schedule + call user_schedule nop ba,pt %xcc, rtrap stw %g0, [%g6 + TI_PRE_COUNT] diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/sparc64/lib/rwlock.S 901-mjb1.1/arch/sparc64/lib/rwlock.S --- 000-virgin/arch/sparc64/lib/rwlock.S Sun Nov 17 20:29:44 2002 +++ 901-mjb1.1/arch/sparc64/lib/rwlock.S Wed Aug 13 20:29:36 2003 @@ -63,5 +63,33 @@ __write_lock: /* %o0 = lock_ptr */ be,pt %icc, 99b membar #StoreLoad | #StoreStore ba,a,pt %xcc, 1b + + .globl __read_trylock +__read_trylock: /* %o0 = lock_ptr */ + ldsw [%o0], %g5 + brlz,pn %g5, 100f + add %g5, 1, %g7 + cas [%o0], %g5, %g7 + cmp %g5, %g7 + bne,pn %icc, __read_trylock + membar #StoreLoad | #StoreStore + retl + mov 1, %o0 + + .globl __write_trylock +__write_trylock: /* %o0 = lock_ptr */ + sethi %hi(0x80000000), %g2 +1: lduw [%o0], %g5 +4: brnz,pn %g5, 100f + or %g5, %g2, %g7 + cas [%o0], %g5, %g7 + cmp %g5, %g7 + bne,pn %icc, 1b + membar #StoreLoad | #StoreStore + retl + mov 1, %o0 +100: retl + mov 0, %o0 + rwlock_impl_end: diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/x86_64/Kconfig 901-mjb1.1/arch/x86_64/Kconfig --- 000-virgin/arch/x86_64/Kconfig Wed Aug 13 20:24:21 2003 +++ 901-mjb1.1/arch/x86_64/Kconfig Wed Aug 13 20:51:56 2003 @@ -451,6 +451,37 @@ source "net/bluetooth/Kconfig" source 
"arch/x86_64/oprofile/Kconfig" +menu "GCOV coverage profiling" + +config GCOV_PROFILE + bool "GCOV coverage profiling" + ---help--- + Provide infrastructure for coverage support for the kernel. This + will not compile the kernel by default with the necessary flags. + To obtain coverage information for the entire kernel, one should + enable the subsequent option (Profile entire kernel). If only + particular files or directories of the kernel are desired, then + one must provide the following compile options for such targets: + "-fprofile-arcs -ftest-coverage" in the CFLAGS. To obtain + access to the coverage data one must insmod the gcov-prof kernel + module. + +config GCOV_ALL + bool "GCOV_ALL" + depends on GCOV_PROFILE + ---help--- + If you say Y here, it will compile the entire kernel with coverage + option enabled. + +config GCOV_PROC + tristate "gcov-proc module" + depends on GCOV_PROFILE && PROC_FS + ---help--- + This is the gcov-proc module that exposes gcov data through the + /proc filesystem + +endmenu + menu "Kernel hacking" config DEBUG_KERNEL diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/x86_64/kernel/entry.S 901-mjb1.1/arch/x86_64/kernel/entry.S --- 000-virgin/arch/x86_64/kernel/entry.S Fri May 30 19:02:02 2003 +++ 901-mjb1.1/arch/x86_64/kernel/entry.S Wed Aug 13 20:29:41 2003 @@ -46,6 +46,10 @@ #define PDAREF(field) %gs:field +#ifndef CONFIG_KGDB_THREAD +#define user_schedule schedule +#endif + #ifdef CONFIG_PREEMPT #define preempt_stop cli #else @@ -187,7 +191,7 @@ sysret_careful: jnc sysret_signal sti pushq %rdi - call schedule + call user_schedule popq %rdi jmp sysret_check @@ -256,7 +260,7 @@ int_careful: jnc int_very_careful sti pushq %rdi - call schedule + call user_schedule popq %rdi jmp int_with_check @@ -426,7 +430,7 @@ retint_careful: jnc retint_signal sti pushq %rdi - call schedule + call user_schedule popq %rdi GET_THREAD_INFO(%rcx) cli @@ -460,7 +464,7 @@ retint_kernel: jc retint_restore_args movl $PREEMPT_ACTIVE,threadinfo_preempt_count(%rcx) sti - call schedule + call user_schedule cli GET_THREAD_INFO(%rcx) movl $0,threadinfo_preempt_count(%rcx) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/arch/x86_64/kernel/head.S 901-mjb1.1/arch/x86_64/kernel/head.S --- 000-virgin/arch/x86_64/kernel/head.S Wed Aug 13 20:24:21 2003 +++ 901-mjb1.1/arch/x86_64/kernel/head.S Wed Aug 13 20:51:56 2003 @@ -383,3 +383,23 @@ ENTRY(idt_table) .quad 0 .endr +#ifdef CONFIG_GCOV_PROFILE +/* + * The .ctors-section contains a list of pointers to constructor + * functions which are used to initialize gcov structures. + * + * Because there is no NULL at the end of the constructor list + * in the kernel we need the addresses of both the constructor + * as well as the destructor list which are supposed to be + * adjacent. 
+ */ + +.section ".ctors","aw" +.globl __CTOR_LIST__ +.type __CTOR_LIST__,@object +__CTOR_LIST__: +.section ".dtors","aw" +.globl __DTOR_LIST__ +.type __DTOR_LIST__,@object +__DTOR_LIST__: +#endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/Makefile 901-mjb1.1/drivers/Makefile --- 000-virgin/drivers/Makefile Wed Jul 2 21:59:08 2003 +++ 901-mjb1.1/drivers/Makefile Wed Aug 13 20:51:56 2003 @@ -49,3 +49,4 @@ obj-$(CONFIG_ISDN_BOOL) += isdn/ obj-$(CONFIG_MCA) += mca/ obj-$(CONFIG_EISA) += eisa/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_GCOV_PROC) += gcov/ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/char/Makefile 901-mjb1.1/drivers/char/Makefile --- 000-virgin/drivers/char/Makefile Sat Jun 14 18:37:27 2003 +++ 901-mjb1.1/drivers/char/Makefile Wed Aug 13 20:29:29 2003 @@ -25,6 +25,7 @@ obj-$(CONFIG_COMPUTONE) += ip2.o ip2main obj-$(CONFIG_RISCOM8) += riscom8.o obj-$(CONFIG_ISI) += isicom.o obj-$(CONFIG_ESPSERIAL) += esp.o +obj-$(CONFIG_X86_REMOTE_DEBUG) += gdbserial.o obj-$(CONFIG_SYNCLINK) += synclink.o obj-$(CONFIG_SYNCLINKMP) += synclinkmp.o obj-$(CONFIG_N_HDLC) += n_hdlc.o diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/char/gdbserial.c 901-mjb1.1/drivers/char/gdbserial.c --- 000-virgin/drivers/char/gdbserial.c Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/drivers/char/gdbserial.c Wed Aug 13 20:29:29 2003 @@ -0,0 +1,274 @@ +/* + * Serial interface GDB stub + * + * Written (hacked together) by David Grothe (dave@gcom.com) + * + * Modified by Scott Foehner (sfoehner@engr.sgi.com) to allow connect + * on boot-up + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#undef PRNT /* define for debug printing */ + +#define GDB_BUF_SIZE 512 /* power of 2, please */ + +static char gdb_buf[GDB_BUF_SIZE]; +static int gdb_buf_in_inx; +static atomic_t gdb_buf_in_cnt; +static int gdb_buf_out_inx; + +extern void set_debug_traps(void); /* GDB routine */ +extern int gdb_serial_setup(int ttyS, int baud, int *port, int *irq); +extern void shutdown_for_gdb(struct async_struct *info); + /* in serial.c */ + +int gdb_irq; +int gdb_port; +int gdb_ttyS = 1; /* Default: ttyS1 */ +int gdb_baud = 38400; +int gdb_enter = 0; /* Default: do not do gdb_hook on boot */ +int gdb_initialized = 0; + +static int initialized = -1; + +/* + * Get a byte from the hardware data buffer and return it + */ +static int +read_data_bfr(void) +{ + if (inb(gdb_port + UART_LSR) & UART_LSR_DR) + return (inb(gdb_port + UART_RX)); + + return (-1); + +} /* read_data_bfr */ + +/* + * Get a char if available, return -1 if nothing available. + * Empty the receive buffer first, then look at the interface hardware. + */ +static int +read_char(void) +{ + if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ + int chr; + + chr = gdb_buf[gdb_buf_out_inx++]; + gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); + atomic_dec(&gdb_buf_in_cnt); + return (chr); + } + + return (read_data_bfr()); /* read from hardware */ + +} /* read_char */ + +/* + * Wait until the interface can accept a char, then write it. + */ +static void +write_char(int chr) +{ + while (!(inb(gdb_port + UART_LSR) & UART_LSR_THRE)) ; + + outb(chr, gdb_port + UART_TX); + +} /* write_char */ + +/* + * This is the receiver interrupt routine for the GDB stub. 
+ * It will receive a limited number of characters of input + * from the gdb host machine and save them up in a buffer. + * + * When the gdb stub routine getDebugChar() is called it + * draws characters out of the buffer until it is empty and + * then reads directly from the serial port. + * + * We do not attempt to write chars from the interrupt routine + * since the stubs do all of that via putDebugChar() which + * writes one byte after waiting for the interface to become + * ready. + * + * The debug stubs like to run with interrupts disabled since, + * after all, they run as a consequence of a breakpoint in + * the kernel. + * + * Perhaps someone who knows more about the tty driver than I + * care to learn can make this work for any low level serial + * driver. + */ +static irqreturn_t +gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + int chr; + int iir; + + do { + chr = read_data_bfr(); + iir = inb(gdb_port + UART_IIR); +#ifdef PRNT + printk("gdb_interrupt: chr=%02x '%c' after read iir=%02x\n", + chr, chr > ' ' && chr < 0x7F ? chr : ' ', iir); +#endif + if (chr < 0) + continue; + + if (chr == 3) { /* Ctrl-C means remote interrupt */ + breakpoint(); + continue; + } + + if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { /* buffer overflow, clear it */ + gdb_buf_in_inx = 0; + atomic_set(&gdb_buf_in_cnt, 0); + gdb_buf_out_inx = 0; + break; + } + + gdb_buf[gdb_buf_in_inx++] = chr; + gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); + atomic_inc(&gdb_buf_in_cnt); + } + while (iir & UART_IIR_RDI); + return IRQ_HANDLED; +} /* gdb_interrupt */ + +/* + * Just a NULL routine for testing. + */ +void +gdb_null(void) +{ +} /* gdb_null */ + +extern int serial8250_init(void); + +int +gdb_hook(void) +{ + int retval; + +#ifdef CONFIG_SMP + if (NR_CPUS > KGDB_MAX_NO_CPUS) { + printk + ("kgdb: too manu cpus. Cannot enable debugger with more than 8 cpus\n"); + return (-1); + } +#endif + + /* + * Call first time just to get the ser ptr + */ + + serial8250_init(); + + if (gdb_serial_setup(gdb_ttyS, gdb_baud, &gdb_port, &gdb_irq)) { + printk("gdb_serial_setup() error"); + return (-1); + } + + retval = request_irq(gdb_irq, + gdb_interrupt, SA_INTERRUPT, "GDB-stub", NULL); + if (retval == 0) + initialized = 1; + else { + initialized = 0; + printk("gdb_hook: request_irq(irq=%d) failed: %d\n", gdb_irq, + retval); + } + + /* + * Call GDB routine to setup the exception vectors for the debugger + */ + set_debug_traps(); + + /* + * Call the breakpoint() routine in GDB to start the debugging + * session. + */ + printk("Waiting for connection from remote gdb... "); + breakpoint(); + gdb_null(); + + printk("Connected.\n"); + + gdb_initialized = 1; + return (0); + +} /* gdb_hook_interrupt2 */ + +/* + * getDebugChar + * + * This is a GDB stub routine. It waits for a character from the + * serial interface and then returns it. If there is no serial + * interface connection then it returns a bogus value which will + * almost certainly cause the system to hang. + */ +int +getDebugChar(void) +{ + volatile int chr; + +#ifdef PRNT + printk("getDebugChar: "); +#endif + + while ((chr = read_char()) < 0) + touch_nmi_watchdog(); + +#ifdef PRNT + printk("%c\n", chr > ' ' && chr < 0x7F ? chr : ' '); +#endif + return (chr); + +} /* getDebugChar */ + +/* + * putDebugChar + * + * This is a GDB stub routine. It waits until the interface is ready + * to transmit a char and then sends it. If there is no serial + * interface connection then it simply returns to its caller, having + * pretended to send the char. 
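+ *
+ * (Note: write_char() below busy-waits on UART_LSR_THRE before
+ * writing the byte to UART_TX, so this routine can spin for as long
+ * as the UART never reports its transmitter empty.)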
+ */ +void +putDebugChar(int chr) +{ +#ifdef PRNT + printk("putDebugChar: chr=%02x '%c'\n", chr, + chr > ' ' && chr < 0x7F ? chr : ' '); +#endif + + write_char(chr); /* this routine will wait */ + +} /* putDebugChar */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/char/sysrq.c 901-mjb1.1/drivers/char/sysrq.c --- 000-virgin/drivers/char/sysrq.c Fri May 30 19:02:05 2003 +++ 901-mjb1.1/drivers/char/sysrq.c Wed Aug 13 20:29:29 2003 @@ -134,6 +134,18 @@ static struct sysrq_key_op sysrq_mountro /* END SYNC SYSRQ HANDLERS BLOCK */ +#ifdef CONFIG_X86_REMOTE_DEBUG +static void sysrq_handle_gdb(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) { + int gdb_hook(void); + gdb_hook(); +} +static struct sysrq_key_op sysrq_gdb_op = { + handler: sysrq_handle_gdb, + help_msg: "Gdb", + action_msg: "Entering debugger", +}; +#endif /* SHOW SYSRQ HANDLERS BLOCK */ @@ -240,7 +252,11 @@ static struct sysrq_key_op *sysrq_key_ta /* d */ NULL, /* e */ &sysrq_term_op, /* f */ NULL, +#ifdef CONFIG_X86_REMOTE_DEBUG +/* g */ &sysrq_gdb_op, +#else /* CONFIG_X86_REMOTE_DEBUG */ /* g */ NULL, +#endif /* CONFIG_X86_REMOTE_DEBUG */ /* h */ NULL, /* i */ &sysrq_kill_op, /* j */ NULL, diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/char/tty_io.c 901-mjb1.1/drivers/char/tty_io.c --- 000-virgin/drivers/char/tty_io.c Wed Aug 13 20:24:22 2003 +++ 901-mjb1.1/drivers/char/tty_io.c Wed Aug 13 20:29:29 2003 @@ -91,6 +91,9 @@ #include #include #include +#ifdef CONFIG_GDB_CONSOLE +#include +#endif #include #include @@ -2190,6 +2193,13 @@ void tty_register_device(struct tty_driv devfs_mk_cdev(dev, S_IFCHR | S_IRUSR | S_IWUSR, "%s%d", driver->devfs_name, index + driver->name_base); + { + extern int kgdb_not_ready_yet; + + if (kgdb_not_ready_yet) + return; + } + /* we don't care about the ptys */ /* how nice to hide this behind some crappy interface.. */ if (driver->type != TTY_DRIVER_TYPE_PTY) { @@ -2415,6 +2425,9 @@ void __init console_init(void) (*call)(); call++; } +#ifdef CONFIG_GDB_CONSOLE + gdb_console_init(); +#endif } #ifdef CONFIG_VT diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/gcov/Makefile 901-mjb1.1/drivers/gcov/Makefile --- 000-virgin/drivers/gcov/Makefile Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/drivers/gcov/Makefile Wed Aug 13 20:51:56 2003 @@ -0,0 +1,8 @@ +# +# Makefile for GCOV profiling kernel module +# + +obj-$(CONFIG_GCOV_PROC) += gcov-proc.o + +$(obj)/gcov-proc.o: $(obj)/gcov-proc.c + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/gcov/gcov-proc.c 901-mjb1.1/drivers/gcov/gcov-proc.c --- 000-virgin/drivers/gcov/gcov-proc.c Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/drivers/gcov/gcov-proc.c Wed Aug 13 20:51:56 2003 @@ -0,0 +1,713 @@ +/* + * This kernel module provides access to coverage data produced by + * an instrumented kernel via an entry in the proc file system + * at /proc/gcov/. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (c) International Business Machines Corp., 2002 + * + * Author: Hubertus Franke + * Rajan Ravindran + * + * Bugfixes by Peter.Oberparleiter@de.ibm.com: + * Changes by Paul Larson + * Automatically detect gcc version for gcov_type + * + */ + +#include +#include +#include + +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +#define GCOV_PROF_PROC "gcov" + +static DECLARE_MUTEX_LOCKED(gcov_lock); +#define DOWN() down(&gcov_lock); +#define UP() up(&gcov_lock); +#define PAD8(x) ((x + 7) & ~7) + +//#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,4)) +//static inline struct proc_dir_entry *PDE(const struct inode *inode) +//{ +// return ((struct proc_dir_entry *) inode->u.generic_ip); +//} +//#endif + +/* ################################################################### + # NOTICE ########################################################## + ################################################################### + + GCOV_TYPE defines the count type used by the instrumentation code. + Kernels compiled with a gcc version prior to 3.1 should use LONG, + otherwise LONG LONG. */ + +#if __GNUC__ >= 3 && __GNUC_MINOR__ >= 1 +typedef long long gcov_type; +#else +typedef long gcov_type; +#endif + + +struct bb +{ + long zero_word; + const char *filename; + gcov_type *counts; + long ncounts; + struct bb *next; + const unsigned long *addresses; + + /* Older GCC's did not emit these fields. */ + long nwords; + const char **functions; + const long *line_nums; + const char **filenames; + char *flags; +}; + +extern struct bb *bb_head; +static struct file_operations proc_gcov_operations; +extern char *gcov_kernelpath; +extern void (*gcov_callback)(int cmd, struct bb *); +extern void do_global_ctors(char *, char *, struct module *, int); + +static int create_bb_links = 1; +static int kernel_path_len; + +int debug = 0; +#define PPRINTK(x) do { if (debug) { printk x ; } } while (0) + +struct gcov_ftree_node +{ + int isdir; /* directory or file */ + char *fname; /* only the name within the hierachy */ + struct gcov_ftree_node *sibling; /* sibling of tree */ + struct gcov_ftree_node *files; /* children of tree */ + struct gcov_ftree_node *parent; /* parent of current gcov_ftree_node */ + struct proc_dir_entry *proc[4]; + struct bb *bb; + /* below only valid for leaf nodes == files */ + unsigned long offset; /* offset in global file */ + struct gcov_ftree_node *next; /* next leave node */ +}; + +static struct proc_dir_entry *proc_vmlinux = NULL; +static struct gcov_ftree_node *leave_nodes = NULL; +static struct gcov_ftree_node *dumpall_cached_node = NULL; +static struct gcov_ftree_node tree_root = + { 1, GCOV_PROF_PROC, NULL, NULL, NULL, + { NULL, NULL, NULL, NULL} , NULL, 0,NULL }; +static char *endings[3] = { ".bb", ".bbg", ".c" }; + + +/* Calculate the header size of an entry in the vmlinux-tracefile which + contains the collection of trace data of all instrumented kernel objects. + + An entry header is defined as: + 0: length of filename of the respective .da file padded to 8 bytes + 8: filename padded to 8 bytes + + */ + +static inline unsigned long +hdr_ofs (struct gcov_ftree_node *tptr) +{ + return 8 + PAD8(strlen (tptr->bb->filename) + 1); +} + + +/* Calculate the total size of an entry in the vmlinux-tracefile. 
+ An entry consists of the header, an 8 byte word for the number + of counts in this entry and the actual array of 8 byte counts. */ + +static inline unsigned long +dump_size(struct gcov_ftree_node *tptr) +{ + return (hdr_ofs(tptr) + (tptr->bb->ncounts+1)*8); +} + + +/* Store a portable representation of VALUE in DEST using BYTES*8-1 bits. + Return a non-zero value if VALUE requires more than BYTES*8-1 bits + to store (this is adapted code from gcc/gcov-io.h). */ + +static int +store_gcov_type (gcov_type value, void *buf, int offset, int len) +{ + const size_t bytes = 8; + char dest[10]; + int upper_bit = (value < 0 ? 128 : 0); + size_t i; + + if (value < 0) { + gcov_type oldvalue = value; + value = -value; + if (oldvalue != -value) + return 1; + } + + for(i = 0 ; + i < (sizeof (value) < bytes ? sizeof (value) : bytes) ; + i++) { + dest[i] = value & (i == (bytes - 1) ? 127 : 255); + value = value / 256; + } + + if (value && value != -1) + return 1; + + for(; i < bytes ; i++) + dest[i] = 0; + dest[bytes - 1] |= upper_bit; + copy_to_user(buf,&dest[offset],len); + return 0; +} + + +/* Create a directory entry in the proc file system and fill in + the respective fields in the provided tree node. Return a + non-zero value on error. */ + +int +create_dir_proc (struct gcov_ftree_node *bt, char *fname) +{ + bt->proc[0] = proc_mkdir(fname, bt->parent->proc[0]); + bt->proc[1] = bt->proc[2] = bt->proc[3] = NULL; + return (bt->proc[0] == NULL); +} + + +/* Replace file ending in with . Return a new + string containing the new filename or NULL on error. */ + +static +char* replace_ending (const char *fname,char *end, char *newend) +{ + char *newfname; + char *cptr = strstr(fname,end); + int len; + if (cptr == NULL) + return NULL; + len = cptr - fname; + newfname = (char*)kmalloc(len+strlen(newend)+1,GFP_KERNEL); + if (newfname == NULL) + return NULL; + memcpy(newfname,fname,len); + strcpy(newfname+len,newend); + return newfname; +} + + +/* Create a file entry in the proc file system and update the respective + fields on the tree node. Optionally try to create links to the + source, .bb and .bbg files. Return a non-zero value on error. */ + +int +create_file_proc (struct gcov_ftree_node *bt, struct bb *bptr, char *fname, + const char *fullname) +{ + bt->proc[0] = create_proc_entry(fname, S_IWUSR | S_IRUGO, + bt->parent->proc[0]); + if (!bt->proc[0]) { + PPRINTK(("error creating file proc <%s>\n", fname)); + return 1; + } + + bt->proc[0]->proc_fops = &proc_gcov_operations; + bt->proc[0]->size = 8 + (8 * bptr->ncounts); + + if (create_bb_links) { + int i; + for (i=0;i<3;i++) { + char *newfname; + char *newfullname; + newfname = replace_ending(fname,".da",endings[i]); + newfullname = replace_ending(fullname,".da",endings[i]); + if ((newfname) && (newfullname)) { + bt->proc[i+1] = proc_symlink(newfname,bt->parent->proc[0],newfullname); + } + if (newfname) kfree(newfname); + if (newfullname) kfree(newfullname); + } + } else { + bt->proc[1] = bt->proc[2] = bt->proc[3] = NULL; + } + return 0; +} + + +/* Recursively check and if necessary create the file specified by + and all its path components, both in the proc file-system as + well as in the internal tree structure. 
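+
+   The name is split at the first '/': that component is looked up or
+   created as a directory node at the current level, and the function
+   then recurses on the remainder until the final component is created
+   as a regular proc file entry.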
*/ + +void +check_proc_fs(const char *fullname, struct gcov_ftree_node *parent, + char *name, struct bb *bbptr) +{ + char dirname[128]; + char *localname = name; + char *tname; + int isdir; + struct gcov_ftree_node *tptr; + + tname = strstr(name, "/"); + if ((isdir = (tname != NULL))) { + memcpy(dirname,name,tname-name); + dirname[tname-name] = '\0'; + localname = dirname; + } + + /* search the list of files in gcov_ftree_node and + * see whether file already exists in this directory level */ + for ( tptr = parent->files ; tptr ; tptr = tptr->sibling) { + if (!strcmp(tptr->fname,localname)) + break; + } + if (!tptr) { + /* no entry yet */ + tptr = (struct gcov_ftree_node*) + kmalloc(sizeof(struct gcov_ftree_node),GFP_KERNEL); + tptr->parent = parent; + + if (!isdir) { + if (create_file_proc(tptr, bbptr, localname,fullname)) { + kfree(tptr); + return; + } + tptr->bb = bbptr; + tptr->proc[0]->data = tptr; + tptr->next = leave_nodes; + leave_nodes = tptr; + } else { + int len = strlen(dirname)+1; + localname = (char*)kmalloc(len,GFP_KERNEL); + strncpy(localname,dirname,len); + if (create_dir_proc(tptr,localname)) { + kfree(tptr); + kfree(localname); + return; + } + tptr->bb = NULL; + tptr->proc[0]->data = NULL; + tptr->next = NULL; + } + tptr->isdir = isdir; + tptr->fname = localname; + tptr->files = NULL; + tptr->sibling = parent->files; + parent->files = tptr; + } + if (isdir) + check_proc_fs(fullname,tptr,tname+1,bbptr); +} + + +/* Read out tracefile data to user space. Return the number of bytes + read. */ + +static ssize_t +read_gcov(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + ssize_t read; + gcov_type ncnt; + struct bb *bbptr; + gcov_type slen; + gcov_type *wptr; + struct gcov_ftree_node *treeptr; + struct proc_dir_entry * de; + int dumpall; + unsigned int hdrofs; + unsigned long poffs; + + DOWN(); + + read = 0; + hdrofs = 0; + poffs = 0; + de = PDE(file->f_dentry->d_inode); + + /* Check whether this is a request to /proc/gcov/vmlinux in + which case we should dump the complete tracefile. */ + dumpall = (de == proc_vmlinux); + + + /* Have treeptr point to the tree node to be dumped. */ + + if (!dumpall) + treeptr = (struct gcov_ftree_node*) (de ? de->data : NULL); + else { + /* dumpall_cached_node will speed up things in case + of a sequential read. */ + if (dumpall_cached_node && (p >= dumpall_cached_node->offset)) { + treeptr = dumpall_cached_node; + } + else + treeptr = leave_nodes; + + /* Search the tree node that covers the requested + tracefile offset. */ + while (treeptr) { + struct gcov_ftree_node *next = treeptr->next; + if ((next == NULL) || (p < next->offset)) { + hdrofs = hdr_ofs(treeptr); + poffs = treeptr->offset; + break; + } + treeptr = next; + } + dumpall_cached_node = treeptr; + } + + bbptr = treeptr ? treeptr->bb : NULL; + + if (bbptr == NULL) + goto out; + + ncnt = (gcov_type) bbptr->ncounts; + p -= poffs; + + do { + if (p < hdrofs) { + /* User wants to read parts of the header. */ + + slen = PAD8(strlen(treeptr->bb->filename)+1); + + if (p >= 8) { + /* Read filename */ + if (slen > (gcov_type) count) slen = count; + copy_to_user (buf, &treeptr->bb->filename[p-8], + slen); + count-=slen;buf+= slen;read+=slen;p+=slen; + continue; + } + wptr = &slen; + } + else if (p < (hdrofs + 8)) { + /* User wants to read the number of counts in this + entry. 
*/ + + wptr = &ncnt; + } + else if (p < (hdrofs) + (unsigned long) (ncnt+1)*8) { + /* User wants to read actual counters */ + + wptr = &bbptr->counts[((p-hdrofs)/8)-1]; + } + else + break; + + /* do we have to write partial word */ + + if ((count < 8) || (p & 0x7)) { + /* partial write */ + unsigned long offset = p & 0x7; + unsigned long length = (count+offset)<8?count:(8-offset); + + store_gcov_type(*wptr,buf, offset, length); + buf+=length;p+=length;count-=length;read+=length; + break; + } else { + store_gcov_type(*wptr,buf, 0, 8); + buf+=8;p+=8;count-=8;read+=8; + } + } while (count > 0); + *ppos = p + poffs; +out: + UP(); + return read; +} + + +/* A write to any of our proc file-system entries is interpreted + as a request to reset the data from that node. */ + +static ssize_t +write_gcov(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + struct bb *ptr; + struct proc_dir_entry * de; + int resetall, i; + struct gcov_ftree_node *tptr; + + DOWN(); + + de = PDE(file->f_dentry->d_inode); + + if (de == NULL) { + count = 0; + goto out; + } + + /* Check for a write to /proc/gcov/vmlinux */ + resetall = (de == proc_vmlinux); + + if (resetall) { + /* Reset all nodes */ + for (ptr = bb_head; ptr != (struct bb *) 0; ptr = ptr->next) + { + int i; + if (ptr->counts == NULL) continue; + for (i = 0; i < ptr->ncounts; i++) + ptr->counts[i]=0; + } + } else { + /* Reset a single node */ + tptr = (struct gcov_ftree_node*)(de->data); + if (tptr == NULL) + goto out; + ptr = tptr->bb; + if (ptr->ncounts != 0) { + for (i = 0; i < ptr->ncounts; i++) + ptr->counts[i]=0; + } + } +out: + UP(); + return count; +} + + +/* This struct identifies the functions to be used for proc file-system + interaction. */ + +static struct file_operations proc_gcov_operations = { + read: read_gcov, + write: write_gcov +}; + + +/* Recursively remove a node and all its children from the internal + data tree and from the proc file-system. */ + +void +cleanup_node(struct gcov_ftree_node *node, int delname, int del_in_parent) +{ + struct gcov_ftree_node *next,*tptr; + struct proc_dir_entry *par_proc; + + PPRINTK(("parent n:%p p:%p f:%p s:%p <%s>\n", node, + node->parent, node->files, node->sibling, node->fname)); + if ((tptr = node->parent)) { + if (del_in_parent) { + /* Remove node from parent's list of children */ + struct gcov_ftree_node *cptr,*prev_cptr; + for ( prev_cptr = NULL, cptr = tptr->files; cptr && (cptr != node); + prev_cptr = cptr, cptr = cptr->sibling); + if (prev_cptr == NULL) + tptr->files = cptr->sibling; + else + prev_cptr->sibling = cptr->sibling; + } + par_proc = (struct proc_dir_entry*)(tptr->proc[0]); + } else + par_proc = &proc_root; + + if (node->isdir) { + /* In case of a directory, clean up all child nodes. */ + next = node->files; + node->files = NULL; + for (tptr = next ; tptr; ) { + next = tptr->sibling; + cleanup_node(tptr,1,0); + tptr = next; + } + remove_proc_entry(node->fname, par_proc); + if (delname) kfree(node->fname); + } else { + /* Remove file entry and optional links. 
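+		   The optional links are the .bb, .bbg and .c symlinks that
+		   create_file_proc() set up next to the .da entry when
+		   create_bb_links is enabled.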
*/ + remove_proc_entry(node->fname, par_proc); + if (create_bb_links) { + int i; + for (i=0;i<3;i++) { + char *newfname; + if (node->proc[i+1] == NULL) continue; + newfname = replace_ending(node->fname,".da",endings[i]); + if (newfname) { + PPRINTK(("remove_proc_entry <%s>\n", node->fname)); + remove_proc_entry(newfname, par_proc); + kfree(newfname); + } + } + } + } + /* free the data */ + if (node != &tree_root) + kfree(node); +} + + +/* Create a tree node for the given bb struct and initiate the + creation of a corresponding proc file-system entry. */ + +static void +create_node_tree(struct bb *bbptr) +{ + const char *tmp; + const char *filename = bbptr->filename; + char *modname; + int len; + + PPRINTK(("kernelpath <%s> <%s>\n", gcov_kernelpath, filename)); + + /* Check whether this is a file located in the kernel source + directory. */ + if (!strncmp (filename, gcov_kernelpath, kernel_path_len)) + { + /* Remove kernel path and create relative proc-file-system + entry. */ + tmp = filename + kernel_path_len+1; + if (*tmp == '0') return; + check_proc_fs(filename, &tree_root, (char*)tmp, bbptr); + } + else { + /* Insert entry to module sub-directory. */ + len = strlen(filename); + modname = (char *)kmalloc (len + 7, GFP_KERNEL); + strcpy(modname, "module"); + strcat (modname, filename); + check_proc_fs(filename, &tree_root, modname, bbptr); + } +} + + +/* This function will be used as gcov_callback, i.e. it is + called from constructor and destructor code of all instrumented + object files. It updates the local tree structure and the proc + file-system entries. */ + +static void +gcov_cleanup(int cmd, struct bb *bbptr) +{ + unsigned long offset = 0; + struct gcov_ftree_node *tptr; + struct gcov_ftree_node *parent; + struct gcov_ftree_node *prev_cptr; + + DOWN(); + switch (cmd) { + case 0: + /* remove leave node */ + prev_cptr = NULL; + for (tptr = leave_nodes; tptr ; prev_cptr = tptr, tptr = tptr->next) { + if (tptr->bb == bbptr) break; + } + if (!tptr) { + PPRINTK(("Can't find module in /proc/gcov\n")); + UP(); + return; + } + if (prev_cptr) + prev_cptr->next = tptr->next; + else + leave_nodes = tptr->next; + dumpall_cached_node = NULL; + + + /* Find highest level node without further siblings */ + + parent = tptr->parent; + do { + if (parent->files->sibling != NULL) break; + tptr = parent; + parent = parent->parent; + } while (parent); + cleanup_node(tptr,0,1); + + /* Update the offsets at which a certain node can + be found in the tracefile. */ + for (tptr = leave_nodes; tptr; tptr = tptr->next) { + tptr->offset = offset; + offset += dump_size(tptr); + } + break; + + case 1: + /* insert node */ + create_node_tree(bbptr); + + /* Update the offsets at which a certain node can + be found in the tracefile. */ + for (tptr = leave_nodes; tptr; tptr = tptr->next) { + tptr->offset = offset; + offset += dump_size(tptr); + } + + break; + } + UP(); +} + + +/* Initialize the data structure by calling the constructor code + of all instrumented object files and creating the proc + file-system entries. */ + +int +init_module(void) +{ + struct bb *bbptr; + unsigned long offset = 0; + struct gcov_ftree_node *tptr; + + PPRINTK(("init module <%s>\n\n", GCOV_PROF_PROC)); + + do_global_ctors(NULL, NULL, NULL, 0); + + tree_root.proc[0] = proc_mkdir(GCOV_PROF_PROC, 0); + kernel_path_len = strlen(gcov_kernelpath); + + for (bbptr = bb_head; bbptr ; bbptr = bbptr->next) { + create_node_tree(bbptr); + } + + /* Fill in the offset at which a certain node can + be found in the tracefile. 
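+	   Each leaf node's offset is the running sum of dump_size() over
+	   the preceding leaves, matching the layout read_gcov() assumes
+	   when dumping everything through /proc/gcov/vmlinux.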
*/ + for (tptr = leave_nodes; tptr; tptr = tptr->next) { + tptr->offset = offset; + offset += dump_size(tptr); + } + + proc_vmlinux = create_proc_entry("vmlinux",S_IWUSR | S_IRUGO, + tree_root.proc[0]); + if (proc_vmlinux) + proc_vmlinux->proc_fops = &proc_gcov_operations; + + gcov_callback = gcov_cleanup; + UP(); + return 0; +} + + +void +cleanup_module(void) +{ + PPRINTK(("remove module <%s>\n\n", GCOV_PROF_PROC)); + gcov_callback = NULL; + DOWN(); + cleanup_node(&tree_root,0,0); +} + +//module_init(gcov_init_module); +//module_exit(gcov_cleanup_module); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/net/loopback.c 901-mjb1.1/drivers/net/loopback.c --- 000-virgin/drivers/net/loopback.c Sun Nov 17 20:29:25 2002 +++ 901-mjb1.1/drivers/net/loopback.c Wed Aug 13 20:46:15 2003 @@ -194,7 +194,7 @@ int __init loopback_init(struct net_devi /* Current netfilter will die with oom linearizing large skbs, * however this will be cured before 2.5.x is done. */ - dev->features |= NETIF_F_TSO; +/* dev->features |= NETIF_F_TSO; */ dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); if (dev->priv == NULL) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/pci/probe.c 901-mjb1.1/drivers/pci/probe.c --- 000-virgin/drivers/pci/probe.c Wed Aug 13 20:24:26 2003 +++ 901-mjb1.1/drivers/pci/probe.c Wed Aug 13 20:47:25 2003 @@ -176,7 +176,7 @@ void __devinit pci_read_bridge_bases(str limit |= (io_limit_hi << 16); } - if (base && base <= limit) { + if (base <= limit) { res->flags = (io_base_lo & PCI_IO_RANGE_TYPE_MASK) | IORESOURCE_IO; res->start = base; res->end = limit + 0xfff; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/scsi/sd.c 901-mjb1.1/drivers/scsi/sd.c --- 000-virgin/drivers/scsi/sd.c Tue Aug 5 20:01:53 2003 +++ 901-mjb1.1/drivers/scsi/sd.c Wed Aug 13 20:48:47 2003 @@ -61,7 +61,9 @@ * Remaining dev_t-handling stuff */ #define SD_MAJORS 16 -#define SD_DISKS (SD_MAJORS << 4) +#define SD_DISKS ((SD_MAJORS - 1) << 4) +#define LAST_MAJOR_DISKS (1 << (KDEV_MINOR_BITS - 4)) +#define TOTAL_SD_DISKS (SD_DISKS + LAST_MAJOR_DISKS) /* * Time out in seconds for disks and Magneto-opticals (which are slower). @@ -87,7 +89,7 @@ struct scsi_disk { unsigned RCD : 1; /* state of disk RCD bit, unused */ }; -static unsigned long sd_index_bits[SD_DISKS / BITS_PER_LONG]; +static unsigned long sd_index_bits[TOTAL_SD_DISKS / BITS_PER_LONG]; static spinlock_t sd_index_lock = SPIN_LOCK_UNLOCKED; static int sd_revalidate_disk(struct gendisk *disk); @@ -122,6 +124,9 @@ static int sd_major(int major_idx) return SCSI_DISK1_MAJOR + major_idx - 1; case 8 ... 15: return SCSI_DISK8_MAJOR + major_idx - 8; +#define MAX_IDX (TOTAL_SD_DISKS >> 4) + case 16 ... 
MAX_IDX: + return SCSI_DISK15_MAJOR; default: BUG(); return 0; /* shut up gcc */ @@ -1258,8 +1263,8 @@ static int sd_probe(struct device *dev) goto out_free; spin_lock(&sd_index_lock); - index = find_first_zero_bit(sd_index_bits, SD_DISKS); - if (index == SD_DISKS) { + index = find_first_zero_bit(sd_index_bits, TOTAL_SD_DISKS); + if (index == TOTAL_SD_DISKS) { spin_unlock(&sd_index_lock); error = -EBUSY; goto out_put; @@ -1274,15 +1279,25 @@ static int sd_probe(struct device *dev) sdkp->openers = 0; gd->major = sd_major(index >> 4); - gd->first_minor = (index & 15) << 4; +#define DISKS_PER_MINOR_MASK ((1 << (KDEV_MINOR_BITS - 4)) - 1) + if (index > SD_DISKS) + gd->first_minor = ((index - SD_DISKS) & DISKS_PER_MINOR_MASK) << 4; + else + gd->first_minor = (index & 15) << 4; gd->minors = 16; gd->fops = &sd_fops; - if (index >= 26) { + if (index < 26) { + sprintf(gd->disk_name, "sd%c", 'a' + index % 26); + } else if (index < (26*27)) { sprintf(gd->disk_name, "sd%c%c", 'a' + index/26-1,'a' + index % 26); } else { - sprintf(gd->disk_name, "sd%c", 'a' + index % 26); + const unsigned int m1 = (index/ 26 - 1) / 26 - 1; + const unsigned int m2 = (index / 26 - 1) % 26; + const unsigned int m3 = index % 26; + sprintf(gd->disk_name, "sd%c%c%c", + 'a' + m1, 'a' + m2, 'a' + m3); } strcpy(gd->devfs_name, sdp->devfs_name); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/serial/8250.c 901-mjb1.1/drivers/serial/8250.c --- 000-virgin/drivers/serial/8250.c Wed Jul 2 21:59:11 2003 +++ 901-mjb1.1/drivers/serial/8250.c Wed Aug 13 20:29:29 2003 @@ -2117,9 +2117,116 @@ void serial8250_resume_port(int line, u3 uart_resume_port(&serial8250_reg, &serial8250_ports[line].port, level); } -static int __init serial8250_init(void) +#ifdef CONFIG_X86_REMOTE_DEBUG +/* + * Takes: + * ttyS - integer specifying which serial port to use for debugging + * baud - baud rate of specified serial port + * Returns: + * port for use by the gdb serial driver + */ +int gdb_serial_setup(int ttyS, int baud, int *port, int *irq) +{ + struct uart_8250_port *up; + unsigned cval; + int bits = 8; + int parity = 'n'; + int cflag = CREAD | HUPCL | CLOCAL; + int quot = 0; + + /* + * Now construct a cflag setting. + */ + switch(baud) { + case 1200: + cflag |= B1200; + break; + case 2400: + cflag |= B2400; + break; + case 4800: + cflag |= B4800; + break; + case 19200: + cflag |= B19200; + break; + case 38400: + cflag |= B38400; + break; + case 57600: + cflag |= B57600; + break; + case 115200: + cflag |= B115200; + break; + case 9600: + default: + cflag |= B9600; + break; + } + switch(bits) { + case 7: + cflag |= CS7; + break; + default: + case 8: + cflag |= CS8; + break; + } + switch(parity) { + case 'o': case 'O': + cflag |= PARODD; + break; + case 'e': case 'E': + cflag |= PARENB; + break; + } + + /* + * Divisor, bytesize and parity + */ + + up = &serial8250_ports[ttyS]; +// ser->flags &= ~ASYNC_BOOT_AUTOCONF; + quot = ( 1843200 / 16 ) / baud; + cval = cflag & (CSIZE | CSTOPB); + cval >>= 4; + if (cflag & PARENB) + cval |= UART_LCR_PARITY; + if (!(cflag & PARODD)) + cval |= UART_LCR_EPAR; + + /* + * Disable UART interrupts, set DTR and RTS high + * and set speed. 
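+	 *
+	 * (Note: cval is then overwritten with 0x3, i.e. 8N1, so the
+	 * parity and size bits computed above are effectively discarded,
+	 * and the IER write below actually enables the receive-data
+	 * interrupt (UART_IER_RDI) for the gdb stub.)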
+ */ + cval = 0x3; + serial_outp(up, UART_LCR, cval | UART_LCR_DLAB); /* set DLAB */ + serial_outp(up, UART_DLL, quot & 0xff); /* LS of divisor */ + serial_outp(up, UART_DLM, quot >> 8); /* MS of divisor */ + serial_outp(up, UART_LCR, cval); /* reset DLAB */ + serial_outp(up, UART_IER, UART_IER_RDI); /* turn on interrupts*/ + serial_outp(up, UART_MCR, UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS); + + /* + * If we read 0xff from the LSR, there is no UART here. + */ + if (serial_inp(up, UART_LSR) == 0xff) + return 1; + *port = up->port.iobase; + *irq = up->port.irq; +// serial8250_shutdown(&up->port); + return 0; +} +#endif + +int serial8250_init(void) { int ret, i; + static int didit = 0; + + if (didit++) + return 0; printk(KERN_INFO "Serial: 8250/16550 driver $Revision: 1.90 $ " "IRQ sharing %sabled\n", share_irqs ? "en" : "dis"); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/drivers/serial/core.c 901-mjb1.1/drivers/serial/core.c --- 000-virgin/drivers/serial/core.c Tue Aug 5 20:01:53 2003 +++ 901-mjb1.1/drivers/serial/core.c Wed Aug 13 20:29:29 2003 @@ -33,6 +33,10 @@ #include #include /* for serial_state and serial_icounter_struct */ +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + #include #include @@ -1130,6 +1134,16 @@ uart_ioctl(struct tty_struct *tty, struc * protected against the tty being hung up. */ switch (cmd) { +#ifdef CONFIG_X86_REMOTE_DEBUG + case TIOCGDB: + ret = -ENOTTY; + if (capable(CAP_SYS_ADMIN)) { + gdb_ttyS = MINOR(tty->device) & 0x03F; + gdb_baud = tty_get_baud_rate(tty); + ret = gdb_hook(); + } + break; +#endif case TIOCSERGETLSR: /* Get line status register */ ret = uart_get_lsr_info(state, (unsigned int *)arg); break; @@ -1146,6 +1160,30 @@ uart_ioctl(struct tty_struct *tty, struc out: return ret; } + + /* + * ------------------------------------------------------------ + * Serial GDB driver (most in gdbserial.c) + * ------------------------------------------------------------ + */ + +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_GDB_CONSOLE +static struct console gdbcons = { + name: "gdb", + write: gdb_console_write, + flags: CON_PRINTBUFFER | CON_ENABLED, + index: -1, +}; +#endif + +#ifdef CONFIG_GDB_CONSOLE +void __init gdb_console_init(void) +{ + register_console(&gdbcons); +} +#endif +#endif /* CONFIG_X86_REMOTE_DEBUG */ static void uart_set_termios(struct tty_struct *tty, struct termios *old_termios) { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/aio.c 901-mjb1.1/fs/aio.c --- 000-virgin/fs/aio.c Tue Aug 5 20:01:42 2003 +++ 901-mjb1.1/fs/aio.c Wed Aug 13 20:48:56 2003 @@ -204,6 +204,7 @@ static struct kioctx *ioctx_alloc(unsign { struct mm_struct *mm; struct kioctx *ctx; + int ret = 0; /* Prevent overflows */ if ((nr_events > (0x10000000U / sizeof(struct io_event))) || @@ -233,7 +234,8 @@ static struct kioctx *ioctx_alloc(unsign INIT_LIST_HEAD(&ctx->run_list); INIT_WORK(&ctx->wq, aio_kick_handler, ctx); - if (aio_setup_ring(ctx) < 0) + ret = aio_setup_ring(ctx); + if (unlikely(ret < 0)) goto out_freectx; /* limit the number of system wide aios */ @@ -259,7 +261,7 @@ out_cleanup: out_freectx: kmem_cache_free(kioctx_cachep, ctx); - ctx = ERR_PTR(-ENOMEM); + ctx = ERR_PTR(ret); dprintk("aio: error allocating ioctx %p\n", ctx); return ctx; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/binfmt_aout.c 901-mjb1.1/fs/binfmt_aout.c --- 000-virgin/fs/binfmt_aout.c Tue Aug 5 20:01:42 2003 +++ 901-mjb1.1/fs/binfmt_aout.c Wed Aug 13 20:51:52 2003 @@ -310,7 +310,7 @@ static int load_aout_binary(struct linux (current->mm->start_brk = 
N_BSSADDR(ex)); current->mm->free_area_cache = TASK_UNMAPPED_BASE; - current->mm->rss = 0; + zero_rss(current->mm); current->mm->mmap = NULL; compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/binfmt_elf.c 901-mjb1.1/fs/binfmt_elf.c --- 000-virgin/fs/binfmt_elf.c Tue Aug 5 20:01:54 2003 +++ 901-mjb1.1/fs/binfmt_elf.c Wed Aug 13 20:51:52 2003 @@ -634,7 +634,7 @@ static int load_elf_binary(struct linux_ /* Do this so that we can load the interpreter, if need be. We will change some of these later */ - current->mm->rss = 0; + zero_rss(current->mm); current->mm->free_area_cache = TASK_UNMAPPED_BASE; retval = setup_arg_pages(bprm); if (retval < 0) { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/binfmt_flat.c 901-mjb1.1/fs/binfmt_flat.c --- 000-virgin/fs/binfmt_flat.c Wed Aug 13 20:24:28 2003 +++ 901-mjb1.1/fs/binfmt_flat.c Wed Aug 13 20:51:52 2003 @@ -643,7 +643,7 @@ static int load_flat_file(struct linux_b current->mm->start_brk = datapos + data_len + bss_len; current->mm->brk = (current->mm->start_brk + 3) & ~3; current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len; - current->mm->rss = 0; + zero_rss(current->mm); } if (flags & FLAT_FLAG_KTRACE) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/binfmt_som.c 901-mjb1.1/fs/binfmt_som.c --- 000-virgin/fs/binfmt_som.c Thu Feb 13 11:08:11 2003 +++ 901-mjb1.1/fs/binfmt_som.c Wed Aug 13 20:51:52 2003 @@ -259,7 +259,7 @@ load_som_binary(struct linux_binprm * bp create_som_tables(bprm); current->mm->start_stack = bprm->p; - current->mm->rss = 0; + zero_rss(current->mm); #if 0 printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/exec.c 901-mjb1.1/fs/exec.c --- 000-virgin/fs/exec.c Wed Aug 13 20:24:28 2003 +++ 901-mjb1.1/fs/exec.c Wed Aug 13 20:51:52 2003 @@ -317,10 +317,11 @@ void put_dirty_page(struct task_struct * } lru_cache_add_active(page); flush_dcache_page(page); + SetPageAnon(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); - tsk->mm->rss++; + inc_rss(tsk->mm, page); spin_unlock(&tsk->mm->page_table_lock); /* no need for flush_tlb */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/inode.c 901-mjb1.1/fs/inode.c --- 000-virgin/fs/inode.c Wed Aug 13 20:24:28 2003 +++ 901-mjb1.1/fs/inode.c Wed Aug 13 20:51:50 2003 @@ -145,6 +145,9 @@ static struct inode *alloc_inode(struct mapping->dirtied_when = 0; mapping->assoc_mapping = NULL; mapping->backing_dev_info = &default_backing_dev_info; +#ifdef CONFIG_NUMA + mapping->binding = NULL; +#endif if (sb->s_bdev) mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; memset(&inode->u, 0, sizeof(inode->u)); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/proc/array.c 901-mjb1.1/fs/proc/array.c --- 000-virgin/fs/proc/array.c Sat May 10 18:35:00 2003 +++ 901-mjb1.1/fs/proc/array.c Wed Aug 13 20:47:27 2003 @@ -336,7 +336,7 @@ int proc_pid_stat(struct task_struct *ta read_unlock(&tasklist_lock); res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu %lu %lu %lu\n", task->pid, task->comm, state, @@ -382,7 +382,10 @@ int proc_pid_stat(struct task_struct *ta task->exit_signal, task_cpu(task), task->rt_priority, - task->policy); + task->policy, + 
jiffies_to_clock_t(task->sched_info.inter_arrival_time), + jiffies_to_clock_t(task->sched_info.service_time), + jiffies_to_clock_t(task->sched_info.response_time)); if(mm) mmput(mm); return res; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/proc/base.c 901-mjb1.1/fs/proc/base.c --- 000-virgin/fs/proc/base.c Tue Aug 5 20:01:42 2003 +++ 901-mjb1.1/fs/proc/base.c Wed Aug 13 20:51:45 2003 @@ -1390,62 +1390,38 @@ out: } #define PROC_NUMBUF 10 -#define PROC_MAXPIDS 20 -/* - * Get a few pid's to return for filldir - we need to hold the - * tasklist lock while doing this, and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. - */ -static int get_pid_list(int index, unsigned int *pids) -{ - struct task_struct *p; - int nr_pids = 0; - - index--; - read_lock(&tasklist_lock); - for_each_process(p) { - int pid = p->pid; - if (!pid_alive(p)) - continue; - if (--index >= 0) - continue; - pids[nr_pids] = pid; - nr_pids++; - if (nr_pids >= PROC_MAXPIDS) - break; - } - read_unlock(&tasklist_lock); - return nr_pids; -} int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int pid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - unsigned int nr_pids, i; + int pid; if (!nr) { ino_t ino = fake_ino(0,PROC_PID_INO); if (filldir(dirent, "self", 4, filp->f_pos, ino, DT_LNK) < 0) return 0; filp->f_pos++; - nr++; + nr = 1; } + pid = nr - 1; + for (;;) { + unsigned long i, j; + ino_t ino; - nr_pids = get_pid_list(nr, pid_array); - - for (i = 0; i < nr_pids; i++) { - int pid = pid_array[i]; - ino_t ino = fake_ino(pid,PROC_PID_INO); - unsigned long j = PROC_NUMBUF; + pid = find_next_pid(pid); + if (pid < 0) + break; - do buf[--j] = '0' + (pid % 10); while (pid/=10); + i = pid; + j = PROC_NUMBUF; + do buf[--j] = '0' + (i % 10); while (i/=10); + ino = fake_ino(pid,PROC_PID_INO); if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) break; - filp->f_pos++; + filp->f_pos = pid + 1 + FIRST_PROCESS_ENTRY; } return 0; } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/proc/proc_misc.c 901-mjb1.1/fs/proc/proc_misc.c --- 000-virgin/fs/proc/proc_misc.c Tue Aug 5 20:01:54 2003 +++ 901-mjb1.1/fs/proc/proc_misc.c Wed Aug 13 20:51:40 2003 @@ -134,6 +134,41 @@ static struct vmalloc_info get_vmalloc_i return vmi; } +static int real_loadavg_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int a, b, c, cpu; + int len; + + a = tasks_running[0] + (FIXED_1/200); + b = tasks_running[1] + (FIXED_1/200); + c = tasks_running[2] + (FIXED_1/200); + len = sprintf(page,"Domain load1 load2 load3 nr_run/nr_thrd\n"); + len += sprintf(page+len,"SYSTEM %5d.%02d %5d.%02d %5d.%02d %7ld/%7d\n", + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + nr_running(), nr_threads); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + unsigned long nr_running; + if (!cpu_online(cpu)) + continue; + preempt_disable(); + a = per_cpu(cpu_tasks_running,cpu)[0] + (FIXED_1/200); + b = per_cpu(cpu_tasks_running,cpu)[1] + (FIXED_1/200); + c = per_cpu(cpu_tasks_running,cpu)[2] + (FIXED_1/200); + nr_running = nr_running_cpu(cpu); + preempt_enable(); + len += sprintf(page+len, "%5d %5d.%02d %5d.%02d %5d.%02d %7ld/%7d\n", + cpu, + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + nr_running, nr_threads); + } + return proc_calc_metrics(page, start, off, count, eof, len); +} + static int uptime_read_proc(char *page, char 
**start, off_t off, int count, int *eof, void *data) { @@ -342,6 +377,71 @@ static struct file_operations proc_modul }; #endif +#ifdef CONFIG_NUMA +#define K(x) ((x) << (PAGE_SHIFT - 10)) +static int show_meminfo_numa (struct seq_file *m, void *v) +{ + int *d = v; + int nid = *d; + struct sysinfo i; + si_meminfo_node(&i, nid); + seq_printf(m, "\n" + "Node %d MemTotal: %8lu kB\n" + "Node %d MemFree: %8lu kB\n" + "Node %d MemUsed: %8lu kB\n" + "Node %d HighTotal: %8lu kB\n" + "Node %d HighFree: %8lu kB\n" + "Node %d LowTotal: %8lu kB\n" + "Node %d LowFree: %8lu kB\n", + nid, K(i.totalram), + nid, K(i.freeram), + nid, K(i.totalram-i.freeram), + nid, K(i.totalhigh), + nid, K(i.freehigh), + nid, K(i.totalram-i.totalhigh), + nid, K(i.freeram-i.freehigh)); + + return 0; +} +#undef K + +extern struct seq_operations meminfo_numa_op; +static int meminfo_numa_open(struct inode *inode, struct file *file) +{ + return seq_open(file,&meminfo_numa_op); +} + +static struct file_operations proc_meminfo_numa_operations = { + open: meminfo_numa_open, + read: seq_read, + llseek: seq_lseek, + release: seq_release, +}; + +static void *meminfo_numa_start(struct seq_file *m, loff_t *pos) +{ + return *pos < numnodes ? pos : NULL; +} + +static void *meminfo_numa_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return meminfo_numa_start(m, pos); +} + +static void meminfo_numa_stop(struct seq_file *m, void *v) +{ +} + +struct seq_operations meminfo_numa_op = { + .start = meminfo_numa_start, + .next = meminfo_numa_next, + .stop = meminfo_numa_stop, + .show = show_meminfo_numa, +}; + +#endif + extern struct seq_operations slabinfo_op; extern ssize_t slabinfo_write(struct file *, const char *, size_t, loff_t *); static int slabinfo_open(struct inode *inode, struct file *file) @@ -399,14 +499,20 @@ static int kstat_read_proc(char *page, c jiffies_to_clock_t(idle), jiffies_to_clock_t(iowait)); for (i = 0 ; i < NR_CPUS; i++){ - if (!cpu_online(i)) continue; - len += sprintf(page + len, "cpu%d %u %u %u %u %u\n", + struct sched_info info; + if (!cpu_online(i)) + continue; + cpu_sched_info(&info, i); + len += sprintf(page + len, "cpu%d %u %u %u %u %u %u %u %u\n", i, jiffies_to_clock_t(kstat_cpu(i).cpustat.user), jiffies_to_clock_t(kstat_cpu(i).cpustat.nice), jiffies_to_clock_t(kstat_cpu(i).cpustat.system), jiffies_to_clock_t(kstat_cpu(i).cpustat.idle), - jiffies_to_clock_t(kstat_cpu(i).cpustat.iowait)); + jiffies_to_clock_t(kstat_cpu(i).cpustat.iowait), + (uint) jiffies_to_clock_t(info.inter_arrival_time), + (uint) jiffies_to_clock_t(info.service_time), + (uint) jiffies_to_clock_t(info.response_time)); } len += sprintf(page + len, "intr %u", sum); @@ -603,6 +709,36 @@ static void create_seq_entry(char *name, entry->proc_fops = f; } +#ifdef CONFIG_LOCKMETER +extern ssize_t get_lockmeter_info(char *, size_t, loff_t *); +extern ssize_t put_lockmeter_info(const char *, size_t); +extern int get_lockmeter_info_size(void); + +/* + * This function accesses lock metering information. 
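+ * Reads of /proc/lockmeter return the collected statistics via
+ * get_lockmeter_info(); writing anything to the file resets the
+ * counters (see write_lockmeter() below).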
+ */ +static ssize_t read_lockmeter(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + return get_lockmeter_info(buf, count, ppos); +} + +/* + * Writing to /proc/lockmeter resets the counters + */ +static ssize_t write_lockmeter(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + return put_lockmeter_info(buf, count); +} + +static struct file_operations proc_lockmeter_operations = { + NULL, /* lseek */ + read: read_lockmeter, + write: write_lockmeter, +}; +#endif /* CONFIG_LOCKMETER */ + void __init proc_misc_init(void) { struct proc_dir_entry *entry; @@ -611,6 +747,7 @@ void __init proc_misc_init(void) int (*read_proc)(char*,char**,off_t,int,int*,void*); } *p, simple_ones[] = { {"loadavg", loadavg_read_proc}, + {"real_loadavg",real_loadavg_read_proc}, {"uptime", uptime_read_proc}, {"meminfo", meminfo_read_proc}, {"version", version_read_proc}, @@ -650,6 +787,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_MODULES create_seq_entry("modules", 0, &proc_modules_operations); #endif +#ifdef CONFIG_NUMA + create_seq_entry("meminfo.numa",0,&proc_meminfo_numa_operations); +#endif proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { proc_root_kcore->proc_fops = &proc_kcore_operations; @@ -667,6 +807,13 @@ void __init proc_misc_init(void) entry = create_proc_entry("sysrq-trigger", S_IWUSR, NULL); if (entry) entry->proc_fops = &proc_sysrq_trigger_operations; +#endif +#ifdef CONFIG_LOCKMETER + entry = create_proc_entry("lockmeter", S_IWUSR | S_IRUGO, NULL); + if (entry) { + entry->proc_fops = &proc_lockmeter_operations; + entry->size = get_lockmeter_info_size(); + } #endif #ifdef CONFIG_PPC32 { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/fs/proc/task_mmu.c 901-mjb1.1/fs/proc/task_mmu.c --- 000-virgin/fs/proc/task_mmu.c Tue Jun 24 21:29:23 2003 +++ 901-mjb1.1/fs/proc/task_mmu.c Wed Aug 13 20:51:52 2003 @@ -3,6 +3,22 @@ #include #include +#ifdef CONFIG_NUMA +char *task_mem_pernode(struct mm_struct *mm, char *buffer) +{ + int nid; + + for (nid = 0; nid < MAX_NUMNODES; nid++){ + buffer += sprintf(buffer, "VmRSS-node_%d:\t%8lu kb\n", + nid, mm->pernode_rss[nid] << (PAGE_SHIFT-10)); + } + + return buffer; +} +#else /* !CONFIG_NUMA */ +#define task_mem_pernode(mm, buffer) (buffer) +#endif /* CONFIG_NUMA */ + char *task_mem(struct mm_struct *mm, char *buffer) { unsigned long data = 0, stack = 0, exec = 0, lib = 0; @@ -39,6 +55,7 @@ char *task_mem(struct mm_struct *mm, cha mm->rss << (PAGE_SHIFT-10), data - stack, stack, exec - lib, lib); + buffer = task_mem_pernode(mm, buffer); up_read(&mm->mmap_sem); return buffer; } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-alpha/lockmeter.h 901-mjb1.1/include/asm-alpha/lockmeter.h --- 000-virgin/include/asm-alpha/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/include/asm-alpha/lockmeter.h Wed Aug 13 20:29:36 2003 @@ -0,0 +1,90 @@ +/* + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Peter Rival (frival@zk3.dec.com) + */ + +#ifndef _ALPHA_LOCKMETER_H +#define _ALPHA_LOCKMETER_H + +#include +#define CPU_CYCLE_FREQUENCY hwrpb->cycle_freq + +#define get_cycles64() get_cycles() + +#define THIS_CPU_NUMBER smp_processor_id() + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0) +#define local_irq_save(x) \ + __save_and_cli(x) +#define local_irq_restore(x) \ + __restore_flags(x) +#endif /* Linux version 2.2.x */ + +#define SPINLOCK_MAGIC_INIT /**/ + +/* + * Macros to cache and retrieve an index value 
inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * We also assume that the hash table has less than 32767 entries. + * the high order bit is used for write locking a rw_lock + * Note: although these defines and macros are the same as what is being used + * in include/asm-i386/lockmeter.h, they are present here to easily + * allow an alternate Alpha implementation. + */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Alpha is little endian */ + unsigned short lock; + unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) (((inst_rwlock_t *)rwlock_ptr)->lock & 1) +#define IABS(x) ((x) > 0 ? (x) : -(x)) + +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) ((inst_rwlock_t *)rwlock_ptr)->lock; + /* readers subtract 2, so we have to: */ + /* - andnot off a possible writer (bit 0) */ + /* - get the absolute value */ + /* - divide by 2 (right shift by one) */ + /* to find the number of readers */ + if (tmp == 0) return(0); + else return(IABS(tmp & ~1)>>1); +} + +#endif /* _ALPHA_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-alpha/pgtable.h 901-mjb1.1/include/asm-alpha/pgtable.h --- 000-virgin/include/asm-alpha/pgtable.h Tue Apr 8 14:38:20 2003 +++ 901-mjb1.1/include/asm-alpha/pgtable.h Wed Aug 13 20:51:03 2003 @@ -39,6 +39,7 @@ #define PTRS_PER_PMD (1UL << (PAGE_SHIFT-3)) #define PTRS_PER_PGD (1UL << (PAGE_SHIFT-3)) #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) +#define USER_PTRS_PER_PMD(x) (PTRS_PER_PMD) #define FIRST_USER_PGD_NR 0 /* Number of pointers that fit on a page: this will go away. */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-alpha/spinlock.h 901-mjb1.1/include/asm-alpha/spinlock.h --- 000-virgin/include/asm-alpha/spinlock.h Fri May 30 19:02:20 2003 +++ 901-mjb1.1/include/asm-alpha/spinlock.h Wed Aug 13 20:29:36 2003 @@ -6,6 +6,10 @@ #include #include +#ifdef CONFIG_LOCKMETER +#undef DEBUG_SPINLOCK +#undef DEBUG_RWLOCK +#endif /* * Simple spin lock operations. 
There are two variants, one clears IRQ's @@ -95,9 +99,18 @@ static inline int _raw_spin_trylock(spin typedef struct { volatile int write_lock:1, read_counter:31; +#ifdef CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* need this storage for CPU and lock INDEX ............. */ + unsigned magic; +#endif } /*__attribute__((aligned(32)))*/ rwlock_t; +#ifdef CONFIG_LOCKMETER +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, 0 } +#else #define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 } +#endif #define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0) #define rwlock_is_locked(x) (*(volatile int *)(x) != 0) @@ -168,5 +181,42 @@ static inline void _raw_read_unlock(rwlo : "=m" (*lock), "=&r" (regx) : "m" (*lock) : "memory"); } + +#ifdef CONFIG_LOCKMETER +static inline int _raw_write_trylock(rwlock_t *lock) +{ + long temp,result; + + __asm__ __volatile__( + " ldl_l %1,%0\n" + " mov $31,%2\n" + " bne %1,1f\n" + " or $31,1,%2\n" + " stl_c %2,%0\n" + "1: mb\n" + : "=m" (*(volatile int *)lock), "=&r" (temp), "=&r" (result) + : "m" (*(volatile int *)lock) + ); + + return (result); +} + +static inline int _raw_read_trylock(rwlock_t *lock) +{ + unsigned long temp,result; + + __asm__ __volatile__( + " ldl_l %1,%0\n" + " mov $31,%2\n" + " blbs %1,1f\n" + " subl %1,2,%2\n" + " stl_c %2,%0\n" + "1: mb\n" + : "=m" (*(volatile int *)lock), "=&r" (temp), "=&r" (result) + : "m" (*(volatile int *)lock) + ); + return (result); +} +#endif /* CONFIG_LOCKMETER */ #endif /* _ALPHA_SPINLOCK_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-arm/pgtable.h 901-mjb1.1/include/asm-arm/pgtable.h --- 000-virgin/include/asm-arm/pgtable.h Mon Mar 17 21:43:48 2003 +++ 901-mjb1.1/include/asm-arm/pgtable.h Wed Aug 13 20:51:03 2003 @@ -45,6 +45,7 @@ extern void __pgd_error(const char *file #define FIRST_USER_PGD_NR 1 #define USER_PTRS_PER_PGD ((TASK_SIZE/PGDIR_SIZE) - FIRST_USER_PGD_NR) +#define USER_PTRS_PER_PMD(x) (PTRS_PER_PMD) /* * The table below defines the page protection levels that we insert into our diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-cris/pgtable.h 901-mjb1.1/include/asm-cris/pgtable.h --- 000-virgin/include/asm-cris/pgtable.h Tue Aug 5 19:59:16 2003 +++ 901-mjb1.1/include/asm-cris/pgtable.h Wed Aug 13 20:51:03 2003 @@ -69,6 +69,7 @@ extern void paging_init(void); */ #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) +#define USER_PTRS_PER_PMD(x) (PTRS_PER_PMD) #define FIRST_USER_PGD_NR 0 /* zero page used for uninitialized stuff */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-generic/tlb.h 901-mjb1.1/include/asm-generic/tlb.h --- 000-virgin/include/asm-generic/tlb.h Fri May 30 19:02:20 2003 +++ 901-mjb1.1/include/asm-generic/tlb.h Wed Aug 13 20:51:52 2003 @@ -39,7 +39,6 @@ struct mmu_gather { unsigned int nr; /* set to ~0U means fast mode */ unsigned int need_flush;/* Really unmapped some ptes? */ unsigned int fullmm; /* non-zero means full mm flush */ - unsigned long freed; struct page * pages[FREE_PTE_NR]; }; @@ -60,7 +59,6 @@ tlb_gather_mmu(struct mm_struct *mm, uns tlb->nr = num_online_cpus() > 1 ? 
0U : ~0U; tlb->fullmm = full_mm_flush; - tlb->freed = 0; return tlb; } @@ -85,13 +83,6 @@ tlb_flush_mmu(struct mmu_gather *tlb, un static inline void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { - int freed = tlb->freed; - struct mm_struct *mm = tlb->mm; - int rss = mm->rss; - - if (rss < freed) - freed = rss; - mm->rss = rss - freed; tlb_flush_mmu(tlb, start, end); /* keep the page table cache within bounds */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/bug.h 901-mjb1.1/include/asm-i386/bug.h --- 000-virgin/include/asm-i386/bug.h Sat Jun 14 18:37:35 2003 +++ 901-mjb1.1/include/asm-i386/bug.h Wed Aug 13 20:29:29 2003 @@ -9,6 +9,11 @@ * undefined" opcode for parsing in the trap handler. */ +#ifdef CONFIG_X86_REMOTE_DEBUG +#define BUG() do { \ + asm ("int $0x3"); \ +} while (0) +#else #if 1 /* Set to zero for a slightly smaller kernel */ #define BUG() \ __asm__ __volatile__( "ud2\n" \ @@ -17,6 +22,7 @@ : : "i" (__LINE__), "i" (__FILE__)) #else #define BUG() __asm__ __volatile__("ud2\n") +#endif #endif #define BUG_ON(condition) do { if (unlikely((condition)!=0)) BUG(); } while(0) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/cpu.h 901-mjb1.1/include/asm-i386/cpu.h --- 000-virgin/include/asm-i386/cpu.h Sat Jun 14 18:37:35 2003 +++ 901-mjb1.1/include/asm-i386/cpu.h Wed Aug 13 20:48:49 2003 @@ -23,4 +23,6 @@ static inline int arch_register_cpu(int return register_cpu(&cpu_devices[num].cpu, num, parent); } +extern void setup_cpu_idt(void); +extern void setup_node_idts(void); #endif /* _ASM_I386_CPU_H_ */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/desc.h 901-mjb1.1/include/asm-i386/desc.h --- 000-virgin/include/asm-i386/desc.h Tue Feb 25 23:03:50 2003 +++ 901-mjb1.1/include/asm-i386/desc.h Wed Aug 13 20:48:49 2003 @@ -2,6 +2,7 @@ #define __ARCH_DESC_H #include +#include #include #ifndef __ASSEMBLY__ @@ -12,14 +13,15 @@ #include extern struct desc_struct cpu_gdt_table[NR_CPUS][GDT_ENTRIES]; +extern struct desc_struct node_idt_table[MAX_NUMNODES][IDT_ENTRIES]; -struct Xgt_desc_struct { +struct Xdt_desc_struct { unsigned short size; unsigned long address __attribute__((packed)); unsigned short pad; } __attribute__ ((packed)); -extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS]; +extern struct Xdt_desc_struct node_idt_descr[MAX_NUMNODES], cpu_gdt_descr[NR_CPUS]; #define load_TR_desc() __asm__ __volatile__("ltr %%ax"::"a" (GDT_ENTRY_TSS*8)) #define load_LDT_desc() __asm__ __volatile__("lldt %%ax"::"a" (GDT_ENTRY_LDT*8)) @@ -29,7 +31,8 @@ extern struct Xgt_desc_struct idt_descr, * something other than this. 
*/ extern struct desc_struct default_ldt[]; -extern void set_intr_gate(unsigned int irq, void * addr); +extern void node_set_intr_gate(unsigned int node, unsigned int vector, void * addr); +extern void set_intr_gate(unsigned int n, void *addr); #define _set_tssldt_desc(n,addr,limit,type) \ __asm__ __volatile__ ("movw %w3,0(%2)\n\t" \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/early_printk.h 901-mjb1.1/include/asm-i386/early_printk.h --- 000-virgin/include/asm-i386/early_printk.h Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/include/asm-i386/early_printk.h Wed Aug 13 21:05:43 2003 @@ -0,0 +1,8 @@ +#ifndef __X86_EARLY_PRINTK_H_I386_ +#define __X86_EARLY_PRINTK_H_I386_ + +#define VGABASE 0xB8000 +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/ioctls.h 901-mjb1.1/include/asm-i386/ioctls.h --- 000-virgin/include/asm-i386/ioctls.h Tue Apr 8 14:38:20 2003 +++ 901-mjb1.1/include/asm-i386/ioctls.h Wed Aug 13 20:29:29 2003 @@ -68,6 +68,7 @@ #define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ #define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ #define FIOQSIZE 0x5460 +#define TIOCGDB 0x547F /* enable GDB stub mode on this tty */ /* Used for packet mode */ #define TIOCPKT_DATA 0 diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/lockmeter.h 901-mjb1.1/include/asm-i386/lockmeter.h --- 000-virgin/include/asm-i386/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/include/asm-i386/lockmeter.h Wed Aug 13 20:29:36 2003 @@ -0,0 +1,127 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks. + * Added support for hold time statistics for read and write + * locks. + * Moved machine dependent code here from include/lockmeter.h. + * + */ + +#ifndef _I386_LOCKMETER_H +#define _I386_LOCKMETER_H + +#include +#include + +#include + +#ifdef __KERNEL__ +extern unsigned long cpu_khz; +#define CPU_CYCLE_FREQUENCY (cpu_khz * 1000) +#else +#define CPU_CYCLE_FREQUENCY 450000000 +#endif + +#define THIS_CPU_NUMBER smp_processor_id() + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0) +#define local_irq_save(x) \ + __asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory") + +#define local_irq_restore(x) \ + __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory") +#endif /* Linux version 2.2.x */ + +/* + * macros to cache and retrieve an index value inside of a spin lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. Not normally a problem!! + * we also assume that the hash table has less than 65535 entries. 
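+ *
+ * (The lock word is simply overlaid with two 16-bit halves: the low
+ * half is the lock value itself and the high half caches the hash
+ * table index -- see inst_spinlock_t below.)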
+ */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Intel is little endian */ + unsigned short lock; + unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + /* read and write lock attempts may cause the lock value to temporarily */ + /* be negative. Until it is >= 0 we know nothing (i. e. can't tell if */ + /* is -1 because it was write locked and somebody tried to read lock it */ + /* or if it is -1 because it was read locked and somebody tried to write*/ + /* lock it. ........................................................... */ + do { + tmp = (int) rwlock_ptr->lock; + } while (tmp < 0); + if (tmp == 0) return(0); + else return(RW_LOCK_BIAS-tmp); +} + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock <= 0) +#define IABS(x) ((x) > 0 ? (x) : -(x)) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((IABS((rwlock_ptr)->lock) % RW_LOCK_BIAS) != 0) + +/* this is a lot of typing just to get gcc to emit "rdtsc" */ +static inline long long get_cycles64 (void) +{ +#ifndef CONFIG_X86_TSC + #error this code requires CONFIG_X86_TSC +#else + union longlong_u { + long long intlong; + struct intint_s { + uint32_t eax; + uint32_t edx; + } intint; + } longlong; + + rdtsc(longlong.intint.eax,longlong.intint.edx); + return longlong.intlong; +#endif +} + +#endif /* _I386_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/mach-default/irq_vectors.h 901-mjb1.1/include/asm-i386/mach-default/irq_vectors.h --- 000-virgin/include/asm-i386/mach-default/irq_vectors.h Sun Apr 20 19:35:05 2003 +++ 901-mjb1.1/include/asm-i386/mach-default/irq_vectors.h Wed Aug 13 20:48:49 2003 @@ -68,15 +68,22 @@ #define TIMER_IRQ 0 /* - * 16 8259A IRQ's, 208 potential APIC interrupt sources. - * Right now the APIC is mostly only used for SMP. - * 256 vectors is an architectural limit. (we can have - * more than 256 devices theoretically, but they will - * have to use shared interrupts) + * 16 8259A IRQ's, MAX_IRQ_SOURCES-16 potential APIC + * interrupt sources. Right now the APIC is mostly only + * used for SMP. 256 vectors is an architectural limit. 
+ * (we can have more than 256 devices theoretically, but + * they will have to use shared interrupts) * Since vectors 0x00-0x1f are used/reserved for the CPU, * the usable vector space is 0x20-0xff (224 vectors) + * Linux currently makes 190 vectors available for io interrupts + * starting at FIRST_DEVICE_VECTOR till FIRST_SYSTEM_VECTOR + * + * 0________0x31__________________________0xef_________0xff + * system io interrupts resvd/smp + * */ #ifdef CONFIG_X86_IO_APIC +#define NR_IRQ_VECTORS 190 #define NR_IRQS 224 #else #define NR_IRQS 16 diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/mach-summit/mach_mpparse.h 901-mjb1.1/include/asm-i386/mach-summit/mach_mpparse.h --- 000-virgin/include/asm-i386/mach-summit/mach_mpparse.h Sat Jun 14 18:37:35 2003 +++ 901-mjb1.1/include/asm-i386/mach-summit/mach_mpparse.h Wed Aug 13 20:55:51 2003 @@ -5,6 +5,12 @@ extern int use_cyclone; +#ifdef CONFIG_NUMA +extern void setup_summit(void); +#else /* !CONFIG_NUMA */ +#define setup_summit() {} +#endif /* CONFIG_NUMA */ + static inline void mpc_oem_bus_info(struct mpc_config_bus *m, char *name, struct mpc_config_translation *translation) { @@ -24,6 +30,7 @@ static inline int mps_oem_check(struct m || !strncmp(productid, "EXA", 3) || !strncmp(productid, "RUTHLESS SMP", 12))){ use_cyclone = 1; /*enable cyclone-timer*/ + setup_summit(); return 1; } return 0; @@ -36,6 +43,7 @@ static inline int acpi_madt_oem_check(ch (!strncmp(oem_table_id, "SERVIGIL", 8) || !strncmp(oem_table_id, "EXA", 3))){ use_cyclone = 1; /*enable cyclone-timer*/ + setup_summit(); return 1; } return 0; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/mmzone.h 901-mjb1.1/include/asm-i386/mmzone.h --- 000-virgin/include/asm-i386/mmzone.h Tue Aug 5 19:59:16 2003 +++ 901-mjb1.1/include/asm-i386/mmzone.h Wed Aug 13 20:51:53 2003 @@ -10,7 +10,49 @@ #ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA + #ifdef CONFIG_X86_NUMAQ + #include + #else /* summit or generic arch */ + #include + #endif +#else /* !CONFIG_NUMA */ + #define get_memcfg_numa get_memcfg_numa_flat + #define get_zholes_size(n) (0) +#endif /* CONFIG_NUMA */ + extern struct pglist_data *node_data[]; +#define NODE_DATA(nid) (node_data[nid]) + +/* + * generic node memory support, the following assumptions apply: + * + * 1) memory comes in 256Mb contigious chunks which are either present or not + * 2) we will not have more than 64Gb in total + * + * for now assume that 64Gb is max amount of RAM for whole system + * 64Gb / 4096bytes/page = 16777216 pages + */ +#define MAX_NR_PAGES 16777216 +#define MAX_ELEMENTS 256 +#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS) + +extern u8 physnode_map[]; + +static inline int pfn_to_nid(unsigned long pfn) +{ +#ifdef CONFIG_NUMA + return(physnode_map[(pfn) / PAGES_PER_ELEMENT]); +#else + return 0; +#endif +} + +static inline struct pglist_data *pfn_to_pgdat(unsigned long pfn) +{ + return(NODE_DATA(pfn_to_nid(pfn))); +} + /* * Following are macros that are specific to this numa platform. @@ -43,11 +85,6 @@ extern struct pglist_data *node_data[]; */ #define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) -/* - * Return a pointer to the node data for node n. 
- */ -#define NODE_DATA(nid) (node_data[nid]) - #define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) \ @@ -92,41 +129,6 @@ extern struct pglist_data *node_data[]; * ( pfn_to_pgdat(pfn) && ((pfn) < node_end_pfn(pfn_to_nid(pfn))) ) */ #define pfn_valid(pfn) ((pfn) < num_physpages) - -/* - * generic node memory support, the following assumptions apply: - * - * 1) memory comes in 256Mb contigious chunks which are either present or not - * 2) we will not have more than 64Gb in total - * - * for now assume that 64Gb is max amount of RAM for whole system - * 64Gb / 4096bytes/page = 16777216 pages - */ -#define MAX_NR_PAGES 16777216 -#define MAX_ELEMENTS 256 -#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS) - -extern u8 physnode_map[]; - -static inline int pfn_to_nid(unsigned long pfn) -{ - return(physnode_map[(pfn) / PAGES_PER_ELEMENT]); -} -static inline struct pglist_data *pfn_to_pgdat(unsigned long pfn) -{ - return(NODE_DATA(pfn_to_nid(pfn))); -} - -#ifdef CONFIG_X86_NUMAQ -#include -#elif CONFIG_NUMA /* summit or generic arch */ -#include -#elif CONFIG_X86_PC -#define get_memcfg_numa get_memcfg_numa_flat -#define get_zholes_size(n) (0) -#else -#define pfn_to_nid(pfn) (0) -#endif /* CONFIG_X86_NUMAQ */ #endif /* CONFIG_DISCONTIGMEM */ #endif /* _ASM_MMZONE_H_ */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/numaq.h 901-mjb1.1/include/asm-i386/numaq.h --- 000-virgin/include/asm-i386/numaq.h Mon Mar 17 21:43:48 2003 +++ 901-mjb1.1/include/asm-i386/numaq.h Wed Aug 13 20:48:49 2003 @@ -29,6 +29,8 @@ #ifdef CONFIG_X86_NUMAQ #define MAX_NUMNODES 8 + +#ifndef __ASSEMBLY__ extern void get_memcfg_numaq(void); #define get_memcfg_numa() get_memcfg_numaq() @@ -161,6 +163,7 @@ static inline unsigned long *get_zholes_ { return 0; } +#endif /* __ASSEMBLY__ */ #endif /* CONFIG_X86_NUMAQ */ #endif /* NUMAQ_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/page.h 901-mjb1.1/include/asm-i386/page.h --- 000-virgin/include/asm-i386/page.h Tue Apr 8 14:38:20 2003 +++ 901-mjb1.1/include/asm-i386/page.h Wed Aug 13 20:27:43 2003 @@ -115,9 +115,26 @@ static __inline__ int get_order(unsigned #endif /* __ASSEMBLY__ */ #ifdef __ASSEMBLY__ -#define __PAGE_OFFSET (0xC0000000) +#include +#ifdef CONFIG_05GB +#define __PAGE_OFFSET (0xE0000000) +#elif defined(CONFIG_1GB) +#define __PAGE_OFFSET (0xC0000000) +#elif defined(CONFIG_2GB) +#define __PAGE_OFFSET (0x80000000) +#elif defined(CONFIG_3GB) +#define __PAGE_OFFSET (0x40000000) +#endif #else -#define __PAGE_OFFSET (0xC0000000UL) +#ifdef CONFIG_05GB +#define __PAGE_OFFSET (0xE0000000UL) +#elif defined(CONFIG_1GB) +#define __PAGE_OFFSET (0xC0000000UL) +#elif defined(CONFIG_2GB) +#define __PAGE_OFFSET (0x80000000UL) +#elif defined(CONFIG_3GB) +#define __PAGE_OFFSET (0x40000000UL) +#endif #endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/param.h 901-mjb1.1/include/asm-i386/param.h --- 000-virgin/include/asm-i386/param.h Sun Nov 17 20:29:26 2002 +++ 901-mjb1.1/include/asm-i386/param.h Wed Aug 13 20:27:41 2003 @@ -2,10 +2,18 @@ #define _ASMi386_PARAM_H #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ -# define USER_HZ 100 /* .. some user interfaces are in "ticks" */ -# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ +#include + +#ifdef CONFIG_1000HZ +# define HZ 1000 /* Internal kernel timer frequency */ +#else +# define HZ 100 #endif + +#define USER_HZ 100 /* .. 
some user interfaces are in "ticks" */ +#define CLOCKS_PER_SEC (USER_HZ) /* like times() */ + +#endif /* __KERNEL__ */ #ifndef HZ #define HZ 100 diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/pgtable.h 901-mjb1.1/include/asm-i386/pgtable.h --- 000-virgin/include/asm-i386/pgtable.h Wed Aug 13 20:24:30 2003 +++ 901-mjb1.1/include/asm-i386/pgtable.h Wed Aug 13 20:51:03 2003 @@ -34,9 +34,11 @@ extern unsigned long empty_zero_page[102 extern pgd_t swapper_pg_dir[1024]; extern kmem_cache_t *pgd_cache; extern kmem_cache_t *pmd_cache; +extern kmem_cache_t *kernel_pmd_cache; extern spinlock_t pgd_lock; extern struct list_head pgd_list; +void kernel_pmd_ctor(void *, kmem_cache_t *, unsigned long); void pmd_ctor(void *, kmem_cache_t *, unsigned long); void pgd_ctor(void *, kmem_cache_t *, unsigned long); void pgd_dtor(void *, kmem_cache_t *, unsigned long); @@ -63,7 +65,22 @@ void paging_init(void); #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) +#define __USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) +#define FIRST_KERNEL_PGD_PTR (__USER_PTRS_PER_PGD) +#define PARTIAL_PGD (TASK_SIZE > __USER_PTRS_PER_PGD*PGDIR_SIZE ? 1 : 0) +#define PARTIAL_PMD ((TASK_SIZE % PGDIR_SIZE)/PMD_SIZE) +#define USER_PTRS_PER_PGD (PARTIAL_PGD + __USER_PTRS_PER_PGD) +#ifndef __ASSEMBLY__ +static inline int USER_PTRS_PER_PMD(int pgd_index) { + if (pgd_index < __USER_PTRS_PER_PGD) + return PTRS_PER_PMD; + else if (PARTIAL_PMD && (pgd_index == __USER_PTRS_PER_PGD)) + return (PTRS_PER_PMD-PARTIAL_PMD); + else + return 0; +} +#endif + #define FIRST_USER_PGD_NR 0 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/posix_types.h 901-mjb1.1/include/asm-i386/posix_types.h --- 000-virgin/include/asm-i386/posix_types.h Sun Apr 20 19:35:05 2003 +++ 901-mjb1.1/include/asm-i386/posix_types.h Wed Aug 13 20:48:46 2003 @@ -7,7 +7,7 @@ * assume GCC is being used. */ -typedef unsigned short __kernel_dev_t; +typedef unsigned long __kernel_dev_t; typedef unsigned long __kernel_ino_t; typedef unsigned short __kernel_mode_t; typedef unsigned short __kernel_nlink_t; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/processor.h 901-mjb1.1/include/asm-i386/processor.h --- 000-virgin/include/asm-i386/processor.h Tue Jun 24 21:29:24 2003 +++ 901-mjb1.1/include/asm-i386/processor.h Wed Aug 13 20:29:29 2003 @@ -288,7 +288,11 @@ extern unsigned int mca_pentium_flag; /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ +#ifdef CONFIG_05GB +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 16)) +#else #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) +#endif /* * Size of io_bitmap in longwords: 32 is ports 0-0x3ff. 
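
As an aside on the CONFIG_05GB/1GB/2GB/3GB split options introduced in the page.h and processor.h hunks above, the small user-space sketch below reproduces the resulting mmap search base. It is an illustration only: it assumes the stock i386 identities TASK_SIZE == PAGE_OFFSET == __PAGE_OFFSET, 4k pages, and a PAGE_ALIGN() that rounds up, none of which appears in the hunks themselves.

#include <stdio.h>

int main(void)
{
	/* __PAGE_OFFSET choices from the page.h hunk above */
	unsigned long page_offset[] = { 0xE0000000UL, 0xC0000000UL,
					0x80000000UL, 0x40000000UL };
	const char *cfg[] = { "CONFIG_05GB", "CONFIG_1GB",
			      "CONFIG_2GB", "CONFIG_3GB" };
	int i;

	for (i = 0; i < 4; i++) {
		/* assumption: TASK_SIZE == PAGE_OFFSET on i386 */
		unsigned long task_size = page_offset[i];
		/* TASK_SIZE/16 only under CONFIG_05GB, else TASK_SIZE/3 */
		unsigned long base = task_size / (i == 0 ? 16 : 3);

		base = (base + 4095UL) & ~4095UL;	/* PAGE_ALIGN(), 4k pages */
		printf("%-12s user space ends at 0x%08lx, mmap base 0x%08lx\n",
		       cfg[i], task_size, base);
	}
	return 0;
}

With CONFIG_05GB the mapped-file area starts at 0x0E000000, leaving most of the enlarged (3.5GB) user address space above it for large mappings; the default CONFIG_1GB case keeps the familiar 0x40000000 base.
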
@@ -406,6 +410,9 @@ struct thread_struct { unsigned int saved_fs, saved_gs; /* IO permissions */ unsigned long *ts_io_bitmap; +#ifdef CONFIG_X86_REMOTE_DEBUG + struct pt_regs *kgdbregs; +#endif }; #define INIT_THREAD { \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/rwlock.h 901-mjb1.1/include/asm-i386/rwlock.h --- 000-virgin/include/asm-i386/rwlock.h Sun Nov 17 20:29:57 2002 +++ 901-mjb1.1/include/asm-i386/rwlock.h Wed Aug 13 20:29:33 2003 @@ -20,28 +20,52 @@ #define RW_LOCK_BIAS 0x01000000 #define RW_LOCK_BIAS_STR "0x01000000" -#define __build_read_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $1,(%0)\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_read_lock_const(rw, helper) \ - asm volatile(LOCK "subl $1,%0\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "jns 1f\n\t" \ + "call " helper "\n\t" \ + "1:\t" \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "jns 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\t" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ + #define __build_read_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ @@ -50,28 +74,51 @@ __build_read_lock_ptr(rw, helper); \ } while (0) -#define __build_write_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_write_lock_const(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jz 1f\n\t" \ + "call " helper "\n\t" \ + "1:\n" \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jz 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\n" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + 
LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ #define __build_write_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/segment.h 901-mjb1.1/include/asm-i386/segment.h --- 000-virgin/include/asm-i386/segment.h Tue Feb 25 23:03:50 2003 +++ 901-mjb1.1/include/asm-i386/segment.h Wed Aug 13 20:48:49 2003 @@ -94,5 +94,5 @@ * of tasks we can have.. */ #define IDT_ENTRIES 256 - +#define IDT_SIZE (IDT_ENTRIES * 8) #endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/spinlock.h 901-mjb1.1/include/asm-i386/spinlock.h --- 000-virgin/include/asm-i386/spinlock.h Fri May 30 19:02:20 2003 +++ 901-mjb1.1/include/asm-i386/spinlock.h Wed Aug 13 20:29:36 2003 @@ -43,18 +43,35 @@ typedef struct { #define spin_is_locked(x) (*(volatile signed char *)(&(x)->lock) <= 0) #define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x)) -#define spin_lock_string \ - "\n1:\t" \ - "lock ; decb %0\n\t" \ - "js 2f\n" \ - LOCK_SECTION_START("") \ - "2:\t" \ - "rep;nop\n\t" \ - "cmpb $0,%0\n\t" \ - "jle 2b\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END +#ifdef CONFIG_SPINLINE + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + "jmp 3f\n" \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + "3:\t" + +#else /* !CONFIG_SPINLINE */ + + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + LOCK_SECTION_START("") \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END + +#endif /* CONFIG_SPINLINE */ /* * This works. Despite all the confusion. 
* (except on PPro SMP or if we are using OOSTORE) @@ -138,6 +155,11 @@ here: */ typedef struct { volatile unsigned int lock; +#if CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* and we need this storage for CPU and lock INDEX */ + unsigned lockmeter_magic; +#endif #ifdef CONFIG_DEBUG_SPINLOCK unsigned magic; #endif @@ -145,11 +167,19 @@ typedef struct { #define RWLOCK_MAGIC 0xdeaf1eed +#ifdef CONFIG_LOCKMETER +#if CONFIG_DEBUG_SPINLOCK +#define RWLOCK_MAGIC_INIT , 0, RWLOCK_MAGIC +#else +#define RWLOCK_MAGIC_INIT , 0 +#endif +#else /* !CONFIG_LOCKMETER */ #ifdef CONFIG_DEBUG_SPINLOCK #define RWLOCK_MAGIC_INIT , RWLOCK_MAGIC #else #define RWLOCK_MAGIC_INIT /* */ #endif +#endif /* !CONFIG_LOCKMETER */ #define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT } @@ -195,5 +225,59 @@ static inline int _raw_write_trylock(rwl atomic_add(RW_LOCK_BIAS, count); return 0; } + +#ifdef CONFIG_LOCKMETER +static inline int _raw_read_trylock(rwlock_t *lock) +{ +/* FIXME -- replace with assembler */ + atomic_t *count = (atomic_t *)lock; + atomic_dec(count); + if (count->counter > 0) + return 1; + atomic_inc(count); + return 0; +} +#endif + +#if defined(CONFIG_LOCKMETER) && defined(CONFIG_HAVE_DEC_LOCK) +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock(spinlock_t *lock); + +/* + * Matches what is in arch/i386/lib/dec_and_lock.c, except this one is + * "static inline" so that the spin_lock(), if actually invoked, is charged + * against the real caller, not against the catch-all atomic_dec_and_lock + */ +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +{ + int counter; + int newcount; + +repeat: + counter = atomic_read(atomic); + newcount = counter-1; + + if (!newcount) + goto slow_path; + + asm volatile("lock; cmpxchgl %1,%2" + :"=a" (newcount) + :"r" (newcount), "m" (atomic->counter), "0" (counter)); + + /* If the above failed, "eax" will have changed */ + if (newcount != counter) + goto repeat; + return 0; + +slow_path: + _metered_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + _metered_spin_unlock(lock); + return 0; +} + +#define ATOMIC_DEC_AND_LOCK +#endif #endif /* __ASM_SPINLOCK_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/srat.h 901-mjb1.1/include/asm-i386/srat.h --- 000-virgin/include/asm-i386/srat.h Mon Mar 17 21:43:48 2003 +++ 901-mjb1.1/include/asm-i386/srat.h Wed Aug 13 20:48:49 2003 @@ -28,8 +28,9 @@ #define _ASM_SRAT_H_ #define MAX_NUMNODES 8 +#ifndef __ASSEMBLY__ extern void get_memcfg_from_srat(void); extern unsigned long *get_zholes_size(int); #define get_memcfg_numa() get_memcfg_from_srat() - +#endif #endif /* _ASM_SRAT_H_ */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-i386/unistd.h 901-mjb1.1/include/asm-i386/unistd.h --- 000-virgin/include/asm-i386/unistd.h Tue Aug 5 20:01:43 2003 +++ 901-mjb1.1/include/asm-i386/unistd.h Wed Aug 13 20:51:50 2003 @@ -228,7 +228,7 @@ #define __NR_madvise1 219 /* delete when C lib stub is removed */ #define __NR_getdents64 220 #define __NR_fcntl64 221 -/* 223 is unused */ +#define __NR_mbind 223 #define __NR_gettid 224 #define __NR_readahead 225 #define __NR_setxattr 226 diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-ia64/lockmeter.h 901-mjb1.1/include/asm-ia64/lockmeter.h --- 000-virgin/include/asm-ia64/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/include/asm-ia64/lockmeter.h Wed Aug 13 20:29:36 2003 @@ -0,0 +1,72 @@ +/* + * Copyright (C) 1999,2000 Silicon 
Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + */ + +#ifndef _IA64_LOCKMETER_H +#define _IA64_LOCKMETER_H + +#ifdef local_cpu_data +#define CPU_CYCLE_FREQUENCY local_cpu_data->itc_freq +#else +#define CPU_CYCLE_FREQUENCY my_cpu_data.itc_freq +#endif +#define get_cycles64() get_cycles() + +#define THIS_CPU_NUMBER smp_processor_id() + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. + */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Intel is little endian */ + volatile unsigned short lock; + volatile unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int read_counter:31; + volatile int write_lock:1; + volatile unsigned short index; + volatile unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) ((rwlock_ptr)->read_counter) + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->write_lock) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->read_counter) + +#endif /* _IA64_LOCKMETER_H */ + diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-ia64/pgtable.h 901-mjb1.1/include/asm-ia64/pgtable.h --- 000-virgin/include/asm-ia64/pgtable.h Tue Jun 24 21:29:24 2003 +++ 901-mjb1.1/include/asm-ia64/pgtable.h Wed Aug 13 20:51:03 2003 @@ -92,6 +92,7 @@ #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define PTRS_PER_PGD (__IA64_UL(1) << (PAGE_SHIFT-3)) #define USER_PTRS_PER_PGD (5*PTRS_PER_PGD/8) /* regions 0-4 are user regions */ +#define USER_PTRS_PER_PMD(x) (PTRS_PER_PMD) #define FIRST_USER_PGD_NR 0 /* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-ia64/spinlock.h 901-mjb1.1/include/asm-ia64/spinlock.h --- 000-virgin/include/asm-ia64/spinlock.h Tue Aug 5 20:01:54 2003 +++ 901-mjb1.1/include/asm-ia64/spinlock.h Wed Aug 13 20:29:36 2003 @@ -153,4 +153,25 @@ do { \ clear_bit(31, (x)); \ }) +#ifdef CONFIG_LOCKMETER +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock(spinlock_t *lock); + +/* + * Use a less efficient, and inline, atomic_dec_and_lock() if lockmetering + * so we can see the callerPC of who is actually doing the spin_lock(). 
+ * Otherwise, all we see is the generic rollup of all locks done by + * atomic_dec_and_lock(). + */ +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +{ + _metered_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + _metered_spin_unlock(lock); + return 0; +} +#define ATOMIC_DEC_AND_LOCK +#endif + #endif /* _ASM_IA64_SPINLOCK_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-m68k/pgtable.h 901-mjb1.1/include/asm-m68k/pgtable.h --- 000-virgin/include/asm-m68k/pgtable.h Sat Jun 14 18:37:35 2003 +++ 901-mjb1.1/include/asm-m68k/pgtable.h Wed Aug 13 20:51:03 2003 @@ -58,6 +58,7 @@ #define PTRS_PER_PGD 128 #endif #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) +#define USER_PTRS_PER_PMD(x) (PTRS_PER_PMD) #define FIRST_USER_PGD_NR 0 /* Virtual address region for use by kernel_map() */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-mips/lockmeter.h 901-mjb1.1/include/asm-mips/lockmeter.h --- 000-virgin/include/asm-mips/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/include/asm-mips/lockmeter.h Wed Aug 13 20:29:36 2003 @@ -0,0 +1,126 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * Ported to mips32 for Asita Technologies + * by D.J. Barrow ( dj.barrow@asitatechnologies.com ) + */ +#ifndef _ASM_LOCKMETER_H +#define _ASM_LOCKMETER_H + +/* do_gettimeoffset is a function pointer on mips */ +/* & it is not included by */ +#include +#include +#include + +#define SPINLOCK_MAGIC_INIT /* */ + +#define CPU_CYCLE_FREQUENCY get_cpu_cycle_frequency() + +#define THIS_CPU_NUMBER smp_processor_id() + +static uint32_t cpu_cycle_frequency = 0; + +static uint32_t get_cpu_cycle_frequency(void) +{ + /* a total hack, slow and invasive, but ... it works */ + int sec; + uint32_t start_cycles; + struct timeval tv; + + if (cpu_cycle_frequency == 0) { /* uninitialized */ + do_gettimeofday(&tv); + sec = tv.tv_sec; /* set up to catch the tv_sec rollover */ + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + sec = tv.tv_sec; /* rolled over to a new sec value */ + start_cycles = get_cycles(); + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + cpu_cycle_frequency = get_cycles() - start_cycles; + } + + return cpu_cycle_frequency; +} + +extern struct timeval xtime; + +static uint64_t get_cycles64(void) +{ + static uint64_t last_get_cycles64 = 0; + uint64_t ret; + unsigned long sec; + unsigned long usec, usec_offset; + +again: + sec = xtime.tv_sec; + usec = xtime.tv_usec; + usec_offset = do_gettimeoffset(); + if ((xtime.tv_sec != sec) || + (xtime.tv_usec != usec)|| + (usec_offset >= 20000)) + goto again; + + ret = ((uint64_t)(usec + usec_offset) * cpu_cycle_frequency); + /* We can't do a normal 64 bit division on mips without libgcc.a */ + do_div(ret,1000000); + ret += ((uint64_t)sec * cpu_cycle_frequency); + + /* XXX why does time go backwards? do_gettimeoffset? general time adj? */ + if (ret <= last_get_cycles64) + ret = last_get_cycles64+1; + last_get_cycles64 = ret; + + return ret; +} + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. 
+ * the high order bit is used for write locking a rw_lock + */ +#define INDEX_MASK 0x7FFF0000 +#define READERS_MASK 0x0000FFFF +#define INDEX_SHIFT 16 +#define PUT_INDEX(lockp,index) \ + lockp->lock = (((lockp->lock) & ~INDEX_MASK) | (index) << INDEX_SHIFT) +#define GET_INDEX(lockp) \ + (((lockp->lock) & INDEX_MASK) >> INDEX_SHIFT) + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + return (tmp >= 0) ? tmp : 0; +} + +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock < 0) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock > 0) + +#endif /* _ASM_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-mips/spinlock.h 901-mjb1.1/include/asm-mips/spinlock.h --- 000-virgin/include/asm-mips/spinlock.h Wed Jul 2 21:59:13 2003 +++ 901-mjb1.1/include/asm-mips/spinlock.h Wed Aug 13 20:29:36 2003 @@ -91,9 +91,18 @@ static inline unsigned int _raw_spin_try typedef struct { volatile unsigned int lock; +#if CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* and we need this storage for CPU and lock INDEX */ + unsigned lockmeter_magic; +#endif } rwlock_t; +#ifdef CONFIG_LOCKMETER +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 } +#else #define RW_LOCK_UNLOCKED (rwlock_t) { 0 } +#endif #define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-mips64/lockmeter.h 901-mjb1.1/include/asm-mips64/lockmeter.h --- 000-virgin/include/asm-mips64/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/include/asm-mips64/lockmeter.h Wed Aug 13 20:29:36 2003 @@ -0,0 +1,120 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + */ + +#ifndef _ASM_LOCKMETER_H +#define _ASM_LOCKMETER_H + +#include + +#define SPINLOCK_MAGIC_INIT /* */ + +#define CPU_CYCLE_FREQUENCY get_cpu_cycle_frequency() + +#define THIS_CPU_NUMBER smp_processor_id() + +static uint32_t cpu_cycle_frequency = 0; + +static uint32_t get_cpu_cycle_frequency(void) +{ + /* a total hack, slow and invasive, but ... 
it works */ + int sec; + uint32_t start_cycles; + struct timeval tv; + + if (cpu_cycle_frequency == 0) { /* uninitialized */ + do_gettimeofday(&tv); + sec = tv.tv_sec; /* set up to catch the tv_sec rollover */ + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + sec = tv.tv_sec; /* rolled over to a new sec value */ + start_cycles = get_cycles(); + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + cpu_cycle_frequency = get_cycles() - start_cycles; + } + + return cpu_cycle_frequency; +} + +extern struct timeval xtime; +extern long do_gettimeoffset(void); + +static uint64_t get_cycles64(void) +{ + static uint64_t last_get_cycles64 = 0; + uint64_t ret; + unsigned long sec; + unsigned long usec, usec_offset; + +again: + sec = xtime.tv_sec; + usec = xtime.tv_usec; + usec_offset = do_gettimeoffset(); + if ((xtime.tv_sec != sec) || + (xtime.tv_usec != usec)|| + (usec_offset >= 20000)) + goto again; + + ret = ((uint64_t)sec * cpu_cycle_frequency) + + ( ((uint64_t)(usec + usec_offset) * cpu_cycle_frequency) / 1000000 ); + + /* XXX why does time go backwards? do_gettimeoffset? general time adj? */ + if (ret <= last_get_cycles64) + ret = last_get_cycles64+1; + last_get_cycles64 = ret; + + return ret; +} + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. + * the high order bit is used for write locking a rw_lock + */ +#define INDEX_MASK 0x7FFF0000 +#define READERS_MASK 0x0000FFFF +#define INDEX_SHIFT 16 +#define PUT_INDEX(lockp,index) \ + lockp->lock = (((lockp->lock) & ~INDEX_MASK) | (index) << INDEX_SHIFT) +#define GET_INDEX(lockp) \ + (((lockp->lock) & INDEX_MASK) >> INDEX_SHIFT) + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + return (tmp >= 0) ? 
tmp : 0; +} + +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock < 0) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock > 0) + +#endif /* _ASM_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-parisc/pgtable.h 901-mjb1.1/include/asm-parisc/pgtable.h --- 000-virgin/include/asm-parisc/pgtable.h Tue Aug 5 20:01:43 2003 +++ 901-mjb1.1/include/asm-parisc/pgtable.h Wed Aug 13 20:51:19 2003 @@ -81,6 +81,7 @@ #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define PTRS_PER_PGD (1UL << (PAGE_SHIFT - PT_NLEVELS)) #define USER_PTRS_PER_PGD PTRS_PER_PGD +#define USER_PTRS_PER_PMD(x) (PTRS_PER_PMD) /* Definitions for 2nd level */ #define pgtable_cache_init() do { } while (0) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-ppc/pgtable.h 901-mjb1.1/include/asm-ppc/pgtable.h --- 000-virgin/include/asm-ppc/pgtable.h Sat Jun 14 18:37:36 2003 +++ 901-mjb1.1/include/asm-ppc/pgtable.h Wed Aug 13 20:51:19 2003 @@ -83,6 +83,7 @@ extern unsigned long ioremap_bot, iorema #define PTRS_PER_PMD 1 #define PTRS_PER_PGD 1024 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) +#define USER_PTRS_PER_PMD(x) (PTRS_PER_PMD) #define FIRST_USER_PGD_NR 0 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-ppc64/pgtable.h 901-mjb1.1/include/asm-ppc64/pgtable.h --- 000-virgin/include/asm-ppc64/pgtable.h Sat Jun 14 18:37:36 2003 +++ 901-mjb1.1/include/asm-ppc64/pgtable.h Wed Aug 13 20:51:19 2003 @@ -36,6 +36,7 @@ #define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) #define USER_PTRS_PER_PGD (1024) +#define USER_PTRS_PER_PMD(x) (PTRS_PER_PMD) #define FIRST_USER_PGD_NR 0 #define EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-sh/pgtable.h 901-mjb1.1/include/asm-sh/pgtable.h --- 000-virgin/include/asm-sh/pgtable.h Wed Jul 2 21:59:15 2003 +++ 901-mjb1.1/include/asm-sh/pgtable.h Wed Aug 13 20:51:19 2003 @@ -41,6 +41,7 @@ extern unsigned long empty_zero_page[102 #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) +#define USER_PTRS_PER_PMD(x) (PTRS_PER_PMD) #define FIRST_USER_PGD_NR 0 #define PTE_PHYS_MASK 0x1ffff000 diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-sparc/pgtable.h 901-mjb1.1/include/asm-sparc/pgtable.h --- 000-virgin/include/asm-sparc/pgtable.h Sat May 10 18:35:03 2003 +++ 901-mjb1.1/include/asm-sparc/pgtable.h Wed Aug 13 20:51:19 2003 @@ -125,6 +125,7 @@ BTFIXUPDEF_INT(page_kernel) #define PTRS_PER_PMD BTFIXUP_SIMM13(ptrs_per_pmd) #define PTRS_PER_PGD BTFIXUP_SIMM13(ptrs_per_pgd) #define USER_PTRS_PER_PGD BTFIXUP_SIMM13(user_ptrs_per_pgd) +#define USER_PTRS_PER_PMD(x) (PTRS_PER_PMD) #define FIRST_USER_PGD_NR 0 #define PAGE_NONE __pgprot(BTFIXUP_INT(page_none)) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-sparc64/lockmeter.h 901-mjb1.1/include/asm-sparc64/lockmeter.h --- 000-virgin/include/asm-sparc64/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/include/asm-sparc64/lockmeter.h Wed Aug 13 20:29:36 2003 @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2000 Anton Blanchard (anton@linuxcare.com) + */ + +#ifndef _SPARC64_LOCKMETER_H +#define _SPARC64_LOCKMETER_H + +#include + +#include + +extern unsigned long cpu_hz; +#define CPU_CYCLE_FREQUENCY cpu_hz + +#define THIS_CPU_NUMBER __cpu_number_map[smp_processor_id()] + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0) +#define local_irq_save(x) __save_and_cli(x) +#define local_irq_restore(x) __restore_flags(x) +#endif /* Linux version 2.2.x 
*/ + +#define PUT_INDEX(lock_ptr,indexv) (lock_ptr)->index = (indexv) +#define GET_INDEX(lock_ptr) (lock_ptr)->index + +#define PUT_RWINDEX(rwlock_ptr,indexv) (rwlock_ptr)->index = (indexv) +#define GET_RWINDEX(rwlock_ptr) (rwlock_ptr)->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) (rwlock_ptr)->cpu = (cpuv) +#define GET_RW_CPU(rwlock_ptr) (rwlock_ptr)->cpu + +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + signed int tmp = rwlock_ptr->lock; + + if (tmp > 0) + return tmp; + else + return 0; +} + +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((signed int)((rwlock_ptr)->lock) < 0) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((signed int)((rwlock_ptr)->lock) > 0) + +#define get_cycles64() get_cycles() + +#endif /* _SPARC64_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-sparc64/pgtable.h 901-mjb1.1/include/asm-sparc64/pgtable.h --- 000-virgin/include/asm-sparc64/pgtable.h Wed Mar 26 22:54:37 2003 +++ 901-mjb1.1/include/asm-sparc64/pgtable.h Wed Aug 13 20:51:19 2003 @@ -93,6 +93,7 @@ /* Kernel has a separate 44bit address space. */ #define USER_PTRS_PER_PGD ((const int)(test_thread_flag(TIF_32BIT)) ? \ (1) : (PTRS_PER_PGD)) +#define USER_PTRS_PER_PMD(x) (PTRS_PER_PMD) #define FIRST_USER_PGD_NR 0 #define pte_ERROR(e) __builtin_trap() diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-sparc64/spinlock.h 901-mjb1.1/include/asm-sparc64/spinlock.h --- 000-virgin/include/asm-sparc64/spinlock.h Sun Nov 17 20:29:27 2002 +++ 901-mjb1.1/include/asm-sparc64/spinlock.h Wed Aug 13 20:29:36 2003 @@ -30,15 +30,23 @@ #ifndef CONFIG_DEBUG_SPINLOCK -typedef unsigned char spinlock_t; -#define SPIN_LOCK_UNLOCKED 0 +typedef struct { + unsigned char lock; + unsigned int index; +} spinlock_t; -#define spin_lock_init(lock) (*((unsigned char *)(lock)) = 0) -#define spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0) +#ifdef CONFIG_LOCKMETER +#define SPIN_LOCK_UNLOCKED (spinlock_t) {0, 0} +#else +#define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 } +#endif + +#define spin_lock_init(__lock) do { *(__lock) = SPIN_LOCK_UNLOCKED; } while(0) +#define spin_is_locked(__lock) (*((volatile unsigned char *)(&((__lock)->lock))) != 0) -#define spin_unlock_wait(lock) \ +#define spin_unlock_wait(__lock) \ do { membar("#LoadLoad"); \ -} while(*((volatile unsigned char *)lock)) +} while(*((volatile unsigned char *)(&(((spinlock_t *)__lock)->lock)))) static __inline__ void _raw_spin_lock(spinlock_t *lock) { @@ -109,8 +117,20 @@ extern int _spin_trylock (spinlock_t *lo #ifndef CONFIG_DEBUG_SPINLOCK -typedef unsigned int rwlock_t; -#define RW_LOCK_UNLOCKED 0 +#ifdef CONFIG_LOCKMETER +typedef struct { + unsigned int lock; + unsigned int index; + unsigned int cpu; +} rwlock_t; +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, 0xff } +#else +typedef struct { + unsigned int lock; +} rwlock_t; +#define RW_LOCK_UNLOCKED (rwlock_t) { 0 } +#endif + #define rwlock_init(lp) do { *(lp) = RW_LOCK_UNLOCKED; } while(0) #define rwlock_is_locked(x) (*(x) != RW_LOCK_UNLOCKED) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-um/pgtable.h 901-mjb1.1/include/asm-um/pgtable.h --- 000-virgin/include/asm-um/pgtable.h Fri May 30 19:02:21 2003 +++ 901-mjb1.1/include/asm-um/pgtable.h Wed Aug 13 20:51:19 2003 @@ -40,6 +40,7 @@ extern unsigned long *empty_zero_page; #define PTRS_PER_PMD 1 #define PTRS_PER_PGD 1024 #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) +#define USER_PTRS_PER_PMD(x) (PTRS_PER_PMD) #define FIRST_USER_PGD_NR 0 
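
/*
 * Illustration, not part of the patch: the one-line USER_PTRS_PER_PMD(x)
 * stubs added for ia64, m68k, parisc, ppc, ppc64, sh, sparc, sparc64, um
 * and x86_64 all expand to PTRS_PER_PMD; only the i386 version earlier in
 * this patch can return a smaller count, for the pgd slot that straddles
 * the user/kernel boundary.  A hypothetical caller would use the hook per
 * pgd slot, roughly like this (the function and its name are made up for
 * illustration):
 */
static unsigned long count_user_pmd_slots(void)
{
	unsigned long n = 0;
	int i;

	for (i = 0; i < USER_PTRS_PER_PGD; i++)
		n += USER_PTRS_PER_PMD(i);	/* PTRS_PER_PMD on most arches */

	return n;
}
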
#define pte_ERROR(e) \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-x86_64/early_printk.h 901-mjb1.1/include/asm-x86_64/early_printk.h --- 000-virgin/include/asm-x86_64/early_printk.h Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/include/asm-x86_64/early_printk.h Wed Aug 13 21:05:43 2003 @@ -0,0 +1,8 @@ +#ifndef __X86_EARLY_PRINTK_H_X86_64_ +#define __X86_EARLY_PRINTK_H_X86_64_ + +#define VGABASE 0xffffffff800b8000UL +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/asm-x86_64/pgtable.h 901-mjb1.1/include/asm-x86_64/pgtable.h --- 000-virgin/include/asm-x86_64/pgtable.h Wed Jul 2 21:59:15 2003 +++ 901-mjb1.1/include/asm-x86_64/pgtable.h Wed Aug 13 20:51:19 2003 @@ -112,6 +112,7 @@ static inline void set_pml4(pml4_t *dst, #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) +#define USER_PTRS_PER_PMD(x) (PTRS_PER_PMD) #define FIRST_USER_PGD_NR 0 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/compiler.h 901-mjb1.1/include/linux/compiler.h --- 000-virgin/include/linux/compiler.h Tue Aug 5 19:59:16 2003 +++ 901-mjb1.1/include/linux/compiler.h Wed Aug 13 20:47:23 2003 @@ -78,6 +78,6 @@ shouldn't recognize the original var, and make assumptions about it */ #define RELOC_HIDE(ptr, off) \ ({ unsigned long __ptr; \ - __asm__ ("" : "=g"(__ptr) : "0"(ptr)); \ + __asm__ ("" : "=r"(__ptr) : "0"(ptr)); \ (typeof(ptr)) (__ptr + (off)); }) #endif /* __LINUX_COMPILER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/early_printk.h 901-mjb1.1/include/linux/early_printk.h --- 000-virgin/include/linux/early_printk.h Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/include/linux/early_printk.h Wed Aug 13 21:05:43 2003 @@ -0,0 +1,47 @@ +#ifndef __X86_EARLY_PRINTK_H_ +#define __X86_EARLY_PRINTK_H_ + +#ifdef CONFIG_X86_EARLY_PRINTK +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +/* Simple serial port output */ + +#define DEFAULT_BAUD 57600 +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + + +void early_printk(const char *fmt, ...); +int __init setup_early_printk(); + +#else + +#define early_printk(...) 
do {} while(0) +#define setup_early_printk() do {} while(0) + +#endif + +#endif diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/fs.h 901-mjb1.1/include/linux/fs.h --- 000-virgin/include/linux/fs.h Wed Aug 13 20:24:32 2003 +++ 901-mjb1.1/include/linux/fs.h Wed Aug 13 20:51:50 2003 @@ -332,6 +332,9 @@ struct address_space { spinlock_t private_lock; /* for use by the address_space */ struct list_head private_list; /* ditto */ struct address_space *assoc_mapping; /* ditto */ +#ifdef CONFIG_NUMA + struct binding *binding; /* for memory bindings */ +#endif }; struct block_device { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/gdb.h 901-mjb1.1/include/linux/gdb.h --- 000-virgin/include/linux/gdb.h Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/include/linux/gdb.h Wed Aug 13 20:29:29 2003 @@ -0,0 +1,67 @@ +#ifndef _GDB_H_ +#define _GDB_H_ + +/* + * Copyright (C) 2001 Amit S. Kale + */ + +/* gdb locks */ +#define KGDB_MAX_NO_CPUS NR_CPUS + +extern int gdb_enter; /* 1 = enter debugger on boot */ +extern int gdb_ttyS; +extern int gdb_baud; +extern int gdb_initialized; + +extern int gdb_hook(void); +extern void breakpoint(void); + +typedef int gdb_debug_hook(int trapno, + int signo, + int err_code, + struct pt_regs *regs); +extern gdb_debug_hook *linux_debug_hook; + +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +extern spinlock_t kgdb_nmispinlock; +#else +extern unsigned kgdb_spinlock; +extern unsigned kgdb_nmispinlock; +#endif + +extern volatile int kgdb_memerr_expected; + +struct console; +void gdb_console_write(struct console *co, const char *s, + unsigned count); +void gdb_console_init(void); + +extern volatile int procindebug[KGDB_MAX_NO_CPUS]; + +#define KGDB_ASSERT(message, condition) do { \ + if (!(condition)) { \ + printk("kgdb assertion failed: %s\n", message); \ + asm ("int $0x3"); \ + } \ +} while (0) + +#ifdef CONFIG_KERNEL_ASSERTS +#define KERNEL_ASSERT(message, condition) KGDB_ASSERT(message, condition) +#else +#define KERNEL_ASSERT(message, condition) +#endif + +#define KA_VALID_ERRNO(errno) ((errno) > 0 && (errno) <= EMEDIUMTYPE) + +#define KA_VALID_PTR_ERR(ptr) KA_VALID_ERRNO(-PTR_ERR(ptr)) + +#define KA_VALID_KPTR(ptr) (!(ptr) || \ + ((void *)(ptr) >= (void *)PAGE_OFFSET && \ + (void *)(ptr) < ERR_PTR(-EMEDIUMTYPE))) + +#define KA_VALID_PTRORERR(errptr) (KA_VALID_KPTR(errptr) || KA_VALID_PTR_ERR(errptr)) + +#define KA_HELD_GKL() (current->lock_depth >= 0) + +#endif /* _GDB_H_ */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/gfp.h 901-mjb1.1/include/linux/gfp.h --- 000-virgin/include/linux/gfp.h Tue Jun 24 21:29:25 2003 +++ 901-mjb1.1/include/linux/gfp.h Wed Aug 13 21:09:02 2003 @@ -32,6 +32,7 @@ #define __GFP_NOFAIL 0x800 /* Retry for ever. Cannot fail */ #define __GFP_NORETRY 0x1000 /* Do not retry. 
Might fail */ #define __GFP_NO_GROW 0x2000 /* Slab internal usage */ +#define __GFP_NODE_STRICT 0x4000 /* Do not fall back to other nodes */ #define GFP_ATOMIC (__GFP_HIGH) #define GFP_NOIO (__GFP_WAIT) @@ -66,7 +67,7 @@ static inline struct page * alloc_pages_ if (unlikely(order >= MAX_ORDER)) return NULL; - return __alloc_pages(gfp_mask, order, NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK)); + return __alloc_pages(gfp_mask, order, get_node_zonelist(nid, gfp_mask)); } #define alloc_pages(gfp_mask, order) \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/kdev_t.h 901-mjb1.1/include/linux/kdev_t.h --- 000-virgin/include/linux/kdev_t.h Wed Aug 13 20:24:32 2003 +++ 901-mjb1.1/include/linux/kdev_t.h Wed Aug 13 20:48:46 2003 @@ -70,13 +70,13 @@ aeb - 950811 * static arrays, and they are sized for a 8-bit index. */ typedef struct { - unsigned short value; + unsigned int value; } kdev_t; -#define KDEV_MINOR_BITS 8 -#define KDEV_MAJOR_BITS 8 +#define KDEV_MINOR_BITS 16 +#define KDEV_MAJOR_BITS 16 -#define __mkdev(major,minor) (((major) << KDEV_MINOR_BITS) + (minor)) +#define __mkdev(major, minor) (((major) << KDEV_MINOR_BITS) + (minor)) #define mk_kdev(major, minor) ((kdev_t) { __mkdev(major,minor) } ) @@ -107,17 +107,55 @@ static inline int kdev_same(kdev_t dev1, #define kdev_none(d1) (!kdev_val(d1)) -/* Mask off the high bits for now.. */ -#define minor(dev) ((dev).value & 0xff) -#define major(dev) (((dev).value >> KDEV_MINOR_BITS) & 0xff) +#define minor(dev) ((dev).value & 0xffff) +#define major(dev) (((dev).value >> KDEV_MINOR_BITS) & 0xffff) /* These are for user-level "dev_t" */ +/* Since glibc uses 8+8 in , we'll get + incompatibilities with a simple scheme like 12+20. + Use 8+8 for 16-bit values, some other division, say 16+16, + for 32-bit values. */ #define MINORBITS 8 #define MINORMASK ((1U << MINORBITS) - 1) -#define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS)) -#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK)) -#define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi)) +#include /* dev_t */ +#if 1 +/* macro versions */ + +#define MAJOR(dev) ((unsigned int)(((dev) & 0xffff0000) ? ((dev) >> 16) & 0xffff : ((dev) >> 8) & 0xff)) +#define MINOR(dev) ((unsigned int)(((dev) & 0xffff0000) ? ((dev) & 0xffff) : ((dev) & 0xff))) +#define MKDEV(ma,mi) ((dev_t)((((ma) & ~0xff) == 0 && ((mi) & ~0xff) == 0) ? 
(((ma) << 8) | (mi)) : (((ma) << 16) | (mi)))) + +#else +/* inline function versions */ + +static inline unsigned int +MAJOR(dev_t dev) { + unsigned int ma; + + ma = ((dev >> 16) & 0xffff); + if (ma == 0) + ma = ((dev >> 8) & 0xff); + return ma; +} + +static inline unsigned int +MINOR(dev_t dev) { + unsigned int mi; + + mi = (dev & 0xffff); + if (mi == dev) + mi = (dev & 0xff); + return mi; +} + +static inline dev_t +MKDEV(unsigned int ma, unsigned int mi) { + if ((ma & ~0xff) == 0 && (mi & ~0xff) == 0) + return ((ma << 8) | mi); + return ((ma << 16) | mi); +} +#endif /* * Conversion functions @@ -125,12 +163,16 @@ static inline int kdev_same(kdev_t dev1, static inline int kdev_t_to_nr(kdev_t dev) { - return MKDEV(major(dev), minor(dev)); + unsigned int ma = major(dev); + unsigned int mi = minor(dev); + return MKDEV(ma, mi); } -static inline kdev_t to_kdev_t(int dev) +static inline kdev_t to_kdev_t(dev_t dev) { - return mk_kdev(MAJOR(dev),MINOR(dev)); + unsigned int ma = MAJOR(dev); + unsigned int mi = MINOR(dev); + return mk_kdev(ma, mi); } #define print_dev_t(buffer, dev) \ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/lockmeter.h 901-mjb1.1/include/linux/lockmeter.h --- 000-virgin/include/linux/lockmeter.h Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/include/linux/lockmeter.h Wed Aug 13 20:29:36 2003 @@ -0,0 +1,320 @@ +/* + * Copyright (C) 1999-2002 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) Feb-Apr 2000 + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks + * Added support for hold time statistics for read and write + * locks. + * Moved machine dependent code to include/asm/lockmeter.h. + * + */ + +#ifndef _LINUX_LOCKMETER_H +#define _LINUX_LOCKMETER_H + + +/*--------------------------------------------------- + * architecture-independent lockmeter.h + *-------------------------------------------------*/ + +/* + * raybry -- version 2: added efficient hold time statistics + * requires lstat recompile, so flagged as new version + * raybry -- version 3: added global reader lock data + * hawkes -- version 4: removed some unnecessary fields to simplify mips64 port + */ +#define LSTAT_VERSION 5 + +int lstat_update(void*, void*, int); +int lstat_update_time(void*, void*, int, uint32_t); + +/* + * Currently, the mips64 and sparc64 kernels talk to a 32-bit lockstat, so we + * need to force compatibility in the inter-communication data structure. + */ + +#if defined(CONFIG_MIPS32_COMPAT) +#define TIME_T uint32_t +#elif defined(CONFIG_SPARC32_COMPAT) +#define TIME_T uint64_t +#else +#define TIME_T time_t +#endif + +#if defined(__KERNEL__) || (!defined(CONFIG_MIPS32_COMPAT) && !defined(CONFIG_SPARC32_COMPAT)) || (_MIPS_SZLONG==32) +#define POINTER void * +#else +#define POINTER int64_t +#endif + +/* + * Values for the "action" parameter passed to lstat_update. + * ZZZ - do we want a try-success status here??? + */ +#define LSTAT_ACT_NO_WAIT 0 +#define LSTAT_ACT_SPIN 1 +#define LSTAT_ACT_REJECT 2 +#define LSTAT_ACT_WW_SPIN 3 +#define LSTAT_ACT_SLEPT 4 /* UNUSED */ + +#define LSTAT_ACT_MAX_VALUES 4 /* NOTE: Increase to 5 if use ACT_SLEPT */ + +/* + * Special values for the low 2 bits of an RA passed to + * lstat_update. + */ +/* we use these values to figure out what kind of lock data */ +/* is stored in the statistics table entry at index ....... 
*/ +#define LSTAT_RA_SPIN 0 /* spin lock data */ +#define LSTAT_RA_READ 1 /* read lock statistics */ +#define LSTAT_RA_SEMA 2 /* RESERVED */ +#define LSTAT_RA_WRITE 3 /* write lock statistics*/ + +#define LSTAT_RA(n) \ + ((void*)( ((unsigned long)__builtin_return_address(0) & ~3) | n) ) + +/* + * Constants used for lock addresses in the lstat_directory + * to indicate special values of the lock address. + */ +#define LSTAT_MULTI_LOCK_ADDRESS NULL + +/* + * Maximum size of the lockstats tables. Increase this value + * if its not big enough. (Nothing bad happens if its not + * big enough although some locks will not be monitored.) + * We record overflows of this quantity in lstat_control.dir_overflows + * + * Note: The max value here must fit into the field set + * and obtained by the macro's PUT_INDEX() and GET_INDEX(). + * This value depends on how many bits are available in the + * lock word in the particular machine implementation we are on. + */ +#define LSTAT_MAX_STAT_INDEX 2000 + +/* + * Size and mask for the hash table into the directory. + */ +#define LSTAT_HASH_TABLE_SIZE 4096 /* must be 2**N */ +#define LSTAT_HASH_TABLE_MASK (LSTAT_HASH_TABLE_SIZE-1) + +#define DIRHASH(ra) ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK) + +/* + * This defines an entry in the lockstat directory. It contains + * information about a lock being monitored. + * A directory entry only contains the lock identification - + * counts on usage of the lock are kept elsewhere in a per-cpu + * data structure to minimize cache line pinging. + */ +typedef struct { + POINTER caller_ra; /* RA of code that set lock */ + POINTER lock_ptr; /* lock address */ + ushort next_stat_index; /* Used to link multiple locks that have the same hash table value */ +} lstat_directory_entry_t; + +/* + * A multi-dimensioned array used to contain counts for lock accesses. + * The array is 3-dimensional: + * - CPU number. Keep from thrashing cache lines between CPUs + * - Directory entry index. Identifies the lock + * - Action. Indicates what kind of contention occurred on an + * access to the lock. + * + * The index of an entry in the directory is the same as the 2nd index + * of the entry in the counts array. + */ +/* + * This table contains data for spin_locks, write locks, and read locks + * Not all data is used for all cases. In particular, the hold time + * information is not stored here for read locks since that is a global + * (e. g. cannot be separated out by return address) quantity. + * See the lstat_read_lock_counts_t structure for the global read lock + * hold time. + */ +typedef struct { + uint64_t cum_wait_ticks; /* sum of wait times */ + /* for write locks, sum of time a */ + /* writer is waiting for a reader */ + int64_t cum_hold_ticks; /* cumulative sum of holds */ + /* not used for read mode locks */ + /* must be signed. ............... 
*/ + uint32_t max_wait_ticks; /* max waiting time */ + uint32_t max_hold_ticks; /* max holding time */ + uint64_t cum_wait_ww_ticks; /* sum times writer waits on writer*/ + uint32_t max_wait_ww_ticks; /* max wait time writer vs writer */ + /* prev 2 only used for write locks*/ + uint32_t acquire_time; /* time lock acquired this CPU */ + uint32_t count[LSTAT_ACT_MAX_VALUES]; +} lstat_lock_counts_t; + +typedef lstat_lock_counts_t lstat_cpu_counts_t[LSTAT_MAX_STAT_INDEX]; + +/* + * User request to: + * - turn statistic collection on/off, or to reset + */ +#define LSTAT_OFF 0 +#define LSTAT_ON 1 +#define LSTAT_RESET 2 +#define LSTAT_RELEASE 3 + +#define LSTAT_MAX_READ_LOCK_INDEX 1000 +typedef struct { + POINTER lock_ptr; /* address of lock for output stats */ + uint32_t read_lock_count; + int64_t cum_hold_ticks; /* sum of read lock hold times over */ + /* all callers. ....................*/ + uint32_t write_index; /* last write lock hash table index */ + uint32_t busy_periods; /* count of busy periods ended this */ + uint64_t start_busy; /* time this busy period started. ..*/ + uint64_t busy_ticks; /* sum of busy periods this lock. ..*/ + uint64_t max_busy; /* longest busy period for this lock*/ + uint32_t max_readers; /* maximum number of readers ...... */ +#ifdef USER_MODE_TESTING + rwlock_t entry_lock; /* lock for this read lock entry... */ + /* avoid having more than one rdr at*/ + /* needed for user space testing... */ + /* not needed for kernel 'cause it */ + /* is non-preemptive. ............. */ +#endif +} lstat_read_lock_counts_t; +typedef lstat_read_lock_counts_t lstat_read_lock_cpu_counts_t[LSTAT_MAX_READ_LOCK_INDEX]; + +#if defined(__KERNEL__) || defined(USER_MODE_TESTING) + +#ifndef USER_MODE_TESTING +#include +#else +#include "asm_newlockmeter.h" +#endif + +/* + * Size and mask for the hash table into the directory. + */ +#define LSTAT_HASH_TABLE_SIZE 4096 /* must be 2**N */ +#define LSTAT_HASH_TABLE_MASK (LSTAT_HASH_TABLE_SIZE-1) + +#define DIRHASH(ra) ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK) + +/* + * This version eliminates the per processor lock stack. What we do is to + * store the index of the lock hash structure in unused bits in the lock + * itself. Then on unlock we can find the statistics record without doing + * any additional hash or lock stack lookup. This works for spin_locks. + * Hold time reporting is now basically as cheap as wait time reporting + * so we ignore the difference between LSTAT_ON_HOLD and LSTAT_ON_WAIT + * as in version 1.1.* of lockmeter. + * + * For rw_locks, we store the index of a global reader stats structure in + * the lock and the writer index is stored in the latter structure. + * For read mode locks we hash at the time of the lock to find an entry + * in the directory for reader wait time and the like. + * At unlock time for read mode locks, we update just the global structure + * so we don't need to know the reader directory index value at unlock time. + * + */ + +/* + * Protocol to change lstat_control.state + * This is complicated because we don't want the cum_hold_time for + * a rw_lock to be decremented in _read_lock_ without making sure it + * is incremented in _read_lock_ and vice versa. So here is the + * way we change the state of lstat_control.state: + * I. To Turn Statistics On + * After allocating storage, set lstat_control.state non-zero. + * This works because we don't start updating statistics for in use + * locks until the reader lock count goes to zero. + * II. 
To Turn Statistics Off: + * (0) Disable interrupts on this CPU + * (1) Seize the lstat_control.directory_lock + * (2) Obtain the current value of lstat_control.next_free_read_lock_index + * (3) Store a zero in lstat_control.state. + * (4) Release the lstat_control.directory_lock + * (5) For each lock in the read lock list up to the saved value + * (well, -1) of the next_free_read_lock_index, do the following: + * (a) Check validity of the stored lock address + * by making sure that the word at the saved addr + * has an index that matches this entry. If not + * valid, then skip this entry. + * (b) If there is a write lock already set on this lock, + * skip to (d) below. + * (c) Set a non-metered write lock on the lock + * (d) set the cached INDEX in the lock to zero + * (e) Release the non-metered write lock. + * (6) Re-enable interrupts + * + * These rules ensure that a read lock will not have its statistics + * partially updated even though the global lock recording state has + * changed. See put_lockmeter_info() for implementation. + * + * The reason for (b) is that there may be write locks set on the + * syscall path to put_lockmeter_info() from user space. If we do + * not do this check, then we can deadlock. A similar problem would + * occur if the lock was read locked by the current CPU. At the + * moment this does not appear to happen. + */ + +/* + * Main control structure for lockstat. Used to turn statistics on/off + * and to maintain directory info. + */ +typedef struct { + int state; + spinlock_t control_lock; /* used to serialize turning statistics on/off */ + spinlock_t directory_lock; /* for serialize adding entries to directory */ + volatile int next_free_dir_index;/* next free entry in the directory */ + /* FIXME not all of these fields are used / needed .............. */ + /* the following fields represent data since */ + /* first "lstat on" or most recent "lstat reset" */ + TIME_T first_started_time; /* time when measurement first enabled */ + TIME_T started_time; /* time when measurement last started */ + TIME_T ending_time; /* time when measurement last disabled */ + uint64_t started_cycles64; /* cycles when measurement last started */ + uint64_t ending_cycles64; /* cycles when measurement last disabled */ + uint64_t enabled_cycles64; /* total cycles with measurement enabled */ + int intervals; /* number of measurement intervals recorded */ + /* i. e. 
number of times did lstat on;lstat off */ + lstat_directory_entry_t *dir; /* directory */ + int dir_overflow; /* count of times ran out of space in directory */ + int rwlock_overflow; /* count of times we couldn't allocate a rw block*/ + ushort *hashtab; /* hash table for quick dir scans */ + lstat_cpu_counts_t *counts[NR_CPUS]; /* Array of pointers to per-cpu stats */ + int next_free_read_lock_index; /* next rwlock reader (global) stats block */ + lstat_read_lock_cpu_counts_t *read_lock_counts[NR_CPUS]; /* per cpu read lock stats */ +} lstat_control_t; + +#endif /* defined(__KERNEL__) || defined(USER_MODE_TESTING) */ + +typedef struct { + short lstat_version; /* version of the data */ + short state; /* the current state is returned */ + int maxcpus; /* Number of cpus present */ + int next_free_dir_index; /* index of the next free directory entry */ + TIME_T first_started_time; /* when measurement enabled for first time */ + TIME_T started_time; /* time in secs since 1969 when stats last turned on */ + TIME_T ending_time; /* time in secs since 1969 when stats last turned off */ + uint32_t cycleval; /* cycles per second */ +#ifdef notyet + void *kernel_magic_addr; /* address of kernel_magic */ + void *kernel_end_addr; /* contents of kernel magic (points to "end") */ +#endif + int next_free_read_lock_index; /* index of next (global) read lock stats struct */ + uint64_t started_cycles64; /* cycles when measurement last started */ + uint64_t ending_cycles64; /* cycles when stats last turned off */ + uint64_t enabled_cycles64; /* total cycles with measurement enabled */ + int intervals; /* number of measurement intervals recorded */ + /* i.e. number of times we did lstat on;lstat off*/ + int dir_overflow; /* number of times we wanted more space in directory */ + int rwlock_overflow; /* # of times we wanted more space in read_locks_count */ + struct new_utsname uts; /* info about machine where stats are measured */ + /* -T option of lockstat allows data to be */ + /* moved to another machine. ................. */ +} lstat_user_request_t; + +#endif /* _LINUX_LOCKMETER_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/mm.h 901-mjb1.1/include/linux/mm.h --- 000-virgin/include/linux/mm.h Wed Aug 13 20:24:32 2003 +++ 901-mjb1.1/include/linux/mm.h Wed Aug 13 20:51:52 2003 @@ -179,6 +179,7 @@ struct page { struct pte_chain *chain;/* Reverse pte mapping pointer. * protected by PG_chainlock */ pte_addr_t direct; + int mapcount; } pte; unsigned long private; /* mapping-private opaque data */ @@ -610,6 +611,39 @@ extern struct page * follow_page(struct int write); extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); + +/* + * Given a struct page, determine which node's memory it is from. + * TODO: There's probably a more efficient way to do this... 
+ */ +static inline int page_to_nid(struct page *page) +{ + return pfn_to_nid(page_to_pfn(page)); +} + +#ifdef CONFIG_NUMA +static inline void zero_rss(struct mm_struct *mm) +{ + mm->rss = 0; + memset(mm->pernode_rss, 0, MAX_NUMNODES * sizeof(*mm->pernode_rss)); +} + +static inline void inc_rss(struct mm_struct *mm, struct page *page) +{ + mm->rss++; + mm->pernode_rss[page_to_nid(page)]++; +} + +static inline void dec_rss(struct mm_struct *mm, struct page *page) +{ + mm->rss--; + mm->pernode_rss[page_to_nid(page)]--; +} +#else /* !CONFIG_NUMA */ +#define zero_rss(mm) ((mm)->rss = 0) +#define inc_rss(mm, page) ((mm)->rss++) +#define dec_rss(mm, page) ((mm)->rss--) +#endif /* CONFIG_NUMA */ #ifndef CONFIG_DEBUG_PAGEALLOC static inline void diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/mmzone.h 901-mjb1.1/include/linux/mmzone.h --- 000-virgin/include/linux/mmzone.h Wed Aug 13 20:24:32 2003 +++ 901-mjb1.1/include/linux/mmzone.h Wed Aug 13 20:51:53 2003 @@ -300,6 +300,7 @@ extern struct pglist_data contig_page_da #define NODE_DATA(nid) (&contig_page_data) #define NODE_MEM_MAP(nid) mem_map #define MAX_NR_NODES 1 +#define pfn_to_nid(pfn) (0) #else /* CONFIG_DISCONTIGMEM */ #include @@ -362,6 +363,19 @@ static inline unsigned int num_online_me #define num_online_memblks() 1 #endif /* CONFIG_DISCONTIGMEM || CONFIG_NUMA */ + +static inline struct zonelist *get_node_zonelist(int nid, int gfp_mask) +{ + return NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK); +} + +#define get_zonelist(gfp_mask) get_node_zonelist(numa_node_id(), gfp_mask) + +/* Structure to keep track of memory segment (VMA) bindings */ +struct binding { + struct zonelist zonelist; +}; + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_MMZONE_H */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/module.h 901-mjb1.1/include/linux/module.h --- 000-virgin/include/linux/module.h Tue Aug 5 20:01:55 2003 +++ 901-mjb1.1/include/linux/module.h Wed Aug 13 20:51:56 2003 @@ -257,6 +257,11 @@ struct module /* The command line arguments (may be mangled). 
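As a rough illustration (not part of the patch), the new per-node RSS helpers are meant to be paired with the page_table_lock that already guards mm->rss; the wrapper function below is hypothetical:

static void example_account_mapped_page(struct mm_struct *mm, struct page *page)
{
	/* page_table_lock protects mm->rss and, with this patch, pernode_rss too */
	spin_lock(&mm->page_table_lock);
	inc_rss(mm, page);	/* mm->rss++ plus mm->pernode_rss[page_to_nid(page)]++ on NUMA */
	spin_unlock(&mm->page_table_lock);
}

On non-NUMA builds the same call collapses to a plain (mm)->rss++, per the #else macros above.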
People like keeping pointers to this stuff */ char *args; + +#ifdef CONFIG_GCOV_PROFILE + const char *ctors_start; /* Pointer to start of .ctors-section */ + const char *ctors_end; /* Pointer to end of .ctors-section */ +#endif }; /* FIXME: It'd be nice to isolate modules during init, too, so they diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/nfsd/syscall.h 901-mjb1.1/include/linux/nfsd/syscall.h --- 000-virgin/include/linux/nfsd/syscall.h Sat May 10 18:35:03 2003 +++ 901-mjb1.1/include/linux/nfsd/syscall.h Wed Aug 13 20:48:46 2003 @@ -59,7 +59,7 @@ struct nfsctl_client { struct nfsctl_export { char ex_client[NFSCLNT_IDMAX+1]; char ex_path[NFS_MAXPATHLEN+1]; - __kernel_old_dev_t ex_dev; + u16 ex_dev; __kernel_ino_t ex_ino; int ex_flags; __kernel_uid_t ex_anon_uid; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/page-flags.h 901-mjb1.1/include/linux/page-flags.h --- 000-virgin/include/linux/page-flags.h Tue Jun 24 21:29:26 2003 +++ 901-mjb1.1/include/linux/page-flags.h Wed Aug 13 20:29:24 2003 @@ -75,6 +75,7 @@ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ #define PG_compound 19 /* Part of a compound page */ +#define PG_anon 20 /* Anonymous page */ /* @@ -266,6 +267,10 @@ extern void get_full_page_state(struct p #define PageCompound(page) test_bit(PG_compound, &(page)->flags) #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) + +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) /* * The PageSwapCache predicate doesn't use a PG_flag at this time, diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/pagemap.h 901-mjb1.1/include/linux/pagemap.h --- 000-virgin/include/linux/pagemap.h Fri May 30 19:02:23 2003 +++ 901-mjb1.1/include/linux/pagemap.h Wed Aug 13 20:51:50 2003 @@ -27,14 +27,37 @@ #define page_cache_release(page) put_page(page) void release_pages(struct page **pages, int nr, int cold); +#ifndef CONFIG_NUMA + +static inline struct page *__page_cache_alloc(struct address_space *x, int gfp_mask) +{ + return alloc_pages(gfp_mask, 0); +} + +#else /* CONFIG_NUMA */ + +static inline struct page *__page_cache_alloc(struct address_space *x, int gfp_mask) +{ + struct zonelist *zonelist; + + if (!x->binding) + zonelist = get_zonelist(gfp_mask); + else + zonelist = &x->binding->zonelist; + + return __alloc_pages(gfp_mask, 0, zonelist); +} + +#endif /* !CONFIG_NUMA */ + static inline struct page *page_cache_alloc(struct address_space *x) { - return alloc_pages(x->gfp_mask, 0); + return __page_cache_alloc(x, x->gfp_mask); } static inline struct page *page_cache_alloc_cold(struct address_space *x) { - return alloc_pages(x->gfp_mask|__GFP_COLD, 0); + return __page_cache_alloc(x, x->gfp_mask|__GFP_COLD); } typedef int filler_t(void *, struct page *); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/pci.h 901-mjb1.1/include/linux/pci.h --- 000-virgin/include/linux/pci.h Wed Aug 13 20:24:33 2003 +++ 901-mjb1.1/include/linux/pci.h Wed Aug 13 20:47:25 2003 @@ -459,10 +459,10 @@ struct pci_bus { void *sysdata; /* hook for sys-specific extension */ struct proc_dir_entry *procdir; /* directory entry in /proc/bus/pci */ - unsigned char number; /* bus number */ - unsigned char primary; /* number of primary bridge */ - unsigned char secondary; /* number of secondary 
bridge */ - unsigned char subordinate; /* max number of subordinate buses */ + unsigned int number; /* bus number */ + unsigned int primary; /* number of primary bridge */ + unsigned int secondary; /* number of secondary bridge */ + unsigned int subordinate; /* max number of subordinate buses */ char name[48]; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/sched.h 901-mjb1.1/include/linux/sched.h --- 000-virgin/include/linux/sched.h Wed Aug 13 20:24:33 2003 +++ 901-mjb1.1/include/linux/sched.h Wed Aug 13 20:51:52 2003 @@ -70,7 +70,11 @@ struct exec_domain; * the EXP_n values would be 1981, 2034 and 2043 if still using only * 11 bit fractions. */ -extern unsigned long avenrun[]; /* Load averages */ +extern unsigned long avenrun[]; /* Load averages */ +extern unsigned long tasks_running[3]; /* Real load averages */ +DECLARE_PER_CPU(unsigned long[3],cpu_tasks_running); /* Real load averages per cpu */ + +extern unsigned long tasks_running[]; /* Real load averages */ #define FSHIFT 11 /* nr of bits of precision */ #define FIXED_1 (1< #include #include @@ -172,7 +180,13 @@ extern unsigned long cache_decay_ticks; #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); -asmlinkage void schedule(void); +#ifdef CONFIG_KGDB_THREAD + asmlinkage void do_schedule(void); + asmlinkage void kern_schedule(void); + asmlinkage void kern_do_schedule(struct pt_regs); +#else + asmlinkage void schedule(void); +#endif struct namespace; @@ -191,7 +205,7 @@ struct mm_struct { atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ int map_count; /* number of VMAs */ struct rw_semaphore mmap_sem; - spinlock_t page_table_lock; /* Protects task page tables and mm->rss */ + spinlock_t page_table_lock; /* Protects task page tables and RSS data */ struct list_head mmlist; /* List of all active mm's. These are globally strung * together off init_mm.mmlist, and are protected @@ -201,7 +215,11 @@ struct mm_struct { unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; - unsigned long rss, total_vm, locked_vm; + unsigned long total_vm, locked_vm; + unsigned long rss; +#ifdef CONFIG_NUMA + unsigned long pernode_rss[MAX_NUMNODES]; +#endif unsigned long def_flags; unsigned long cpu_vm_mask; unsigned long swap_address; @@ -321,6 +339,13 @@ struct k_itimer { struct sigqueue *sigq; /* signal queue entry. 
*/ }; +struct sched_info { + /* running averages */ + unsigned long response_time, inter_arrival_time, service_time; + + /* timestamps */ + unsigned long last_arrival, began_service; +}; struct io_context; /* See blkdev.h */ void exit_io_context(void); @@ -345,6 +370,8 @@ struct task_struct { unsigned long cpus_allowed; unsigned int time_slice, first_time_slice; + struct sched_info sched_info; + struct list_head tasks; struct list_head ptrace_children; struct list_head ptrace_list; @@ -497,7 +524,7 @@ static inline int set_cpus_allowed(task_ } #endif -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED extern void sched_balance_exec(void); extern void node_nr_running_init(void); #else @@ -532,6 +559,7 @@ extern struct task_struct init_task; extern struct mm_struct init_mm; +extern int find_next_pid(int pid); extern struct task_struct *find_task_by_pid(int pid); extern void set_special_pids(pid_t session, pid_t pgrp); extern void __set_special_pids(pid_t session, pid_t pgrp); @@ -715,6 +743,12 @@ static inline int thread_group_empty(tas (thread_group_leader(p) && !thread_group_empty(p)) extern void unhash_process(struct task_struct *p); + +#ifdef CONFIG_KGDB_THREAD +#define schedule() kern_schedule() +#else +#define user_schedule() schedule() +#endif /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). * Nests both inside and outside of read_lock(&tasklist_lock). diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/spinlock.h 901-mjb1.1/include/linux/spinlock.h --- 000-virgin/include/linux/spinlock.h Wed Jul 2 21:59:15 2003 +++ 901-mjb1.1/include/linux/spinlock.h Wed Aug 13 20:48:54 2003 @@ -184,6 +184,17 @@ typedef struct { #endif /* !SMP */ +#ifdef CONFIG_LOCKMETER +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock (spinlock_t *lock); +extern int _metered_spin_trylock(spinlock_t *lock); +extern void _metered_read_lock (rwlock_t *lock); +extern void _metered_read_unlock (rwlock_t *lock); +extern void _metered_write_lock (rwlock_t *lock); +extern void _metered_write_unlock (rwlock_t *lock); +extern int _metered_write_trylock(rwlock_t *lock); +#endif + /* * Define the various spin_lock and rw_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various @@ -388,6 +399,141 @@ do { \ #define spin_trylock_bh(lock) ({ local_bh_disable(); preempt_disable(); \ _raw_spin_trylock(lock) ? 1 : \ ({preempt_enable(); local_bh_enable(); 0;});}) + +#ifdef CONFIG_LOCKMETER +#undef spin_lock +#undef spin_trylock +#undef spin_unlock +#undef spin_lock_irqsave +#undef spin_lock_irq +#undef spin_lock_bh +#undef read_lock +#undef read_unlock +#undef write_lock +#undef write_unlock +#undef write_trylock +#undef spin_unlock_bh +#undef read_lock_irqsave +#undef read_lock_irq +#undef read_lock_bh +#undef read_unlock_bh +#undef write_lock_irqsave +#undef write_lock_irq +#undef write_lock_bh +#undef write_unlock_bh + +#define spin_lock(lock) \ +do { \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while(0) + +#define spin_trylock(lock) ({preempt_disable(); _metered_spin_trylock(lock) ? 
\ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable(); \ +} while (0) + +#define spin_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_unlock_bh(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + + +#define read_lock(lock) ({preempt_disable(); _metered_read_lock(lock);}) +#define read_unlock(lock) ({_metered_read_unlock(lock); preempt_enable();}) +#define write_lock(lock) ({preempt_disable(); _metered_write_lock(lock);}) +#define write_unlock(lock) ({_metered_write_unlock(lock); preempt_enable();}) +#define write_trylock(lock) ({preempt_disable();_metered_write_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock_no_resched(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable_no_resched(); \ +} while (0) + +#define read_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_unlock_bh(lock) \ +do { \ + _metered_read_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + +#define write_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_unlock_bh(lock) \ +do { \ + _metered_write_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + +#endif /* !CONFIG_LOCKMETER */ /* "lock on reference count zero" */ #ifndef ATOMIC_DEC_AND_LOCK diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/swap.h 901-mjb1.1/include/linux/swap.h --- 000-virgin/include/linux/swap.h Fri May 30 19:02:23 2003 +++ 901-mjb1.1/include/linux/swap.h Wed Aug 13 20:29:24 2003 @@ -186,6 +186,8 @@ struct pte_chain *FASTCALL(page_add_rmap void FASTCALL(page_remove_rmap(struct page *, pte_t *)); int FASTCALL(try_to_unmap(struct page *)); +int page_convert_anon(struct page *); + /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); #else diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/sysctl.h 901-mjb1.1/include/linux/sysctl.h --- 000-virgin/include/linux/sysctl.h Tue Aug 5 20:01:43 2003 +++ 901-mjb1.1/include/linux/sysctl.h Wed Aug 13 20:27:51 2003 @@ -60,7 +60,8 @@ enum CTL_DEV=7, /* Devices */ CTL_BUS=8, /* Busses */ CTL_ABI=9, /* Binary emulation */ - CTL_CPU=10 /* CPU stuff (speed scaling, etc) */ + CTL_CPU=10, /* CPU stuff (speed scaling, etc) */ + CTL_SCHED=11, /* scheduler tunables */ }; /* CTL_BUS names: */ @@ -155,6 +156,21 @@ enum VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ }; +/* Tunable 
scheduler parameters in /proc/sys/sched/ */ +enum { + SCHED_MIN_TIMESLICE=1, /* minimum process timeslice */ + SCHED_MAX_TIMESLICE=2, /* maximum process timeslice */ + SCHED_CHILD_PENALTY=3, /* penalty on fork to child */ + SCHED_PARENT_PENALTY=4, /* penalty on fork to parent */ + SCHED_EXIT_WEIGHT=5, /* penalty to parent of CPU hog child */ + SCHED_PRIO_BONUS_RATIO=6, /* percent of max prio given as bonus */ + SCHED_INTERACTIVE_DELTA=7, /* delta used to scale interactivity */ + SCHED_MAX_SLEEP_AVG=8, /* maximum sleep avg attainable */ + SCHED_STARVATION_LIMIT=9, /* no re-active if expired is starved */ + SCHED_NODE_THRESHOLD=10, /* NUMA node rebalance threshold */ + SCHED_IDLE_NODE_REBALANCE_RATIO=11, /* how often to global balance */ + SCHED_BUSY_NODE_REBALANCE_RATIO=12, /* how often to global balance */ +}; /* CTL_NET names: */ enum diff -urpN -X /home/fletch/.diff.exclude 000-virgin/include/linux/timex.h 901-mjb1.1/include/linux/timex.h --- 000-virgin/include/linux/timex.h Tue Jun 24 21:29:26 2003 +++ 901-mjb1.1/include/linux/timex.h Wed Aug 13 20:27:41 2003 @@ -78,7 +78,7 @@ #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 #else -# error You lose. +# error Please use a HZ value which is between 12 and 1536 #endif /* diff -urpN -X /home/fletch/.diff.exclude 000-virgin/init/main.c 901-mjb1.1/init/main.c --- 000-virgin/init/main.c Tue Aug 5 20:01:56 2003 +++ 901-mjb1.1/init/main.c Wed Aug 13 20:51:56 2003 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -55,6 +56,10 @@ #include #endif +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + /* * Versions of gcc older than that listed below may actually compile * and link okay, but the end product can have subtle run time bugs. @@ -115,6 +120,10 @@ char *execute_command; /* Setup configured maximum number of CPUs to activate */ static unsigned int max_cpus = NR_CPUS; +#if defined(CONFIG_GCOV_PROFILE) && (defined(CONFIG_PPC32) || defined(CONFIG_PPC64)) +void __bb_fork_func (void) { } +#endif + /* * Setup routine for controlling SMP activation * @@ -377,6 +386,7 @@ static void rest_init(void) /* * Activate the first processor. */ +int kgdb_not_ready_yet; asmlinkage void __init start_kernel(void) { @@ -389,6 +399,8 @@ asmlinkage void __init start_kernel(void */ lock_kernel(); printk(linux_banner); + setup_early_printk(); + setup_arch(&command_line); setup_per_zone_pages_min(); setup_per_cpu_areas(); @@ -461,6 +473,14 @@ asmlinkage void __init start_kernel(void * make syscalls (and thus be locked). 
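A minimal user-space sketch of poking the new SCHED_* sysctls, assuming they surface as /proc/sys/sched/<name> with the lower-case names used in the documentation hunk (min_timeslice shown here; max_timeslice, child_penalty and the rest follow the same pattern):

#include <stdio.h>

int main(void)
{
	unsigned long val;
	FILE *f = fopen("/proc/sys/sched/min_timeslice", "r");

	if (!f || fscanf(f, "%lu", &val) != 1) {
		perror("min_timeslice");
		return 1;
	}
	fclose(f);
	printf("min_timeslice = %lu\n", val);

	/* needs root: write a new value back through the same sysctl file */
	f = fopen("/proc/sys/sched/min_timeslice", "w");
	if (f) {
		fprintf(f, "%lu\n", val + 1);
		fclose(f);
	}
	return 0;
}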
*/ init_idle(current, smp_processor_id()); + +#ifdef CONFIG_X86_REMOTE_DEBUG + if (gdb_enter) { + kgdb_not_ready_yet = 1; + gdb_hook(); /* right at boot time */ + kgdb_not_ready_yet = 0; + } +#endif /* Do the rest non-__init'ed, we're now alive */ rest_init(); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/Makefile 901-mjb1.1/kernel/Makefile --- 000-virgin/kernel/Makefile Wed Aug 13 20:24:33 2003 +++ 901-mjb1.1/kernel/Makefile Wed Aug 13 21:05:43 2003 @@ -8,9 +8,16 @@ obj-y = sched.o fork.o exec_domain.o signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o intermodule.o extable.o params.o posix-timers.o +ifdef CONFIG_GCOV_PROFILE +obj-y += gcov.o +export-objs += gcov.o +CFLAGS_gcov.o := -DGCOV_PATH='"$(TOPDIR)"' +endif + obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o +obj-$(CONFIG_LOCKMETER) += lockmeter.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += ksyms.o module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o @@ -19,6 +26,7 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o +obj-$(CONFIG_X86_EARLY_PRINTK) += early_printk.o # files to be removed upon make clean clean-files := ikconfig.h diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/early_printk.c 901-mjb1.1/kernel/early_printk.c --- 000-virgin/kernel/early_printk.c Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/kernel/early_printk.c Wed Aug 13 20:25:53 2003 @@ -0,0 +1,218 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +static int current_ypos = 1, current_xpos = 0; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= MAX_YPOS) { + /* scroll 1 line up */ + for(k = 1, j = 0; k < MAX_YPOS; k++, j++) { + for(i = 0; i < MAX_XPOS; i++) { + writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), + VGABASE + 2*(MAX_XPOS*j + i)); + } + } + for(i = 0; i < MAX_XPOS; i++) { + writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); + } + current_ypos = MAX_YPOS-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(MAX_XPOS*current_ypos + current_xpos++)); + if (current_xpos >= MAX_XPOS) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions losely based on a similar package from Klaus P. Gerlicher */ + +int early_serial_base; /* ttyS0 */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + rep_nop(); + outb(ch, early_serial_base + TXR); + return timeout ? 
0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + early_serial_putc(*s); + if (*s == '\n') + early_serial_putc('\r'); + s++; + } +} + +static __init void early_serial_init(char *opt) +{ + unsigned char c; + unsigned divisor, baud = DEFAULT_BAUD; + static int bases[] = SERIAL_BASES; + char *s, *e; + + early_serial_base = bases[0]; + + if (*opt == ',') + ++opt; + + s = strsep(&opt, ","); + if (s != NULL) { + unsigned port; + if (!strncmp(s,"0x",2)) + early_serial_base = simple_strtoul(s, &e, 16); + else { + if (!strncmp(s,"ttyS",4)) + s+=4; + port = simple_strtoul(s, &e, 10); + if (port > (SERIAL_BASES_LEN-1) || s == e) + port = 0; + early_serial_base = bases[port]; + } + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + s = strsep(&opt, ","); + if (s != NULL) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Direct interface for emergencies */ +struct console *early_console = &early_vga_console; +static int early_console_initialized = 0; + +void early_printk(const char *fmt, ...) +{ + char buf[512]; + int n; + va_list ap; + va_start(ap,fmt); + n = vsnprintf(buf,512,fmt,ap); + early_console->write(early_console,buf,n); + va_end(ap); +} + +static int keep_early; + +int __init setup_early_printk(void) +{ + char *space, *s; + char buf[256]; + char cmd[COMMAND_LINE_SIZE]; + char *opt; + + /* Get our own copy of the cmd line */ + memcpy(cmd, COMMAND_LINE, COMMAND_LINE_SIZE); + cmd[COMMAND_LINE_SIZE-1] = '\0'; + opt = cmd; + + s = strstr(opt, "earlyprintk="); + if (s == NULL) + return -1; + opt = s+12; + + if (early_console_initialized) + return -1; + + strncpy(buf,opt,256); + buf[255] = 0; + space = strchr(buf, ' '); + if (space) + *space = 0; + + if (strstr(buf,"keep")) + keep_early = 1; + + if (!strncmp(buf, "serial", 6)) { + early_serial_init(buf + 6); + early_console = &early_serial_console; + } else if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf); + early_console = &early_serial_console; + } else if (!strncmp(buf, "vga", 3)) { + early_console = &early_vga_console; + } else { + early_console = NULL; + return -1; + } + early_console_initialized = 1; + register_console(early_console); + printk("early printk console registered\n"); + return 0; +} + +void __init disable_early_printk(void) +{ + if (!early_console_initialized || !early_console) + return; + if (!keep_early) { + printk("disabling early console...\n"); + unregister_console(early_console); + early_console_initialized = 0; + } else { + printk("keeping early console.\n"); + } +} + +/* syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. 
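Putting the syntax above together, typical boot-loader command lines would look like:

	earlyprintk=vga
	earlyprintk=serial,ttyS0,115200,keep

The first writes directly to VGA text memory until the real console takes over; the second keeps the early serial console registered even after that (only ttyS0 and ttyS1 are handled, as noted above).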
*/ +__setup("earlyprintk=", setup_early_printk); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/fork.c 901-mjb1.1/kernel/fork.c --- 000-virgin/kernel/fork.c Tue Aug 5 20:01:56 2003 +++ 901-mjb1.1/kernel/fork.c Wed Aug 13 20:51:52 2003 @@ -180,7 +180,10 @@ void __init fork_init(unsigned long memp * value: the thread structures can take up at most half * of memory. */ - max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; + if (THREAD_SIZE >= PAGE_SIZE) + max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; + else + max_threads = (mempages * (PAGE_SIZE/THREAD_SIZE)) / 8; /* * we need to allow at least 20 threads to boot a system */ @@ -232,7 +235,7 @@ static inline int dup_mmap(struct mm_str mm->mmap_cache = NULL; mm->free_area_cache = TASK_UNMAPPED_BASE; mm->map_count = 0; - mm->rss = 0; + zero_rss(mm); mm->cpu_vm_mask = 0; pprev = &mm->mmap; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/gcov.c 901-mjb1.1/kernel/gcov.c --- 000-virgin/kernel/gcov.c Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/kernel/gcov.c Wed Aug 13 20:51:56 2003 @@ -0,0 +1,158 @@ +/* + * Coverage support under Linux + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (c) International Business Machines Corp., 2002 + * + * Author: Hubertus Franke + * Rajan Ravindran + * + * Modified by + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct bb +{ + long zero_word; + const char *filename; + long *counts; + long ncounts; + struct bb *next; + const unsigned long *addresses; + + /* Older GCC's did not emit these fields. */ + long nwords; + const char **functions; + const long *line_nums; + const char **filenames; + char *flags; +}; + +struct bb *bb_head; +struct module *bb_context_address; +void (*gcov_callback)(int cmd, struct bb *bbptr) = NULL; + +#ifdef GCOV_PATH +char *gcov_kernelpath = GCOV_PATH; +#else +char *gcov_kernelpath = __FILE__; +#endif + + +void +__bb_init_func (struct bb *blocks) +{ + if (blocks->zero_word) + return; + + /* Set up linked list. */ + blocks->zero_word = 1; + + /* Store the address of the module of which this object-file is a part + of (set in do_global_ctors). */ + blocks->addresses = (unsigned long *) bb_context_address; + + blocks->next = bb_head; + bb_head = blocks; + + if (gcov_callback && bb_context_address) + (*gcov_callback)(1,blocks); +} + +/* Call constructors for all kernel objects and dynamic modules. This function + * is called both during module initialization and when the gcov kernel + * module is insmod'ed. The list of constructors is compiled into the + * kernel at &__CTOR_LIST__ to &__DTOR_LIST__ (labels are defined in + * head.S). In the case of a dynamic module the list is located at + * ctors_start to ctors_end. 
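The same constructor convention can be seen in a plain user-space program; this is only an analogy to illustrate the mechanism, not code from the patch:

#include <stdio.h>

static int coverage_registered;

/* gcc collects a pointer to every __attribute__((constructor)) function in a
 * constructor table; walking that table and calling through each entry is
 * what do_global_ctors() does for the kernel image and for modules. */
__attribute__((constructor))
static void register_coverage_object(void)
{
	/* in the kernel, each profiled object registers its struct bb at this
	 * point via __bb_init_func() */
	coverage_registered = 1;
}

int main(void)
{
	printf("constructor %s before main\n",
	       coverage_registered ? "ran" : "did not run");
	return 0;
}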
+ * + * The constructors in turn call __bb_init_func, reporting the respective + * struct bb for each object file. + */ + +void +do_global_ctors (char *ctors_start, char *ctors_end, struct module *addr, int mod_flag) +{ + extern char __CTOR_LIST__; + extern char __DTOR_LIST__; + typedef void (*func_ptr)(void) ; + func_ptr *constructor_ptr=NULL; + + if (!mod_flag) { + /* Set start and end ptr from global kernel constructor list. */ + ctors_start = &__CTOR_LIST__; + ctors_end = &__DTOR_LIST__; + bb_context_address = NULL; + } else { + /* Set context to current module address. */ + bb_context_address = addr; + } + + if (!ctors_start) + return; + + /* Call all constructor functions until either the end of the + list is reached or until a NULL is encountered. */ + for (constructor_ptr = (func_ptr *) ctors_start; + (constructor_ptr != (func_ptr *) ctors_end) && + (*constructor_ptr != NULL); + constructor_ptr++) { + (*constructor_ptr) (); + } +} + + +/* When a module is unloaded, this function is called to remove + * the respective bb entries from our list. context specifies + * the address of the module that is unloaded. */ + +void +remove_bb_link (struct module *context) +{ + struct bb *bbptr; + struct bb *prev = NULL; + + /* search for all the module's bbptrs */ + for (bbptr = bb_head; bbptr ; bbptr = bbptr->next) { + if (bbptr->addresses == (unsigned long *) context) { + if (gcov_callback) + (*gcov_callback)(0,bbptr); + if (prev == NULL) + bb_head = bbptr->next; + else + prev->next = bbptr->next; + } + else + prev = bbptr; + } +} + +EXPORT_SYMBOL(bb_head); +EXPORT_SYMBOL(__bb_init_func); +EXPORT_SYMBOL(do_global_ctors); +EXPORT_SYMBOL(gcov_kernelpath); +EXPORT_SYMBOL(gcov_callback); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/ksyms.c 901-mjb1.1/kernel/ksyms.c --- 000-virgin/kernel/ksyms.c Wed Aug 13 20:24:33 2003 +++ 901-mjb1.1/kernel/ksyms.c Wed Aug 13 20:29:41 2003 @@ -462,7 +462,12 @@ EXPORT_SYMBOL(sleep_on); EXPORT_SYMBOL(sleep_on_timeout); EXPORT_SYMBOL(interruptible_sleep_on); EXPORT_SYMBOL(interruptible_sleep_on_timeout); +#ifdef CONFIG_KGDB_THREAD +EXPORT_SYMBOL(kern_schedule); +EXPORT_SYMBOL(do_schedule); +#else EXPORT_SYMBOL(schedule); +#endif #ifdef CONFIG_PREEMPT EXPORT_SYMBOL(preempt_schedule); #endif @@ -610,6 +615,16 @@ EXPORT_SYMBOL(__per_cpu_offset); EXPORT_SYMBOL(set_fs_pwd); EXPORT_SYMBOL(set_fs_root); + +#if defined(CONFIG_LOCKMETER) +EXPORT_SYMBOL(_metered_spin_lock); +EXPORT_SYMBOL(_metered_spin_unlock); +EXPORT_SYMBOL(_metered_spin_trylock); +EXPORT_SYMBOL(_metered_read_lock); +EXPORT_SYMBOL(_metered_read_unlock); +EXPORT_SYMBOL(_metered_write_lock); +EXPORT_SYMBOL(_metered_write_unlock); +#endif /* debug */ EXPORT_SYMBOL(dump_stack); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/lockmeter.c 901-mjb1.1/kernel/lockmeter.c --- 000-virgin/kernel/lockmeter.c Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/kernel/lockmeter.c Wed Aug 13 20:29:36 2003 @@ -0,0 +1,1088 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.c by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks + * Added support for hold time statistics for read and write + * locks. 
+ */ + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#else +#define __SMP__ +#include +#include +#include +#include "bitops.h" +#include "user_scaffold.h" +#include +#include +#include "newlockmeter.h" +#endif + +#ifdef __KERNEL__ +#define ASSERT(cond) +#define bzero(loc,size) memset(loc,0,size) +#endif + +/*<---------------------------------------------------*/ +/* lockmeter.c */ +/*>---------------------------------------------------*/ + +#ifdef __KERNEL__ +static lstat_control_t lstat_control __cacheline_aligned = {LSTAT_OFF, SPIN_LOCK_UNLOCKED, SPIN_LOCK_UNLOCKED, 19*0, NR_CPUS*0, 0, NR_CPUS*0}; +#else +lstat_control_t lstat_control = {LSTAT_OFF, SPIN_LOCK_UNLOCKED, SPIN_LOCK_UNLOCKED, 19*0, NR_CPUS*0, 0, NR_CPUS*0}; +#endif + +int smp_num_cpus=NR_CPUS; + +#undef BUG +#define BUG() + +static ushort lstat_make_dir_entry(void *, void *); + +/* + * lstat_lookup + * + * Given a RA, locate the directory entry for the lock. + */ +static ushort +lstat_lookup( + void *lock_ptr, + void *caller_ra) +{ + ushort index; + lstat_directory_entry_t *dirp; + + dirp = lstat_control.dir; + + index = lstat_control.hashtab[DIRHASH(caller_ra)]; + while (dirp[index].caller_ra != caller_ra) { + if (index == 0) { + return(lstat_make_dir_entry(lock_ptr, caller_ra)); + } + index = dirp[index].next_stat_index; + } + + if (dirp[index].lock_ptr != NULL && + dirp[index].lock_ptr != lock_ptr) { + dirp[index].lock_ptr = NULL; + } + + return(index); +} + + +/* + * lstat_make_dir_entry + * Called to add a new lock to the lock directory. + */ +static ushort +lstat_make_dir_entry( + void *lock_ptr, + void *caller_ra) +{ + lstat_directory_entry_t *dirp; + ushort index, hindex; + unsigned long flags; + + /* lock the table without recursively reentering this metering code */ + do { local_irq_save(flags); + _raw_spin_lock(&lstat_control.directory_lock); } while(0); + + hindex = DIRHASH(caller_ra); + index = lstat_control.hashtab[hindex]; + dirp = lstat_control.dir; + while (index && dirp[index].caller_ra != caller_ra) + index = dirp[index].next_stat_index; + + if (index == 0) { + if(lstat_control.next_free_dir_index < LSTAT_MAX_STAT_INDEX) { + index = lstat_control.next_free_dir_index++; + lstat_control.dir[index].caller_ra = caller_ra; + lstat_control.dir[index].lock_ptr = lock_ptr; + lstat_control.dir[index].next_stat_index = lstat_control.hashtab[hindex]; + lstat_control.hashtab[hindex] = index; + } else { + lstat_control.dir_overflow++; + } + } + + do { _raw_spin_unlock(&lstat_control.directory_lock); + local_irq_restore(flags);} while(0); + return(index); +} + +int +lstat_update ( + void *lock_ptr, + void *caller_ra, + int action) +{ + int index; + int cpu; + + ASSERT(action < LSTAT_ACT_MAX_VALUES); + + if (lstat_control.state == LSTAT_OFF) { + return(0); + } + + index = lstat_lookup(lock_ptr, caller_ra); + cpu = THIS_CPU_NUMBER; + (*lstat_control.counts[cpu])[index].count[action]++; + (*lstat_control.counts[cpu])[index].acquire_time = get_cycles(); + + return(index); +} + +int +lstat_update_time ( + void *lock_ptr, + void *caller_ra, + int action, + uint32_t ticks) +{ + ushort index; + int cpu; + + ASSERT(action < LSTAT_ACT_MAX_VALUES); + + if (lstat_control.state == LSTAT_OFF) { + return(0); + } + + index = lstat_lookup(lock_ptr, caller_ra); + cpu = THIS_CPU_NUMBER; + (*lstat_control.counts[cpu])[index].count[action]++; + (*lstat_control.counts[cpu])[index].cum_wait_ticks += (uint64_t)ticks; + if 
((*lstat_control.counts[cpu])[index].max_wait_ticks < ticks) + (*lstat_control.counts[cpu])[index].max_wait_ticks = ticks; + + (*lstat_control.counts[cpu])[index].acquire_time = get_cycles(); + + return(index); +} + +void _metered_spin_lock(spinlock_t *lock_ptr) +{ + if (lstat_control.state == LSTAT_OFF) { + _raw_spin_lock(lock_ptr); /* do the real lock */ + PUT_INDEX(lock_ptr,0); /* clean index in case lockmetering */ + /* gets turned on before unlock */ + } else { + void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); + int index; + + if (_raw_spin_trylock(lock_ptr)) { + index = lstat_update(lock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + } else { + uint32_t start_cycles = get_cycles(); + _raw_spin_lock(lock_ptr); /* do the real lock */ + index = lstat_update_time(lock_ptr, this_pc, LSTAT_ACT_SPIN, + get_cycles() - start_cycles); + } + /* save the index in the lock itself for use in spin unlock */ + PUT_INDEX(lock_ptr,index); + } +} + +int _metered_spin_trylock(spinlock_t *lock_ptr) +{ + if (lstat_control.state == LSTAT_OFF) { + return _raw_spin_trylock(lock_ptr); + } else { + int retval; + void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); + + if ((retval = _raw_spin_trylock(lock_ptr))) { + int index = lstat_update(lock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* save the index in the lock itself for use in spin unlock */ + PUT_INDEX(lock_ptr,index); + } else { + lstat_update(lock_ptr, this_pc, LSTAT_ACT_REJECT); + } + + return retval; + } +} + +void _metered_spin_unlock(spinlock_t *lock_ptr) +{ + int index=-1; + + if (lstat_control.state != LSTAT_OFF) { + index = GET_INDEX(lock_ptr); + /* + * If statistics were turned off when we set the lock, + * then the index can be zero. If that is the case, + * then collect no stats on this call. + */ + if (index > 0) { + uint32_t hold_time; + int cpu = THIS_CPU_NUMBER; + hold_time = get_cycles() - (*lstat_control.counts[cpu])[index].acquire_time; + (*lstat_control.counts[cpu])[index].cum_hold_ticks += (uint64_t)hold_time; + if ((*lstat_control.counts[cpu])[index].max_hold_ticks < hold_time) + (*lstat_control.counts[cpu])[index].max_hold_ticks = hold_time; + } + } + + /* make sure we don't have a stale index value saved */ + PUT_INDEX(lock_ptr,0); + _raw_spin_unlock(lock_ptr); /* do the real unlock */ +} + +/* + * allocate the next global read lock structure and store its index + * in the rwlock at "lock_ptr". + */ +uint32_t alloc_rwlock_struct(rwlock_t *rwlock_ptr) +{ + int index; + int flags; + int cpu=THIS_CPU_NUMBER; + + /* If we've already overflowed, then do a quick exit */ + if (lstat_control.next_free_read_lock_index > LSTAT_MAX_READ_LOCK_INDEX) { + lstat_control.rwlock_overflow++; + return(0); + } + + do { local_irq_save(flags); + _raw_spin_lock(&lstat_control.directory_lock); } while(0); + + /* It is possible this changed while we were waiting for the directory_lock */ + if (lstat_control.state == LSTAT_OFF) { + index=0; + goto unlock; + } + + /* It is possible someone else got here first and set the index */ + if ((index=GET_RWINDEX(rwlock_ptr)) == 0) { + + /* we can't turn on read stats for this lock while there are readers */ + /* (this would mess up the running hold time sum at unlock time) */ + if (RWLOCK_READERS(rwlock_ptr) != 0) { + index=0; + goto unlock; + } + + /* if stats are turned on after being off, we may need to return an old */ + /* index from when the statistics were on last time. ................... 
*/ + for(index=1;index= LSTAT_MAX_READ_LOCK_INDEX) { + lstat_control.rwlock_overflow++; + index = 0; + goto unlock; + } + index = lstat_control.next_free_read_lock_index++; + + /* initialize the global read stats data structure for each cpu */ + for(cpu=0; cpu < smp_num_cpus; cpu++) { + (*lstat_control.read_lock_counts[cpu])[index].lock_ptr = rwlock_ptr; + } +put_index_and_unlock: + /* store the index for the read lock structure into the lock */ + PUT_RWINDEX(rwlock_ptr,index); + } + +unlock: + do { _raw_spin_unlock(&lstat_control.directory_lock); + local_irq_restore(flags);} while(0); + + return(index); +} + +void +_metered_read_lock(rwlock_t *rwlock_ptr) +{ + void *this_pc; + uint32_t start_cycles; + int index; + int cpu; + int flags; + int readers_before, readers_after; + uint64_t cycles64; + + if (lstat_control.state == LSTAT_OFF) { + _raw_read_lock(rwlock_ptr); + /* clean index in case lockmetering turns on before an unlock */ + PUT_RWINDEX(rwlock_ptr, 0); + return; + } + + this_pc = LSTAT_RA(LSTAT_RA_READ); + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* allocate the global stats entry for this lock, if needed */ + if (index==0) { + index = alloc_rwlock_struct(rwlock_ptr); + } + + readers_before = RWLOCK_READERS(rwlock_ptr); + if (_raw_read_trylock(rwlock_ptr)) { + /* + * We have decremented the lock to count a new reader, + * and have confirmed that no writer has it locked. + */ + /* update statistics if enabled */ + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do { local_irq_save(flags); } while(0); +#endif + lstat_update((void *)rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* preserve value of TSC so cum_hold_ticks and start_busy use same value */ + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks -= cycles64; + + /* record time and cpu of start of busy period */ + /* this is not perfect (some race conditions are possible) */ + if (readers_before==0) { + (*lstat_control.read_lock_counts[cpu])[index].start_busy = cycles64; + PUT_RW_CPU(rwlock_ptr, cpu); + } + readers_after=RWLOCK_READERS(rwlock_ptr); + if (readers_after > (*lstat_control.read_lock_counts[cpu])[index].max_readers) + (*lstat_control.read_lock_counts[cpu])[index].max_readers = readers_after; +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t*)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } + + return; + } + /* If we get here, then we could not quickly grab the read lock */ + + start_cycles = get_cycles(); /* start counting the wait time */ + + /* Now spin until read_lock is successful */ + _raw_read_lock(rwlock_ptr); + + lstat_update_time((void *)rwlock_ptr, this_pc, LSTAT_ACT_SPIN, + get_cycles() - start_cycles); + + /* update statistics if they are enabled for this lock */ + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do { local_irq_save(flags); } while(0); +#endif + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks -= cycles64; + + /* this is not perfect (some race conditions are possible) */ + if (readers_before==0) { + (*lstat_control.read_lock_counts[cpu])[index].start_busy = cycles64; + PUT_RW_CPU(rwlock_ptr, cpu); + } + readers_after=RWLOCK_READERS(rwlock_ptr); + if (readers_after > (*lstat_control.read_lock_counts[cpu])[index].max_readers) + 
(*lstat_control.read_lock_counts[cpu])[index].max_readers = readers_after; + +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } +} + +void _metered_read_unlock(rwlock_t *rwlock_ptr) +{ + int index; + int cpu; + int flags; + uint64_t busy_length; + uint64_t cycles64; + + if (lstat_control.state == LSTAT_OFF) { + _raw_read_unlock(rwlock_ptr); + return; + } + + index = GET_RWINDEX(rwlock_ptr); + cpu = THIS_CPU_NUMBER; + + if (index>0) { +#ifndef __KERNEL__ + _raw_spin_lock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + /* updates below are non-atomic */ + do { local_irq_save(flags); } while(0); +#endif + /* preserve value of TSC so cum_hold_ticks and busy_ticks are consistent.. */ + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks += cycles64; + (*lstat_control.read_lock_counts[cpu])[index].read_lock_count++; + + /* once again, this is not perfect (some race conditions are possible) */ + if (RWLOCK_READERS(rwlock_ptr) == 1) { + int cpu1 = GET_RW_CPU(rwlock_ptr); + uint64_t last_start_busy = (*lstat_control.read_lock_counts[cpu1])[index].start_busy; + (*lstat_control.read_lock_counts[cpu])[index].busy_periods++; + if (cycles64 > last_start_busy) { + busy_length = cycles64 - last_start_busy; + (*lstat_control.read_lock_counts[cpu])[index].busy_ticks += busy_length; + if (busy_length > (*lstat_control.read_lock_counts[cpu])[index].max_busy) + (*lstat_control.read_lock_counts[cpu])[index].max_busy = busy_length; + } + } +#ifndef __KERNEL__ + _raw_spin_unlock((spinlock_t *)&(*lstat_control.read_lock_counts[cpu])[index].entry_lock); +#else + do {local_irq_restore(flags);} while(0); +#endif + } + + /* unlock the lock */ + _raw_read_unlock(rwlock_ptr); +} + +void _metered_write_lock(rwlock_t *rwlock_ptr) +{ + uint32_t start_cycles; + void *this_pc; + uint32_t spin_ticks = 0; /* in anticipation of a potential wait */ + int index; + int write_index = 0; + int cpu; + enum {writer_writer_conflict, writer_reader_conflict} why_wait = writer_writer_conflict; + + if (lstat_control.state == LSTAT_OFF) { + _raw_write_lock(rwlock_ptr); + /* clean index in case lockmetering turns on before an unlock */ + PUT_RWINDEX(rwlock_ptr, 0); + return; + } + + this_pc = LSTAT_RA(LSTAT_RA_WRITE); + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* allocate the global stats entry for this lock, if needed */ + if (index == 0) { + index = alloc_rwlock_struct(rwlock_ptr); + } + + if (_raw_write_trylock(rwlock_ptr)) { + /* We acquired the lock on the first try */ + write_index = lstat_update((void *)rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + /* save the write_index for use in unlock if stats enabled */ + if (index > 0) + (*lstat_control.read_lock_counts[cpu])[index].write_index = write_index; + return; + } + + /* If we get here, then we could not quickly grab the write lock */ + start_cycles = get_cycles(); /* start counting the wait time */ + + why_wait = RWLOCK_READERS(rwlock_ptr) ? 
writer_reader_conflict : writer_writer_conflict; + + /* Now set the lock and wait for conflicts to disappear */ + _raw_write_lock(rwlock_ptr); + + spin_ticks = get_cycles() - start_cycles; + + /* update stats -- if enabled */ + if (index > 0) + if (spin_ticks) { + if (why_wait == writer_reader_conflict) { + /* waited due to a reader holding the lock */ + write_index = lstat_update_time((void *)rwlock_ptr, this_pc, + LSTAT_ACT_SPIN, spin_ticks); + } else { + /* waited due to another writer holding the lock */ + write_index = lstat_update_time((void *)rwlock_ptr, this_pc, + LSTAT_ACT_WW_SPIN, spin_ticks); + (*lstat_control.counts[cpu])[write_index].cum_wait_ww_ticks += spin_ticks; + if (spin_ticks > + (*lstat_control.counts[cpu])[write_index].max_wait_ww_ticks) { + (*lstat_control.counts[cpu])[write_index].max_wait_ww_ticks = spin_ticks; + } + } + + /* save the directory index for use on write_unlock */ + (*lstat_control.read_lock_counts[cpu])[index].write_index = write_index; + } + +} + +void +_metered_write_unlock(rwlock_t *rwlock_ptr) +{ + int index; + int cpu; + int write_index; + uint32_t hold_time; + + if (lstat_control.state == LSTAT_OFF) { + _raw_write_unlock(rwlock_ptr); + return; + } + + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* update statistics if stats enabled for this lock */ + if (index>0) { + write_index = (*lstat_control.read_lock_counts[cpu])[index].write_index; + + hold_time = get_cycles() - (*lstat_control.counts[cpu])[write_index].acquire_time; + (*lstat_control.counts[cpu])[write_index].cum_hold_ticks += (uint64_t)hold_time; + if ((*lstat_control.counts[cpu])[write_index].max_hold_ticks < hold_time) + (*lstat_control.counts[cpu])[write_index].max_hold_ticks = hold_time; + } + _raw_write_unlock(rwlock_ptr); +} + +int _metered_write_trylock(rwlock_t *rwlock_ptr) +{ + int retval; + void *this_pc = LSTAT_RA(LSTAT_RA_WRITE); + + if ((retval = _raw_write_trylock(rwlock_ptr))) { + lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + } else { + lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_REJECT); + } + + return retval; +} + +#ifdef __KERNEL__ +static void +init_control_space(void) +{ + /* Set all control space pointers to null and indices to "empty" */ + int cpu; + + /* + * Access CPU_CYCLE_FREQUENCY at the outset, which in some + * architectures may trigger a runtime calculation that uses a + * spinlock. Let's do this before lockmetering is turned on. + */ + if (CPU_CYCLE_FREQUENCY == 0) + BUG(); + + lstat_control.hashtab = NULL; + lstat_control.dir = NULL; + for (cpu=0; cpu max_len) + return actual_ret_bcount; + + copy_to_user(buffer, (void *)&req, next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + return actual_ret_bcount; + } else { + /* measurement is off but valid data present */ + /* fetch time info from lstat_control */ + req.ending_time = lstat_control.ending_time; + req.ending_cycles64 = lstat_control.ending_cycles64; + req.enabled_cycles64 = lstat_control.enabled_cycles64; + } + } else { + /* this must be a read while data active--use current time, etc */ + do_gettimeofday(&tv); + req.ending_time = tv.tv_sec; + req.ending_cycles64 = get_cycles64(); + req.enabled_cycles64 = req.ending_cycles64-req.started_cycles64 + + lstat_control.enabled_cycles64; + } + + next_ret_bcount = sizeof(lstat_user_request_t); + if ((actual_ret_bcount + next_ret_bcount) > max_len) + return actual_ret_bcount; + + copy_to_user(buffer, (void *)&req, next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + + if (!lstat_control.counts[0]) /* not initialized? 
*/ + return actual_ret_bcount; + + next_ret_bcount = sizeof(lstat_cpu_counts_t); + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if ((actual_ret_bcount + next_ret_bcount) > max_len) + return actual_ret_bcount; /* leave early */ + copy_to_user(buffer + actual_ret_bcount, lstat_control.counts[cpu], + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + } + + next_ret_bcount = LSTAT_MAX_STAT_INDEX * sizeof(lstat_directory_entry_t); + if ( ((actual_ret_bcount + next_ret_bcount) > max_len) + || !lstat_control.dir ) + return actual_ret_bcount; /* leave early */ + + copy_to_user(buffer + actual_ret_bcount, lstat_control.dir, + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + + next_ret_bcount = sizeof(lstat_read_lock_cpu_counts_t); + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if (actual_ret_bcount + next_ret_bcount > max_len) + return actual_ret_bcount; + copy_to_user(buffer + actual_ret_bcount, lstat_control.read_lock_counts[cpu], + next_ret_bcount); + actual_ret_bcount += next_ret_bcount; + } + + return actual_ret_bcount; +} + +/* + * Writing to the /proc lockmeter node enables or disables metering. + * based upon the first byte of the "written" data. + * The following values are defined: + * LSTAT_ON: 1st call: allocates storage, intializes and turns on measurement + * subsequent calls just turn on measurement + * LSTAT_OFF: turns off measurement + * LSTAT_RESET: resets statistics + * LSTAT_RELEASE: releases statistics storage + * + * This allows one to accumulate statistics over several lockstat runs: + * + * lockstat on + * lockstat off + * ...repeat above as desired... + * lockstat get + * ...now start a new set of measurements... + * lockstat reset + * lockstat on + * ... + * + */ +ssize_t put_lockmeter_info(const char *buffer, size_t len) +{ + int error = 0; + int dirsize, countsize, read_lock_countsize, hashsize; + int cpu; + char put_char; + int i, read_lock_blocks, flags; + rwlock_t *lock_ptr; + struct timeval tv; + + if (len <= 0) + return -EINVAL; + + _raw_spin_lock(&lstat_control.control_lock); + + get_user(put_char, buffer); + switch (put_char) { + + case LSTAT_OFF: + if (lstat_control.state != LSTAT_OFF) { + /* + * To avoid seeing read lock hold times in an inconsisent state, + * we have to follow this protocol to turn off statistics + */ + do { local_irq_save(flags); } while(0); + /* getting this lock will stop any read lock block allocations */ + _raw_spin_lock(&lstat_control.directory_lock); + /* keep any more read lock blocks from being allocated */ + lstat_control.state = LSTAT_OFF; + /* record how may read lock blocks there are */ + read_lock_blocks = lstat_control.next_free_read_lock_index; + _raw_spin_unlock(&lstat_control.directory_lock); + /* now go through the list of read locks */ + cpu = THIS_CPU_NUMBER; + for(i=1;ictors_start && mod->ctors_end) + remove_bb_link(mod); +#endif + /* Module unload stuff */ module_unload_free(mod); @@ -1573,6 +1583,13 @@ static struct module *load_module(void _ /* Module has been moved. */ mod = (void *)sechdrs[modindex].sh_addr; +#ifdef CONFIG_GCOV_PROFILE + modindex = find_sec(hdr, sechdrs, secstrings, ".ctors"); + mod->ctors_start = (char *)sechdrs[modindex].sh_addr; + mod->ctors_end = (char *)(mod->ctors_start + + sechdrs[modindex].sh_size); +#endif + /* Now we've moved module, initialize linked lists, etc. 
*/ module_unload_init(mod); @@ -1722,6 +1739,12 @@ sys_init_module(void __user *umod, /* Start the module */ ret = mod->init(); + +#ifdef CONFIG_GCOV_PROFILE + if (mod->ctors_start && mod->ctors_end) { + do_global_ctors(mod->ctors_start, mod->ctors_end, mod, 1); + } +#endif if (ret < 0) { /* Init routine failed: abort. Try to protect us from buggy refcounters. */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/pid.c 901-mjb1.1/kernel/pid.c --- 000-virgin/kernel/pid.c Fri May 30 19:02:24 2003 +++ 901-mjb1.1/kernel/pid.c Wed Aug 13 20:51:45 2003 @@ -172,13 +172,22 @@ int attach_pid(task_t *task, enum pid_ty if (pid) atomic_inc(&pid->count); else { + struct list_head *elem, *bucket; + pid = &task->pids[type].pid; pid->nr = nr; atomic_set(&pid->count, 1); INIT_LIST_HEAD(&pid->task_list); pid->task = task; get_task_struct(task); - list_add(&pid->hash_chain, &pid_hash[type][pid_hashfn(nr)]); + bucket = &pid_hash[type][pid_hashfn(nr)]; + __list_for_each(elem, bucket) { + struct pid *walk; + walk = list_entry(elem, struct pid, hash_chain); + if (walk->nr > nr) + break; + } + list_add_tail(&pid->hash_chain, elem); } list_add_tail(&task->pids[type].pid_chain, &pid->task_list); task->pids[type].pidptr = pid; @@ -219,6 +228,42 @@ void detach_pid(task_t *task, enum pid_t if (find_pid(type, nr)) return; free_pidmap(nr); +} + +/** + * find_next_pid - Returns the pid of next task. + * @pid: Starting point for the search. + * + * Returns the pid number of the task that follows behind + * "pid". The function works even if the input pid value + * is not valid anymore. + */ + int find_next_pid(int pid) +{ + struct list_head *elem, *bucket; + + if(!pid) { + bucket = &pid_hash[PIDTYPE_PID][0]; + } else { + bucket = &pid_hash[PIDTYPE_PID][pid_hashfn(pid)]; + } + read_lock(&tasklist_lock); +next_chain: + __list_for_each(elem, bucket) { + struct pid *walk; + walk = list_entry(elem, struct pid, hash_chain); + if (walk->nr > pid) { + pid = walk->nr; + read_unlock(&tasklist_lock); + return pid; + } + } + pid = 0; + bucket++; + if (bucket < &pid_hash[PIDTYPE_PID][1< #include -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) #else #define cpu_to_node_mask(cpu) (cpu_online_map) @@ -59,6 +59,11 @@ #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +/* the FIXED_1 gunk is so running averages don't vanish prematurely */ +#define RAVG_WEIGHT 128 +#define RAVG_FACTOR (RAVG_WEIGHT*FIXED_1) +#define RUNNING_AVG(x,y) (((RAVG_WEIGHT-1)*(x)+RAVG_FACTOR*(y))/RAVG_WEIGHT) + /* * These are the 'tuning knobs' of the scheduler: * @@ -66,16 +71,29 @@ * maximum timeslice is 200 msecs. Timeslices get refilled after * they expire. 
*/ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (200 * HZ / 1000) -#define CHILD_PENALTY 50 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (10*HZ) -#define STARVATION_LIMIT (10*HZ) -#define NODE_THRESHOLD 125 +int min_timeslice = (10 * HZ) / 1000; +int max_timeslice = (200 * HZ) / 1000; +int child_penalty = 50; +int parent_penalty = 100; +int exit_weight = 3; +int prio_bonus_ratio = 25; +int interactive_delta = 2; +int max_sleep_avg = 10 * HZ; +int starvation_limit = 10 * HZ; +int node_threshold = 125; + +#define MIN_TIMESLICE (min_timeslice) +#define MAX_TIMESLICE (max_timeslice) +#define CHILD_PENALTY (child_penalty) +#define PARENT_PENALTY (parent_penalty) +#define EXIT_WEIGHT (exit_weight) +#define PRIO_BONUS_RATIO (prio_bonus_ratio) +#define INTERACTIVE_DELTA (interactive_delta) +#define MAX_SLEEP_AVG (max_sleep_avg) +#define STARVATION_LIMIT (starvation_limit) +#define NODE_THRESHOLD (node_threshold) + +#define TIMESLICE_GRANULARITY (HZ/20 ?: 1) /* * If a task is 'interactive' then we reinsert it in the active @@ -163,7 +181,7 @@ struct runqueue { struct mm_struct *prev_mm; prio_array_t *active, *expired, arrays[2]; int prev_cpu_load[NR_CPUS]; -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED atomic_t *node_nr_running; int prev_node_load[MAX_NUMNODES]; #endif @@ -171,6 +189,8 @@ struct runqueue { struct list_head migration_queue; atomic_t nr_iowait; + + struct sched_info info; }; static DEFINE_PER_CPU(struct runqueue, runqueues); @@ -190,7 +210,7 @@ static DEFINE_PER_CPU(struct runqueue, r # define task_running(rq, p) ((rq)->curr == (p)) #endif -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED /* * Keep track of running tasks. @@ -227,13 +247,13 @@ __init void node_nr_running_init(void) } } -#else /* !CONFIG_NUMA */ +#else /* !CONFIG_NUMA_SCHED */ # define nr_running_init(rq) do { } while (0) # define nr_running_inc(rq) do { (rq)->nr_running++; } while (0) # define nr_running_dec(rq) do { (rq)->nr_running--; } while (0) -#endif /* CONFIG_NUMA */ +#endif /* CONFIG_NUMA_SCHED */ /* * task_rq_lock - lock the runqueue a given task resides on and disable @@ -260,6 +280,74 @@ static inline void task_rq_unlock(runque spin_unlock_irqrestore(&rq->lock, *flags); } +static inline void sched_info_arrive(task_t *t) +{ + unsigned long now = jiffies; + unsigned long diff = now - t->sched_info.last_arrival; + struct runqueue *rq = task_rq(t); + + t->sched_info.inter_arrival_time = + RUNNING_AVG(t->sched_info.inter_arrival_time, diff); + t->sched_info.last_arrival = now; + + if (!rq) + return; + diff = now - rq->info.last_arrival; + rq->info.inter_arrival_time = + RUNNING_AVG(rq->info.inter_arrival_time, diff); + rq->info.last_arrival = now; +} + +/* is this ever used? 
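With the compile-time constants above turned into plain ints, the values become runtime-tunable through the sched_table added to kernel/sysctl.c later in this patch, and show up under /proc/sys/sched as described in the documentation hunk. A small user-space sketch of reading one of the knobs through procfs (error handling kept minimal; the files exist only on kernels carrying this patch):

    #include <stdio.h>

    /* Read an integer scheduler knob from /proc/sys/sched. */
    static long read_knob(const char *name)
    {
        char path[128];
        long val = -1;
        FILE *f;

        snprintf(path, sizeof(path), "/proc/sys/sched/%s", name);
        f = fopen(path, "r");
        if (!f)
            return -1;
        if (fscanf(f, "%ld", &val) != 1)
            val = -1;
        fclose(f);
        return val;
    }

    int main(void)
    {
        printf("max_timeslice = %ld\n", read_knob("max_timeslice"));
        printf("min_timeslice = %ld\n", read_knob("min_timeslice"));
        /* writing works the same way from a root shell, e.g.
         *   echo 150 > /proc/sys/sched/max_timeslice
         */
        return 0;
    }
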
*/ +static inline void sched_info_depart(task_t *t) +{ + struct runqueue *rq = task_rq(t); + unsigned long diff, now = jiffies; + + diff = now - t->sched_info.began_service; + t->sched_info.service_time = + RUNNING_AVG(t->sched_info.service_time, diff); + + if (!rq) + return; + diff = now - rq->info.began_service; + rq->info.service_time = + RUNNING_AVG(rq->info.service_time, diff); +} + +static inline void sched_info_switch(task_t *prev, task_t *next) +{ + struct runqueue *rq = task_rq(prev); + unsigned long diff, now = jiffies; + + /* prev now departs the cpu */ + sched_info_depart(prev); + + /* only for involuntary context switches */ + if (prev->state == TASK_RUNNING) + sched_info_arrive(prev); + + diff = now - next->sched_info.last_arrival; + next->sched_info.response_time = + RUNNING_AVG(next->sched_info.response_time, diff); + next->sched_info.began_service = now; + + if (!rq) + return; + /* yes, reusing next's service time is valid */ + rq->info.response_time = + RUNNING_AVG(rq->info.response_time, diff); + rq->info.began_service = now; + + if (prev->state != TASK_RUNNING) + return; + /* if prev arrived subtract rq's last arrival from its arrival */ + diff = now - rq->info.last_arrival; + rq->info.inter_arrival_time = + RUNNING_AVG(rq->info.inter_arrival_time, diff); + rq->info.last_arrival = now; +} + /* * rq_lock - lock a given runqueue and disable interrupts. */ @@ -492,15 +580,18 @@ repeat_lock_task: (p->cpus_allowed & (1UL << smp_processor_id())))) { set_task_cpu(p, smp_processor_id()); + sched_info_arrive(p); task_rq_unlock(rq, &flags); goto repeat_lock_task; } if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - if (sync) + if (sync) { + sched_info_arrive(p); __activate_task(p, rq); - else { + } else { activate_task(p, rq); + sched_info_arrive(p); if (p->prio < rq->curr->prio) resched_task(rq->curr); } @@ -554,6 +645,7 @@ void wake_up_forked_process(task_t * p) p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100; p->prio = effective_prio(p); set_task_cpu(p, smp_processor_id()); + sched_info_arrive(p); if (unlikely(!current->array)) __activate_task(p, rq); @@ -679,6 +771,11 @@ unsigned long nr_running(void) return sum; } +unsigned long nr_running_cpu(int cpu) +{ + return cpu_rq(cpu)->nr_running; +} + unsigned long nr_uninterruptible(void) { unsigned long i, sum = 0; @@ -715,6 +812,11 @@ unsigned long nr_iowait(void) return sum; } +void cpu_sched_info(struct sched_info *info, int cpu) +{ + memcpy(info, &cpu_rq(cpu)->info, sizeof(struct sched_info)); +} + /* * double_rq_lock - safely lock two runqueues * @@ -749,7 +851,7 @@ static inline void double_rq_unlock(runq spin_unlock(&rq2->lock); } -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED /* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only @@ -774,38 +876,75 @@ static void sched_migrate_task(task_t *p * Find the least loaded CPU. Slightly favor the current CPU by * setting its runqueue length as the minimum to start. */ + static int sched_best_cpu(struct task_struct *p) { - int i, minload, load, best_cpu, node = 0; + int cpu, node, minload, load, best_cpu, best_node; + int this_cpu, this_node, this_node_load; unsigned long cpumask; - best_cpu = task_cpu(p); - if (cpu_rq(best_cpu)->nr_running <= 2) - return best_cpu; + this_cpu = best_cpu = task_cpu(p); + if (cpu_rq(this_cpu)->nr_running <= 2) + return this_cpu; + this_node = best_node = cpu_to_node(this_cpu); + + /* + * First look for any node-local idle queue and use that. 
+ * This improves performance under light loads (mbligh). + * In case this node turns out to be the lightest node, store the best + * cpu that we find, so we don't go sniffing the same runqueues again. + */ + minload = 10000000; + cpumask = node_to_cpumask(this_node); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!(cpumask & (1UL << cpu))) + continue; + load = cpu_rq(cpu)->nr_running; + if (load == 0) + return cpu; + if (load < minload) { + minload = load; + best_cpu = cpu; + } + } + /* + * Now find the lightest loaded node, and put it in best_node + * + * Node load is always divided by nr_cpus_node to normalise load + * values in case cpu count differs from node to node. We first + * multiply node_nr_running by 16 to get a little better resolution. + */ minload = 10000000; - for_each_node_with_cpus(i) { - /* - * Node load is always divided by nr_cpus_node to normalise - * load values in case cpu count differs from node to node. - * We first multiply node_nr_running by 10 to get a little - * better resolution. - */ - load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i); + this_node_load = 16 * atomic_read(&node_nr_running[this_node]) + / nr_cpus_node(this_node); + for_each_node_with_cpus(node) { + if (node == this_node) + load = this_node_load; + else + load = 16 * atomic_read(&node_nr_running[node]) + / nr_cpus_node(node); if (load < minload) { minload = load; - node = i; + best_node = node; } } + /* If we chose this node, we already did the legwork earlier */ + if (best_node == this_node) + return best_cpu; + + /* Now find the lightest loaded cpu on best_node, and use that */ minload = 10000000; - cpumask = node_to_cpumask(node); - for (i = 0; i < NR_CPUS; ++i) { - if (!(cpumask & (1UL << i))) + best_cpu = this_cpu; + cpumask = node_to_cpumask(best_node); + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (!(cpumask & (1UL << cpu))) continue; - if (cpu_rq(i)->nr_running < minload) { - best_cpu = i; - minload = cpu_rq(i)->nr_running; + load = cpu_rq(cpu)->nr_running; + if (load < minload) { + minload = load; + best_cpu = cpu; } } return best_cpu; @@ -856,7 +995,10 @@ static int find_busiest_node(int this_no return node; } -#endif /* CONFIG_NUMA */ +#endif /* CONFIG_NUMA_SCHED */ + +int idle_node_rebalance_ratio = 10; +int busy_node_rebalance_ratio = 2; #ifdef CONFIG_SMP @@ -1085,10 +1227,10 @@ out: */ #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) #define BUSY_REBALANCE_TICK (HZ/5 ?: 1) -#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5) -#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2) +#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * idle_node_rebalance_ratio) +#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * busy_node_rebalance_ratio) -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) { int node = find_busiest_node(cpu_to_node(this_cpu)); @@ -1119,7 +1261,7 @@ static void rebalance_tick(runqueue_t *t * are not balanced.) */ if (idle) { -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED if (!(j % IDLE_NODE_REBALANCE_TICK)) balance_node(this_rq, idle, this_cpu); #endif @@ -1130,7 +1272,7 @@ static void rebalance_tick(runqueue_t *t } return; } -#ifdef CONFIG_NUMA +#ifdef CONFIG_NUMA_SCHED if (!(j % BUSY_NODE_REBALANCE_TICK)) balance_node(this_rq, idle, this_cpu); #endif @@ -1246,6 +1388,27 @@ void scheduler_tick(int user_ticks, int enqueue_task(p, rq->expired); } else enqueue_task(p, rq->active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. 
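The normalisation in sched_best_cpu() above exists because raw run-queue counts are not comparable between nodes with different CPU counts; multiplying node_nr_running by 16 before the divide keeps some fractional resolution in integer arithmetic. A small worked example, assuming one 4-CPU node and one 2-CPU node (the task counts are made up for illustration):

    #include <stdio.h>

    /* 16 * running tasks / cpus on the node, as in sched_best_cpu() above */
    static int node_load(int nr_running, int nr_cpus)
    {
        return 16 * nr_running / nr_cpus;
    }

    int main(void)
    {
        /* node 0: 4 CPUs, 6 runnable tasks; node 1: 2 CPUs, 4 runnable tasks */
        int load0 = node_load(6, 4);    /* 24, i.e. 1.5 tasks per CPU */
        int load1 = node_load(4, 2);    /* 32, i.e. 2.0 tasks per CPU */

        printf("node0 load=%d node1 load=%d -> prefer node %d\n",
               load0, load1, load0 <= load1 ? 0 : 1);
        return 0;
    }
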
We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + */ + if (!(p->time_slice % TIMESLICE_GRANULARITY) && + (p->array == rq->active)) { + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + enqueue_task(p, rq->active); + } } out_unlock: spin_unlock(&rq->lock); @@ -1258,7 +1421,11 @@ void scheduling_functions_start_here(voi /* * schedule() is the main scheduler function. */ +#ifdef CONFIG_KGDB_THREAD +asmlinkage void do_schedule(void) +#else asmlinkage void schedule(void) +#endif { task_t *prev, *next; runqueue_t *rq; @@ -1339,6 +1506,7 @@ switch_tasks: if (likely(prev != next)) { rq->nr_switches++; + sched_info_switch(prev, next); rq->curr = next; prepare_arch_switch(rq, next); @@ -1485,6 +1653,20 @@ void complete_all(struct completion *x) __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0, 0); spin_unlock_irqrestore(&x->wait.lock, flags); } + +#ifdef CONFIG_KGDB_THREAD +asmlinkage void user_schedule(void) +{ + current->thread.kgdbregs = NULL; + do_schedule(); +} + +asmlinkage void kern_do_schedule(struct pt_regs regs) +{ + current->thread.kgdbregs = ®s; + do_schedule(); +} +#endif void wait_for_completion(struct completion *x) { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/sys.c 901-mjb1.1/kernel/sys.c --- 000-virgin/kernel/sys.c Tue Aug 5 20:01:56 2003 +++ 901-mjb1.1/kernel/sys.c Wed Aug 13 20:51:50 2003 @@ -235,6 +235,7 @@ cond_syscall(sys_epoll_ctl) cond_syscall(sys_epoll_wait) cond_syscall(sys_pciconfig_read) cond_syscall(sys_pciconfig_write) +cond_syscall(sys_mbind) static int set_one_prio(struct task_struct *p, int niceval, int error) { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/sysctl.c 901-mjb1.1/kernel/sysctl.c --- 000-virgin/kernel/sysctl.c Wed Aug 13 20:24:33 2003 +++ 901-mjb1.1/kernel/sysctl.c Wed Aug 13 20:28:58 2003 @@ -58,6 +58,18 @@ extern int cad_pid; extern int pid_max; extern int sysctl_lower_zone_protection; extern int min_free_kbytes; +extern int min_timeslice; +extern int max_timeslice; +extern int child_penalty; +extern int parent_penalty; +extern int exit_weight; +extern int prio_bonus_ratio; +extern int interactive_delta; +extern int max_sleep_avg; +extern int starvation_limit; +extern int node_threshold; +extern int idle_node_rebalance_ratio; +extern int busy_node_rebalance_ratio; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -120,6 +132,7 @@ static struct ctl_table_header root_tabl static ctl_table kern_table[]; static ctl_table vm_table[]; +static ctl_table sched_table[]; #ifdef CONFIG_NET extern ctl_table net_table[]; #endif @@ -199,6 +212,12 @@ static ctl_table root_table[] = { .mode = 0555, .child = dev_table, }, + { + .ctl_name = CTL_SCHED, + .procname = "sched", + .mode = 0555, + .child = sched_table, + }, { .ctl_name = 0 } }; @@ -586,6 +605,7 @@ static ctl_table kern_table[] = { /* Constants for minimum and maximum testing in vm_table. We use these as one-element integer vectors. 
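The TIMESLICE_GRANULARITY requeue added to scheduler_tick() above fires whenever the remaining timeslice crosses a granularity boundary, so a long slice is consumed in several round-robin chunks instead of letting one task monopolise the CPU. A quick arithmetic sketch, assuming HZ=1000 so the granularity (HZ/20) is 50 ticks (the kernel definition uses the GNU "?:" shorthand; the portable form is spelled out here):

    #include <stdio.h>

    #define HZ                    1000                        /* assumed for the example */
    #define TIMESLICE_GRANULARITY ((HZ / 20) ? (HZ / 20) : 1) /* 50 ticks = 50 ms */

    int main(void)
    {
        int slice = 200 * HZ / 1000;    /* a 200 ms timeslice = 200 ticks */
        int remaining;

        /* scheduler_tick() decrements first, then checks the remainder */
        for (remaining = slice - 1; remaining > 0; remaining--)
            if (!(remaining % TIMESLICE_GRANULARITY))
                printf("requeue with %d ticks left\n", remaining);
        return 0;
    }

So a 200-tick slice is handed out in four 50-tick pieces, with the task dropped to the back of its priority level between pieces; the timeslice itself is never shortened.
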
*/ static int zero; +static int one = 1; static int one_hundred = 100; @@ -805,6 +825,48 @@ static ctl_table debug_table[] = { static ctl_table dev_table[] = { { .ctl_name = 0 } }; + +static ctl_table sched_table[] = { + {SCHED_MAX_TIMESLICE, "max_timeslice", &max_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_MIN_TIMESLICE, "min_timeslice", &min_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_CHILD_PENALTY, "child_penalty", &child_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PARENT_PENALTY, "parent_penalty", &parent_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_EXIT_WEIGHT, "exit_weight", &exit_weight, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PRIO_BONUS_RATIO, "prio_bonus_ratio", &prio_bonus_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_INTERACTIVE_DELTA, "interactive_delta", &interactive_delta, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_MAX_SLEEP_AVG, "max_sleep_avg", &max_sleep_avg, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_STARVATION_LIMIT, "starvation_limit", &starvation_limit, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_NODE_THRESHOLD, "node_threshold", &node_threshold, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + sysctl_intvec, NULL, &one, NULL}, + {SCHED_IDLE_NODE_REBALANCE_RATIO, "idle_node_rebalance_ratio", + &idle_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_BUSY_NODE_REBALANCE_RATIO, "busy_node_rebalance_ratio", + &busy_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {0} +}; extern void init_irq_proc (void); diff -urpN -X /home/fletch/.diff.exclude 000-virgin/kernel/timer.c 901-mjb1.1/kernel/timer.c --- 000-virgin/kernel/timer.c Wed Aug 13 20:24:33 2003 +++ 901-mjb1.1/kernel/timer.c Wed Aug 13 20:51:40 2003 @@ -756,6 +756,8 @@ static unsigned long count_active_tasks( * Requires xtime_lock to access. */ unsigned long avenrun[3]; +unsigned long tasks_running[3]; +DEFINE_PER_CPU(unsigned long[3],cpu_tasks_running); /* * calc_load - given tick count, update the avenrun load estimates. @@ -763,7 +765,7 @@ unsigned long avenrun[3]; */ static inline void calc_load(unsigned long ticks) { - unsigned long active_tasks; /* fixed-point */ + unsigned long active_tasks, running_tasks; /* fixed-point */ static int count = LOAD_FREQ; count -= ticks; @@ -773,7 +775,37 @@ static inline void calc_load(unsigned lo CALC_LOAD(avenrun[0], EXP_1, active_tasks); CALC_LOAD(avenrun[1], EXP_5, active_tasks); CALC_LOAD(avenrun[2], EXP_15, active_tasks); + running_tasks = nr_running() * FIXED_1; + CALC_LOAD(tasks_running[0], EXP_1, running_tasks); + CALC_LOAD(tasks_running[1], EXP_5, running_tasks); + CALC_LOAD(tasks_running[2], EXP_15, running_tasks); } +} + +/* + * This does the frequency calculation a little bit different from the + * global version above. It doesn't ever look at the kernel's concept + * of time, it just updates that stats every LOAD_FREQ times into the + * function. 
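tasks_running[] above is maintained with the same CALC_LOAD() fixed-point decay the stock kernel uses for avenrun[]; the macro itself lives in a header and is not part of this hunk. A user-space sketch of that decay, assuming the kernel's usual constants FIXED_1 = 2048 and EXP_1 = 1884 (the 1-minute factor), just to show how a runnable-task count turns into a load average over repeated 5-second samples:

    #include <stdio.h>

    #define FSHIFT   11
    #define FIXED_1  (1 << FSHIFT)   /* 2048 */
    #define EXP_1    1884            /* 1-minute decay factor (assumed) */

    /* Same shape as the kernel's CALC_LOAD macro (assumed, not in this hunk). */
    #define CALC_LOAD(load, exp, n) \
        do { (load) *= (exp); (load) += (n) * (FIXED_1 - (exp)); (load) >>= FSHIFT; } while (0)

    int main(void)
    {
        unsigned long avenrun = 0;
        int i;

        /* pretend 3 tasks are runnable at every 5-second sample for 5 minutes */
        for (i = 0; i < 60; i++)
            CALC_LOAD(avenrun, EXP_1, 3UL * FIXED_1);

        printf("load ~= %lu.%02lu\n",
               avenrun >> FSHIFT, ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
        return 0;
    }
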
+ * + * Using jiffies is more accurate, but there _are_ just statistics, so + * they're not worth messing with xtime_lock and company. If we miss + * an interrupt or two, big deal. + */ +void calc_load_cpu(int cpu) +{ + unsigned long running_tasks; + static DEFINE_PER_CPU(int, count) = { LOAD_FREQ }; + + per_cpu(count, cpu)--; + if (per_cpu(count, cpu) != 0) + return; + + per_cpu(count, cpu) += LOAD_FREQ; + running_tasks = nr_running_cpu(cpu) * FIXED_1; + CALC_LOAD(per_cpu(cpu_tasks_running, cpu)[0], EXP_1, running_tasks); + CALC_LOAD(per_cpu(cpu_tasks_running, cpu)[1], EXP_5, running_tasks); + CALC_LOAD(per_cpu(cpu_tasks_running, cpu)[2], EXP_15, running_tasks); } /* jiffies at the most recent update of wall time */ diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/Makefile 901-mjb1.1/mm/Makefile --- 000-virgin/mm/Makefile Thu Feb 13 11:08:15 2003 +++ 901-mjb1.1/mm/Makefile Wed Aug 13 20:51:50 2003 @@ -7,8 +7,10 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ shmem.o vmalloc.o -obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ +obj-y := bootmem.o fadvise.o filemap.o mempool.o oom_kill.o \ page_alloc.o page-writeback.o pdflush.o readahead.o \ slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o + +obj-$(CONFIG_NUMA) += mbind.o diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/filemap.c 901-mjb1.1/mm/filemap.c --- 000-virgin/mm/filemap.c Wed Aug 13 20:24:33 2003 +++ 901-mjb1.1/mm/filemap.c Wed Aug 13 20:29:24 2003 @@ -63,6 +63,9 @@ * ->mmap_sem * ->i_shared_sem (various places) * + * ->lock_page + * ->i_shared_sem (page_convert_anon) + * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) * ->mapping->page_lock (__sync_single_inode) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/fremap.c 901-mjb1.1/mm/fremap.c --- 000-virgin/mm/fremap.c Fri May 30 19:02:24 2003 +++ 901-mjb1.1/mm/fremap.c Wed Aug 13 20:51:52 2003 @@ -36,7 +36,7 @@ static inline int zap_pte(struct mm_stru set_page_dirty(page); page_remove_rmap(page, ptep); page_cache_release(page); - mm->rss--; + dec_rss(mm, page); } } return 1; @@ -60,10 +60,26 @@ int install_page(struct mm_struct *mm, s pgd_t *pgd; pmd_t *pmd; struct pte_chain *pte_chain; + unsigned long pgidx; pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) goto err; + + /* + * Convert this page to anon for objrmap if it's nonlinear + */ + pgidx = (addr - vma->vm_start) >> PAGE_SHIFT; + pgidx += vma->vm_pgoff; + pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + if (!PageAnon(page) && (page->index != pgidx)) { + lock_page(page); + err = page_convert_anon(page); + unlock_page(page); + if (err < 0) + goto err_free; + } + pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); @@ -77,7 +93,7 @@ int install_page(struct mm_struct *mm, s flush = zap_pte(mm, vma, addr, pte); - mm->rss++; + inc_rss(mm, page); flush_icache_page(vma, page); set_pte(pte, mk_pte(page, prot)); pte_chain = page_add_rmap(page, pte, pte_chain); @@ -85,12 +101,11 @@ int install_page(struct mm_struct *mm, s if (flush) flush_tlb_page(vma, addr); update_mmu_cache(vma, addr, *pte); - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return 0; + err = 0; err_unlock: spin_unlock(&mm->page_table_lock); +err_free: pte_chain_free(pte_chain); err: return err; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/mbind.c 901-mjb1.1/mm/mbind.c --- 000-virgin/mm/mbind.c Wed Dec 31 16:00:00 1969 +++ 901-mjb1.1/mm/mbind.c Wed Aug 13 20:51:50 2003 @@ -0,0 +1,147 @@ +/* + * 
mm/mbind.c + * + * Written by: Matthew Dobson, IBM Corporation + * + * Copyright (C) 2003, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to + */ +#include +#include +#include +#include +#include + +/* Translate a cpumask to a nodemask */ +static inline void cpumask_to_nodemask(unsigned long * cpumask, unsigned long * nodemask) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) + if (test_bit(i, cpumask)) + set_bit(cpu_to_node(i), nodemask); +} + +/* + * Adds the zones belonging to @pgdat to @zonelist. Returns the next + * index in @zonelist. + */ +static inline int add_node(pg_data_t *pgdat, struct zonelist *zonelist, int zone_num) +{ + int i; + struct zone *zone; + + for (i = MAX_NR_ZONES-1; i >=0 ; i--) { + zone = pgdat->node_zones + i; + if (zone->present_pages) + zonelist->zones[zone_num++] = zone; + } + return zone_num; +} + +/* Builds a binding for a region of memory, based on a bitmask of nodes. */ +static inline int build_binding(unsigned long * nodemask, struct binding *binding) +{ + int node, zone_num; + + memset(binding, 0, sizeof(struct binding)); + + /* Build binding zonelist */ + for (node = 0, zone_num = 0; node < MAX_NUMNODES; node++) + if (test_bit(node, nodemask) && node_online(node)) + zone_num = add_node(NODE_DATA(node), + &binding->zonelist, zone_num); + binding->zonelist.zones[zone_num] = NULL; + + if (zone_num == 0) + /* No zones were added to the zonelist. Let the caller know. */ + return -EINVAL; + + return 0; +} + + +/* + * mbind - Bind a range of a process' VM space to a set of memory blocks according to + * a predefined policy. + * @start: beginning address of memory region to bind + * @len: length of memory region to bind + * @mask_ptr: pointer to bitmask of cpus + * @mask_len: length of the bitmask + * @policy: flag specifying the policy to use for the segment + */ +asmlinkage unsigned long sys_mbind(unsigned long start, unsigned long len, + unsigned long *mask_ptr, unsigned int mask_len, unsigned long policy) +{ + DECLARE_BITMAP(cpu_mask, NR_CPUS); + DECLARE_BITMAP(node_mask, MAX_NUMNODES); + struct vm_area_struct *vma = NULL; + struct address_space *mapping; + int copy_len, error = 0; + + /* Deal with getting cpu_mask from userspace & translating to node_mask */ + CLEAR_BITMAP(cpu_mask, NR_CPUS); + CLEAR_BITMAP(node_mask, MAX_NUMNODES); + copy_len = min(mask_len, (unsigned int)NR_CPUS); + if (copy_from_user(cpu_mask, mask_ptr, (copy_len+7)/8)) { + error = -EFAULT; + goto out; + } + cpumask_to_nodemask(cpu_mask, node_mask); + + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, start); + up_read(¤t->mm->mmap_sem); + /* This is an ugly, gross hack. This is purely because I've hurt my + * brain trying to come up with a brilliant way of implementing this + * for VMA's in general. 
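sys_mbind() above takes a user-supplied cpu bitmask, folds it into a node mask, and attaches a zonelist binding to the shared-memory segment backing the given address range. A hedged user-space sketch of invoking it follows; NR_mbind_patched is a placeholder (the real number is assigned by this patch's arch changes and is not known here, and this call is unrelated to the later mainline mbind(2), which has a different ABI), and the policy argument is passed through unused:

    #include <stdio.h>
    #include <sys/ipc.h>
    #include <sys/shm.h>
    #include <unistd.h>

    /* Placeholder only: fill in the number this patch assigns on your arch.
     * With -1 the call harmlessly fails with ENOSYS on unpatched kernels. */
    #define NR_mbind_patched  (-1L)

    int main(void)
    {
        unsigned long cpu_mask = 0x1;            /* CPUs of interest: cpu 0 */
        size_t len = 1 << 20;                    /* 1 MB shm segment */
        int shmid = shmget(IPC_PRIVATE, len, IPC_CREAT | 0600);
        void *addr;
        long ret;

        if (shmid < 0) {
            perror("shmget");
            return 1;
        }
        addr = shmat(shmid, NULL, 0);
        if (addr == (void *)-1) {
            perror("shmat");
            return 1;
        }

        /* bind the shm segment's pages to the nodes holding the masked CPUs */
        ret = syscall(NR_mbind_patched, (unsigned long)addr, len,
                      &cpu_mask, 8 * sizeof(cpu_mask), 0 /* policy, unused */);
        printf("mbind returned %ld\n", ret);

        shmdt(addr);
        shmctl(shmid, IPC_RMID, NULL);
        return 0;
    }
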
Shared Memory VMA's lend themselves to binding + * both because of how they're implemented, and their actual uses. + * If anyone has a great place to squirrel-away some data about the + * requested binding, and a way to easily force the allocator to respect + * these bindings, then send a patch, or let me know. Otherwise, this + * will have to wait for a stroke of insight. + */ + if (!(vma && vma->vm_file && vma->vm_ops && + vma->vm_ops->nopage == shmem_nopage)) { + /* This isn't a shm segment. For now, we bail. */ + error = -EINVAL; + goto out; + } + + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + if (mapping->binding) { + kfree(mapping->binding); + mapping->binding = NULL; + } + mapping->binding = kmalloc(sizeof(struct binding), GFP_KERNEL); + if (!mapping->binding) { + error = -ENOMEM; + goto out; + } + error = build_binding(node_mask, mapping->binding); + if (error) { + kfree(mapping->binding); + mapping->binding = NULL; + } + +out: + return error; +} diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/memory.c 901-mjb1.1/mm/memory.c --- 000-virgin/mm/memory.c Wed Aug 13 20:24:33 2003 +++ 901-mjb1.1/mm/memory.c Wed Aug 13 20:51:52 2003 @@ -100,10 +100,10 @@ static inline void free_one_pmd(struct m pte_free_tlb(tlb, page); } -static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) +static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * pgd, unsigned long pgdi) { - int j; - pmd_t * pmd; + pmd_t * pmd, * md, * emd; + pgd_t *dir = pgd + pgdi; if (pgd_none(*dir)) return; @@ -114,8 +114,21 @@ static inline void free_one_pgd(struct m } pmd = pmd_offset(dir, 0); pgd_clear(dir); - for (j = 0; j < PTRS_PER_PMD ; j++) - free_one_pmd(tlb, pmd+j); + /* + * Beware if changing the loop below. It once used int j, + * for (j = 0; j < PTRS_PER_PMD; j++) + * free_one_pmd(pmd+j); + * but some older i386 compilers (e.g. egcs-2.91.66, gcc-2.95.3) + * terminated the loop with a _signed_ address comparison + * using "jle", when configured for HIGHMEM64GB (X86_PAE). + * If also configured for 3GB of kernel virtual address space, + * if page at physical 0x3ffff000 virtual 0x7ffff000 is used as + * a pmd, when that mm exits the loop goes on to free "entries" + * found at 0x80000000 onwards. The loop below compiles instead + * to be terminated by unsigned address comparison using "jb". + */ + for (md = pmd, emd = pmd + USER_PTRS_PER_PMD(pgdi); md < emd; md++) + free_one_pmd(tlb,md); pmd_free_tlb(tlb, pmd); } @@ -128,11 +141,11 @@ static inline void free_one_pgd(struct m void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr) { pgd_t * page_dir = tlb->mm->pgd; - - page_dir += first; + int index = first; + do { - free_one_pgd(tlb, page_dir); - page_dir++; + free_one_pgd(tlb, page_dir, index); + index++; } while (--nr); } @@ -319,7 +332,7 @@ skip_copy_pte_range: pte = pte_mkclean(pte); pte = pte_mkold(pte); get_page(page); - dst->rss++; + inc_rss(dst, page); set_pte(dst_pte, pte); pte_chain = page_add_rmap(page, dst_pte, @@ -411,7 +424,14 @@ zap_pte_range(struct mmu_gather *tlb, pm if (page->mapping && pte_young(pte) && !PageSwapCache(page)) mark_page_accessed(page); - tlb->freed++; + /* + * While we have the page that is being + * freed handy, make sure we decrement + * the mm's RSS accordingly. This is + * only important for NUMA per-node + * RSS accounting. 
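The rss manipulations in the mm/ hunks go through inc_rss()/dec_rss()/zero_rss() helpers whose definitions are not included in this excerpt. The sketch below shows one plausible shape for them, purely as an assumption for illustration: the usual global counter plus a per-node counter indexed by the page's home node, which is what makes the per-node RSS accounting mentioned above possible.

    #include <stdio.h>

    #define MAX_NUMNODES 4

    /* toy stand-ins for struct mm_struct and struct page */
    struct mm {
        unsigned long rss;
        unsigned long node_rss[MAX_NUMNODES];
    };
    struct page { int nid; };

    static int page_to_nid(struct page *page) { return page->nid; }

    /* plausible shape of the helpers used by this patch (assumption) */
    static void inc_rss(struct mm *mm, struct page *page)
    {
        mm->rss++;
        mm->node_rss[page_to_nid(page)]++;
    }

    static void dec_rss(struct mm *mm, struct page *page)
    {
        mm->rss--;
        mm->node_rss[page_to_nid(page)]--;
    }

    int main(void)
    {
        struct mm mm = { 0, { 0 } };
        struct page p0 = { 0 }, p1 = { 1 };

        inc_rss(&mm, &p0);
        inc_rss(&mm, &p1);
        dec_rss(&mm, &p0);
        printf("rss=%lu node0=%lu node1=%lu\n",
               mm.rss, mm.node_rss[0], mm.node_rss[1]);
        return 0;
    }
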
+ */ + dec_rss(tlb->mm, page); page_remove_rmap(page, ptep); tlb_remove_page(tlb, page); } @@ -1037,9 +1057,10 @@ static int do_wp_page(struct mm_struct * page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) - ++mm->rss; + inc_rss(mm, new_page); page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + SetPageAnon(new_page); pte_chain = page_add_rmap(new_page, page_table, pte_chain); lru_cache_add_active(new_page); @@ -1270,7 +1291,7 @@ static int do_swap_page(struct mm_struct if (vm_swap_full()) remove_exclusive_swap_page(page); - mm->rss++; + inc_rss(mm, page); pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) pte = pte_mkdirty(pte_mkwrite(pte)); @@ -1278,6 +1299,7 @@ static int do_swap_page(struct mm_struct flush_icache_page(vma, page); set_pte(page_table, pte); + SetPageAnon(page); pte_chain = page_add_rmap(page, page_table, pte_chain); /* No need to invalidate - it was non-present before */ @@ -1339,10 +1361,11 @@ do_anonymous_page(struct mm_struct *mm, ret = VM_FAULT_MINOR; goto out; } - mm->rss++; + inc_rss(mm, page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add_active(page); mark_page_accessed(page); + SetPageAnon(page); } set_pte(page_table, entry); @@ -1408,6 +1431,10 @@ retry: if (!pte_chain) goto oom; + /* See if nopage returned an anon page */ + if (!new_page->mapping || PageSwapCache(new_page)) + SetPageAnon(new_page); + /* * Should we do an early C-O-W break? */ @@ -1420,6 +1447,7 @@ retry: copy_user_highpage(page, new_page, address); page_cache_release(new_page); lru_cache_add_active(page); + SetPageAnon(page); new_page = page; } @@ -1449,7 +1477,7 @@ retry: */ /* Only go through if we didn't race with anybody else... */ if (pte_none(*page_table)) { - ++mm->rss; + inc_rss(mm, new_page); flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/mmap.c 901-mjb1.1/mm/mmap.c --- 000-virgin/mm/mmap.c Tue Aug 5 20:01:43 2003 +++ 901-mjb1.1/mm/mmap.c Wed Aug 13 20:51:52 2003 @@ -291,9 +291,7 @@ static void vma_link(struct mm_struct *m if (mapping) down(&mapping->i_shared_sem); - spin_lock(&mm->page_table_lock); __vma_link(mm, vma, prev, rb_link, rb_parent); - spin_unlock(&mm->page_table_lock); if (mapping) up(&mapping->i_shared_sem); @@ -322,6 +320,25 @@ static inline int is_mergeable_vma(struc return 1; } +static void move_vma_start(struct vm_area_struct *vma, unsigned long addr) +{ + struct inode *inode = NULL; + + if (vma->vm_file) { + inode = vma->vm_file->f_dentry->d_inode; + down(&inode->i_mapping->i_shared_sem); + } + if (inode) + __remove_shared_vm_struct(vma, inode); + /* If no vm_file, perhaps we should always keep vm_pgoff at 0?? */ + vma->vm_pgoff += (long)(addr - vma->vm_start) >> PAGE_SHIFT; + vma->vm_start = addr; + if (inode) { + __vma_link_file(vma); + up(&inode->i_mapping->i_shared_sem); + } +} + /* * Return true if we can merge this (vm_flags,file,vm_pgoff,size) * in front of (at a lower virtual address and file offset than) the vma. @@ -374,8 +391,6 @@ static int vma_merge(struct mm_struct *m unsigned long end, unsigned long vm_flags, struct file *file, unsigned long pgoff) { - spinlock_t * lock = &mm->page_table_lock; - /* * We later require that vma->vm_flags == vm_flags, so this tests * vma->vm_flags & VM_SPECIAL, too. 
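move_vma_start() above keeps the file offset consistent when a vma's start address moves: vm_pgoff must advance by exactly the number of pages the start moved forward. A tiny worked example of that bookkeeping (addresses and offsets chosen for illustration):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    struct vma { unsigned long vm_start, vm_end, vm_pgoff; };

    /* the arithmetic at the heart of move_vma_start() */
    static void move_vma_start(struct vma *vma, unsigned long addr)
    {
        vma->vm_pgoff += (long)(addr - vma->vm_start) >> PAGE_SHIFT;
        vma->vm_start = addr;
    }

    int main(void)
    {
        /* a mapping of file pages 10.. starting at 0x40000000 */
        struct vma v = { 0x40000000UL, 0x40010000UL, 10 };

        move_vma_start(&v, v.vm_start + 4 * PAGE_SIZE);
        printf("vm_start=%#lx vm_pgoff=%lu\n", v.vm_start, v.vm_pgoff);
        /* the page now at vm_start still maps file page vm_pgoff, i.e. 14 */
        return 0;
    }
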
@@ -403,7 +418,6 @@ static int vma_merge(struct mm_struct *m down(&inode->i_mapping->i_shared_sem); need_up = 1; } - spin_lock(lock); prev->vm_end = end; /* @@ -416,7 +430,6 @@ static int vma_merge(struct mm_struct *m prev->vm_end = next->vm_end; __vma_unlink(mm, next, prev); __remove_shared_vm_struct(next, inode); - spin_unlock(lock); if (need_up) up(&inode->i_mapping->i_shared_sem); if (file) @@ -426,7 +439,6 @@ static int vma_merge(struct mm_struct *m kmem_cache_free(vm_area_cachep, next); return 1; } - spin_unlock(lock); if (need_up) up(&inode->i_mapping->i_shared_sem); return 1; @@ -442,10 +454,7 @@ static int vma_merge(struct mm_struct *m pgoff, (end - addr) >> PAGE_SHIFT)) return 0; if (end == prev->vm_start) { - spin_lock(lock); - prev->vm_start = addr; - prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; - spin_unlock(lock); + move_vma_start(prev, addr); return 1; } } @@ -891,19 +900,16 @@ int expand_stack(struct vm_area_struct * */ address += 4 + PAGE_SIZE - 1; address &= PAGE_MASK; - spin_lock(&vma->vm_mm->page_table_lock); grow = (address - vma->vm_end) >> PAGE_SHIFT; /* Overcommit.. */ if (security_vm_enough_memory(grow)) { - spin_unlock(&vma->vm_mm->page_table_lock); return -ENOMEM; } if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur || ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { - spin_unlock(&vma->vm_mm->page_table_lock); vm_unacct_memory(grow); return -ENOMEM; } @@ -911,7 +917,6 @@ int expand_stack(struct vm_area_struct * vma->vm_mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) vma->vm_mm->locked_vm += grow; - spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -945,19 +950,16 @@ int expand_stack(struct vm_area_struct * * the spinlock only before relocating the vma range ourself. */ address &= PAGE_MASK; - spin_lock(&vma->vm_mm->page_table_lock); grow = (vma->vm_start - address) >> PAGE_SHIFT; /* Overcommit.. */ if (security_vm_enough_memory(grow)) { - spin_unlock(&vma->vm_mm->page_table_lock); return -ENOMEM; } if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { - spin_unlock(&vma->vm_mm->page_table_lock); vm_unacct_memory(grow); return -ENOMEM; } @@ -966,7 +968,6 @@ int expand_stack(struct vm_area_struct * vma->vm_mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) vma->vm_mm->locked_vm += grow; - spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -1129,8 +1130,6 @@ static void unmap_region(struct mm_struc /* * Create a list of vma's touched by the unmap, removing them from the mm's * vma list as we go.. - * - * Called with the page_table_lock held. 
*/ static void detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, @@ -1174,8 +1173,7 @@ int split_vma(struct mm_struct * mm, str if (new_below) { new->vm_end = addr; - vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); + move_vma_start(vma, addr); } else { vma->vm_end = addr; new->vm_start = addr; @@ -1254,8 +1252,8 @@ int do_munmap(struct mm_struct *mm, unsi /* * Remove the vma's, and unmap the actual pages */ - spin_lock(&mm->page_table_lock); detach_vmas_to_be_unmapped(mm, mpnt, prev, end); + spin_lock(&mm->page_table_lock); unmap_region(mm, mpnt, prev, start, end); spin_unlock(&mm->page_table_lock); @@ -1404,7 +1402,7 @@ void exit_mmap(struct mm_struct *mm) vma = mm->mmap; mm->mmap = mm->mmap_cache = NULL; mm->mm_rb = RB_ROOT; - mm->rss = 0; + zero_rss(mm); mm->total_vm = 0; mm->locked_vm = 0; diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/page_alloc.c 901-mjb1.1/mm/page_alloc.c --- 000-virgin/mm/page_alloc.c Wed Aug 13 20:24:33 2003 +++ 901-mjb1.1/mm/page_alloc.c Wed Aug 13 21:09:02 2003 @@ -224,6 +224,8 @@ static inline void free_pages_check(cons bad_page(function, page); if (PageDirty(page)) ClearPageDirty(page); + if (PageAnon(page)) + ClearPageAnon(page); } /* @@ -558,7 +560,11 @@ __alloc_pages(unsigned int gfp_mask, uns min = 1UL << order; for (i = 0; zones[i] != NULL; i++) { struct zone *z = zones[i]; - + + if ((__GFP_NODE_STRICT & gfp_mask) && + (pfn_to_nid(z->zone_start_pfn) != numa_node_id())) + continue; + min += z->pages_low; if (z->free_pages >= min || (!wait && z->free_pages >= z->pages_high)) { diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/rmap.c 901-mjb1.1/mm/rmap.c --- 000-virgin/mm/rmap.c Wed Jul 2 21:59:16 2003 +++ 901-mjb1.1/mm/rmap.c Wed Aug 13 20:51:53 2003 @@ -102,6 +102,136 @@ pte_chain_encode(struct pte_chain *pte_c **/ /** + * find_pte - Find a pte pointer given a vma and a struct page. + * @vma: the vma to search + * @page: the page to find + * + * Determine if this page is mapped in this vma. If it is, map and rethrn + * the pte pointer associated with it. Return null if the page is not + * mapped in this vma for any reason. + * + * This is strictly an internal helper function for the object-based rmap + * functions. + * + * It is the caller's responsibility to unmap the pte if it is returned. + */ +static inline pte_t * +find_pte(struct vm_area_struct *vma, struct page *page, unsigned long *addr) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long loffset; + unsigned long address; + + loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT); + if (address < vma->vm_start || address >= vma->vm_end) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) + goto out; + + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap; + + if (addr) + *addr = address; + + return pte; + +out_unmap: + pte_unmap(pte); +out: + return NULL; +} + +/** + * page_referenced_obj_one - referenced check for object-based rmap + * @vma: the vma to look in. + * @page: the page we're working on. + * + * Find a pte entry for a page/vma pair, then check and clear the referenced + * bit. + * + * This is strictly a helper function for page_referenced_obj. 
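find_pte() above recovers a page's user virtual address from the vma rather than from a pte_chain: the page's file index is converted to page units, offset by vm_pgoff, and added to vm_start. A short worked example of that index-to-address computation (PAGE_CACHE_SHIFT equal to PAGE_SHIFT, i.e. 4K page-cache pages, is assumed):

    #include <stdio.h>

    #define PAGE_SHIFT        12
    #define PAGE_CACHE_SHIFT  12   /* assumed equal to PAGE_SHIFT here */

    /* same computation as in find_pte() above */
    static unsigned long page_address_in_vma(unsigned long index,
                                             unsigned long vm_start,
                                             unsigned long vm_pgoff)
    {
        unsigned long loffset = index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

        return vm_start + ((loffset - vm_pgoff) << PAGE_SHIFT);
    }

    int main(void)
    {
        /* a vma maps file pages 16.. at 0x08050000; where does file page 20 live? */
        unsigned long addr = page_address_in_vma(20, 0x08050000UL, 16);

        printf("file page 20 -> %#lx\n", addr);   /* 0x08054000 */
        return 0;
    }
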
+ */ +static int +page_referenced_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + pte_t *pte; + int referenced = 0; + + if (!spin_trylock(&mm->page_table_lock)) + return 1; + + pte = find_pte(vma, page, NULL); + if (pte) { + if (ptep_test_and_clear_young(pte)) + referenced++; + pte_unmap(pte); + } + + spin_unlock(&mm->page_table_lock); + return referenced; +} + +/** + * page_referenced_obj_one - referenced check for object-based rmap + * @page: the page we're checking references on. + * + * For an object-based mapped page, find all the places it is mapped and + * check/clear the referenced flag. This is done by following the page->mapping + * pointer, then walking the chain of vmas it holds. It returns the number + * of references it found. + * + * This function is only called from page_referenced for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * assume a reference count of 1. + */ +static int +page_referenced_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int referenced = 0; + + if (!page->pte.mapcount) + return 0; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return 1; + + list_for_each_entry(vma, &mapping->i_mmap, shared) + referenced += page_referenced_obj_one(vma, page); + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) + referenced += page_referenced_obj_one(vma, page); + + up(&mapping->i_shared_sem); + + return referenced; +} + +/** * page_referenced - test if the page was referenced * @page: the page to test * @@ -120,6 +250,10 @@ int page_referenced(struct page * page) if (TestClearPageReferenced(page)) referenced++; + if (!PageAnon(page)) { + referenced += page_referenced_obj(page); + goto out; + } if (PageDirect(page)) { pte_t *pte = rmap_ptep_map(page->pte.direct); if (ptep_test_and_clear_young(pte)) @@ -153,6 +287,7 @@ int page_referenced(struct page * page) __pte_chain_free(pc); } } +out: return referenced; } @@ -175,6 +310,21 @@ page_add_rmap(struct page *page, pte_t * pte_chain_lock(page); + /* + * If this is an object-based page, just count it. We can + * find the mappings by walking the object vma chain for that object. + */ + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + inc_page_state(nr_mapped); + page->pte.mapcount++; + goto out; + } + if (page->pte.direct == 0) { page->pte.direct = pte_paddr; SetPageDirect(page); @@ -231,8 +381,25 @@ void page_remove_rmap(struct page *page, pte_chain_lock(page); if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? */ + goto out_unlock; + /* + * If this is an object-based page, just uncount it. We can + * find the mappings by walking the object vma chain for that object. + */ + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + BUG(); + page->pte.mapcount--; + if (!page->pte.mapcount) + dec_page_state(nr_mapped); + goto out_unlock; + } + if (PageDirect(page)) { if (page->pte.direct == pte_paddr) { page->pte.direct = 0; @@ -279,6 +446,102 @@ out_unlock: } /** + * try_to_unmap_obj - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Determine whether a page is mapped in a given vma and unmap it if it's found. + * + * This function is strictly a helper function for try_to_unmap_obj. 
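For file-backed (non-anon) pages the patch replaces pte_chains with a bare mapcount, and nr_mapped is only touched on the 0->1 and 1->0 transitions so the global statistic keeps counting pages, not individual mappings. A toy illustration of that counting discipline (the names are stand-ins, not the kernel's):

    #include <stdio.h>

    static long nr_mapped;          /* stand-in for the global page state */

    struct page { unsigned long mapcount; };

    static void map_page(struct page *page)
    {
        if (!page->mapcount)
            nr_mapped++;            /* first mapping of this page */
        page->mapcount++;
    }

    static void unmap_page(struct page *page)
    {
        page->mapcount--;
        if (!page->mapcount)
            nr_mapped--;            /* last mapping just went away */
    }

    int main(void)
    {
        struct page p = { 0 };

        map_page(&p);               /* nr_mapped: 1 */
        map_page(&p);               /* still 1: same page, second mapping */
        unmap_page(&p);             /* still 1 */
        unmap_page(&p);             /* back to 0 */
        printf("nr_mapped = %ld, mapcount = %lu\n", nr_mapped, p.mapcount);
        return 0;
    }
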
+ */ +static inline int +try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte; + pte_t pteval; + int ret = SWAP_AGAIN; + + if (!spin_trylock(&mm->page_table_lock)) + return ret; + + pte = find_pte(vma, page, &address); + if (!pte) + goto out; + + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + goto out_unmap; + } + + flush_cache_page(vma, address); + pteval = ptep_get_and_clear(pte); + flush_tlb_page(vma, address); + + if (pte_dirty(pteval)) + set_page_dirty(page); + + if (!page->pte.mapcount) + BUG(); + + mm->rss--; + page->pte.mapcount--; + page_cache_release(page); + +out_unmap: + pte_unmap(pte); + +out: + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap_obj - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * This function is only called from try_to_unmap for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * return a temporary error. + */ +static int +try_to_unmap_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int ret = SWAP_AGAIN; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return ret; + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + ret = try_to_unmap_obj_one(vma, page); + if (ret == SWAP_FAIL || !page->pte.mapcount) + goto out; + } + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + ret = try_to_unmap_obj_one(vma, page); + if (ret == SWAP_FAIL || !page->pte.mapcount) + goto out; + } + +out: + up(&mapping->i_shared_sem); + return ret; +} + +/** * try_to_unmap_one - worker function for try_to_unmap * @page: page to unmap * @ptep: page table entry to unmap from page @@ -360,7 +623,7 @@ static int try_to_unmap_one(struct page if (pte_dirty(pte)) set_page_dirty(page); - mm->rss--; + dec_rss(mm, page); page_cache_release(page); ret = SWAP_SUCCESS; @@ -397,6 +660,15 @@ int try_to_unmap(struct page * page) if (!page->mapping) BUG(); + /* + * If it's an object-based page, use the object vma chain to find all + * the mappings. + */ + if (!PageAnon(page)) { + ret = try_to_unmap_obj(page); + goto out; + } + if (PageDirect(page)) { ret = try_to_unmap_one(page, page->pte.direct); if (ret == SWAP_SUCCESS) { @@ -452,9 +724,112 @@ int try_to_unmap(struct page * page) } } out: - if (!page_mapped(page)) + if (!page_mapped(page)) { dec_page_state(nr_mapped); + ret = SWAP_SUCCESS; + } return ret; +} + +/** + * page_convert_anon - Convert an object-based mapped page to pte_chain-based. + * @page: the page to convert + * + * Find all the mappings for an object-based page and convert them + * to 'anonymous', ie create a pte_chain and store all the pte pointers there. + * + * This function takes the address_space->i_shared_sem, sets the PageAnon flag, + * then sets the mm->page_table_lock for each vma and calls page_add_rmap. This + * means there is a period when PageAnon is set, but still has some mappings + * with no pte_chain entry. This is in fact safe, since page_remove_rmap will + * simply not find it. try_to_unmap might erroneously return success, but it + * will never be called because the page_convert_anon() caller has locked the + * page. 
+ * + * page_referenced() may fail to scan all the appropriate pte's and may return + * an inaccurate result. This is so rare that it does not matter. + */ +int page_convert_anon(struct page *page) +{ + struct address_space *mapping; + struct vm_area_struct *vma; + struct pte_chain *pte_chain = NULL; + pte_t *pte; + int err = 0; + + mapping = page->mapping; + if (mapping == NULL) + goto out; /* truncate won the lock_page() race */ + + down(&mapping->i_shared_sem); + pte_chain_lock(page); + + /* + * Has someone else done it for us before we got the lock? + * If so, pte.direct or pte.chain has replaced pte.mapcount. + */ + if (PageAnon(page)) { + pte_chain_unlock(page); + goto out_unlock; + } + + SetPageAnon(page); + if (page->pte.mapcount == 0) { + pte_chain_unlock(page); + goto out_unlock; + } + /* This is gonna get incremented by page_add_rmap */ + dec_page_state(nr_mapped); + page->pte.mapcount = 0; + + /* + * Now that the page is marked as anon, unlock it. page_add_rmap will + * lock it as necessary. + */ + pte_chain_unlock(page); + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + if (!pte_chain) { + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&vma->vm_mm->page_table_lock); + pte = find_pte(vma, page, NULL); + if (pte) { + /* Make sure this isn't a duplicate */ + page_remove_rmap(page, pte); + pte_chain = page_add_rmap(page, pte, pte_chain); + pte_unmap(pte); + } + spin_unlock(&vma->vm_mm->page_table_lock); + } + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if (!pte_chain) { + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&vma->vm_mm->page_table_lock); + pte = find_pte(vma, page, NULL); + if (pte) { + /* Make sure this isn't a duplicate */ + page_remove_rmap(page, pte); + pte_chain = page_add_rmap(page, pte, pte_chain); + pte_unmap(pte); + } + spin_unlock(&vma->vm_mm->page_table_lock); + } + +out_unlock: + pte_chain_free(pte_chain); + up(&mapping->i_shared_sem); +out: + return err; } /** diff -urpN -X /home/fletch/.diff.exclude 000-virgin/mm/swapfile.c 901-mjb1.1/mm/swapfile.c --- 000-virgin/mm/swapfile.c Wed Aug 13 20:24:33 2003 +++ 901-mjb1.1/mm/swapfile.c Wed Aug 13 20:51:53 2003 @@ -386,9 +386,10 @@ static void unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) { - vma->vm_mm->rss++; + inc_rss(vma->vm_mm, page); get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + SetPageAnon(page); *pte_chainp = page_add_rmap(page, dir, *pte_chainp); swap_free(entry); } @@ -497,6 +498,7 @@ static int unuse_process(struct mm_struc /* * Go through process' page directory. 
*/ + down_read(&mm->mmap_sem); spin_lock(&mm->page_table_lock); for (vma = mm->mmap; vma; vma = vma->vm_next) { pgd_t * pgd = pgd_offset(mm, vma->vm_start); @@ -504,6 +506,7 @@ static int unuse_process(struct mm_struc break; } spin_unlock(&mm->page_table_lock); + up_read(&mm->mmap_sem); pte_chain_free(pte_chain); return 0; } diff -urpN -X /home/fletch/.diff.exclude 000-virgin/scripts/Makefile.build 901-mjb1.1/scripts/Makefile.build --- 000-virgin/scripts/Makefile.build Sat Jun 14 18:37:41 2003 +++ 901-mjb1.1/scripts/Makefile.build Wed Aug 13 20:51:56 2003 @@ -119,7 +119,16 @@ cmd_cc_i_c = $(CPP) $(c_flags) - quiet_cmd_cc_o_c = CC $(quiet_modtag) $@ ifndef CONFIG_MODVERSIONS -cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $< +new1_c_flags = $(c_flags:-I%=-I$(TOPDIR)/%) +new2_c_flags = $(new1_c_flags:-Wp%=) +PWD = $(TOPDIR) + +quiet_cmd_cc_o_c = CC $(quiet_modtag) $@ +cmd_cc_o_c = $(CC) $(c_flags) -E -o $@ $< \ + && cd $(dir $<) \ + && $(CC) $(new2_c_flags) -c -o $(notdir $@) $(notdir $<) \ + && cd $(TOPDIR) +#cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $< else # When module versioning is enabled the following steps are executed: @@ -134,12 +143,21 @@ else # replace the unresolved symbols __crc_exported_symbol with # the actual value of the checksum generated by genksyms -cmd_cc_o_c = $(CC) $(c_flags) -c -o $(@D)/.tmp_$(@F) $< +new1_c_flags = $(c_flags:-I%=-I$(TOPDIR)/%) +new2_c_flags = $(new1_c_flags:-Wp%=) +PWD = $(TOPDIR) + +quiet_cmd_cc_o_c = CC $(quiet_modtag) $@ +cmd_cc_o_c = $(CC) $(c_flags) -E -o $@ $< \ + && cd $(dir $<) \ + && $(CC) $(new2_c_flags) -c -o .tmp_$(@F) $(notdir $<) \ + && cd $(TOPDIR) +#cmd_cc_o_c = $(CC) $(c_flags) -c -o $(@D)/.tmp_$(@F) $< cmd_modversions = \ if ! $(OBJDUMP) -h $(@D)/.tmp_$(@F) | grep -q __ksymtab; then \ mv $(@D)/.tmp_$(@F) $@; \ else \ - $(CPP) -D__GENKSYMS__ $(c_flags) $< \ + $(CPP) -D__GENKSYMS__ $(new2_c_flags) $< \ | $(GENKSYMS) \ > $(@D)/.tmp_$(@F:.o=.ver); \ \