diff -urpN -X /home/fletch/.diff.exclude 00-virgin/Documentation/filesystems/proc.txt 90-mjb/Documentation/filesystems/proc.txt --- 00-virgin/Documentation/filesystems/proc.txt Mon Jan 13 21:09:08 2003 +++ 90-mjb/Documentation/filesystems/proc.txt Sat Feb 1 22:09:10 2003 @@ -37,6 +37,7 @@ Table of Contents 2.8 /proc/sys/net/ipv4 - IPV4 settings 2.9 Appletalk 2.10 IPX + 2.11 /proc/sys/sched - scheduler tunables ------------------------------------------------------------------------------ Preface @@ -1662,6 +1663,104 @@ IPX. The /proc/net/ipx_route table holds a list of IPX routes. For each route it gives the destination network, the router node (or Directly) and the network address of the router (or Connected) for internal networks. + +2.11 /proc/sys/sched - scheduler tunables +----------------------------------------- + +Useful knobs for tuning the scheduler live in /proc/sys/sched. + +child_penalty +------------- + +Percentage of the parent's sleep_avg that children inherit. sleep_avg is +a running average of the time a process spends sleeping. Tasks with high +sleep_avg values are considered interactive and given a higher dynamic +priority and a larger timeslice. You typically want this some value just +under 100. + +exit_weight +----------- + +When a CPU hog task exits, its parent's sleep_avg is reduced by a factor of +exit_weight against the exiting task's sleep_avg. + +interactive_delta +----------------- + +If a task is "interactive" it is reinserted into the active array after it +has expired its timeslice, instead of being inserted into the expired array. +How "interactive" a task must be in order to be deemed interactive is a +function of its nice value. This interactive limit is scaled linearly by nice +value and is offset by the interactive_delta. + +max_sleep_avg +------------- + +max_sleep_avg is the largest value (in ms) stored for a task's running sleep +average. 
The larger this value, the longer a task needs to sleep to be +considered interactive (maximum interactive bonus is a function of +max_sleep_avg). + +max_timeslice +------------- + +Maximum timeslice, in milliseconds. This is the value given to tasks of the +highest dynamic priority. + +min_timeslice +------------- + +Minimum timeslice, in milliseconds. This is the value given to tasks of the +lowest dynamic priority. Every task gets at least this slice of the processor +per array switch. + +parent_penalty +-------------- + +Percentage of the parent's sleep_avg that it retains across a fork(). +sleep_avg is a running average of the time a process spends sleeping. Tasks +with high sleep_avg values are considered interactive and given a higher +dynamic priority and a larger timeslice. Normally, this value is 100 and thus +tasks retain their sleep_avg on fork. If you want to punish interactive +tasks for forking, set this below 100. + +prio_bonus_ratio +---------------- + +Middle percentage of the priority range that tasks can receive as a dynamic +priority. The default value of 25% ensures that nice values at the +extremes are still enforced. For example, nice +19 interactive tasks will +never be able to preempt a nice 0 CPU hog. Setting this higher will increase +the size of the priority range the tasks can receive as a bonus. Setting +this lower will decrease this range, making the interactivity bonus less +apparent and user nice values more applicable. + +starvation_limit +---------------- + +Sufficiently interactive tasks are reinserted into the active array when they +run out of timeslice. Normally, tasks are inserted into the expired array. +Reinserting interactive tasks into the active array allows them to remain +runnable, which is important to interactive performance. This could starve +expired tasks, however, since the interactive task could prevent the array +switch. To prevent starving the tasks on the expired array for too long,
the +starvation_limit is the longest (in ms) we will let the expired array starve +at the expense of reinserting interactive tasks back into active. Higher +values here give more preference to running interactive tasks, at the expense +of expired tasks. Lower values provide more fair scheduling behavior, at the +expense of interactivity. The units are in milliseconds. + +idle_node_rebalance_ratio +------------------------- + +On NUMA machines, we normally rebalance within nodes, but we also rebalance +globally every N idle rebalance ticks, where N = idle_node_rebalance_ratio. + +busy_node_rebalance_ratio +------------------------- + +On NUMA machines, we normally rebalance within nodes, but we also rebalance +globally every N busy rebalance ticks, where N = busy_node_rebalance_ratio. ------------------------------------------------------------------------------ Summary diff -urpN -X /home/fletch/.diff.exclude 00-virgin/Documentation/i386/gdb-serial.txt 90-mjb/Documentation/i386/gdb-serial.txt --- 00-virgin/Documentation/i386/gdb-serial.txt Wed Dec 31 16:00:00 1969 +++ 90-mjb/Documentation/i386/gdb-serial.txt Sat Feb 1 22:09:06 2003 @@ -0,0 +1,386 @@ +Version +======= + +This version of the gdbstub package was developed and tested on +kernel version 2.3.48. It will not install on a 2.2 kernel. It may +not work on earlier versions of 2.3 kernels. It is possible that +it will continue to work on later versions of 2.3 and then +versions of 2.4 (I hope). + + +Debugging Setup +=============== + +Designate one machine as the "development" machine. This is the +machine on which you run your compiles and which has your source +code for the kernel. Designate a second machine as the "target" +machine. This is the machine that will run your experimental +kernel. + +The two machines will be connected together via a serial line out +one or the other of the COM ports of the PC. You will need a modem +eliminator and the appropriate cables.
+ +On the DEVELOPMENT machine you need to apply the patch for the gdb +hooks. You have probably already done that if you are reading this +file. + +On your DEVELOPMENT machine, go to your kernel source directory and +do "make menuconfig". Go down to the kernel hacking menu item and +open it up. Enable the kernel gdb stub code by selecting that item. + +Save and exit the menuconfig program. Then do "make clean" and +"make bzImage" (or whatever target you want to make). This gets +the kernel compiled with the "-g" option set -- necessary for +debugging. + +You have just built the kernel on your DEVELOPMENT machine that you +intend to run on your TARGET machine. + +To install this new kernel, use the following installation procedure. +Remember, you are on the DEVELOPMENT machine patching the kernel source +for the kernel that you intend to run on the TARGET machine. + +Copy this kernel to your target machine using your usual procedures. +I usually arrange to copy development:/usr/src/linux/arch/i386/boot/zImage +to /vmlinuz on the TARGET machine via a LAN based NFS access. That is, +I run the cp command on the target and copy from the development machine +via the LAN. Run Lilo on the new kernel on the target machine so that it +will boot! Then boot the kernel on the target machine. + +There is a utility program named "gdbstart" in the +development:/usr/src/linux/arch/i386/kernel directory. +You should copy this program over to your target machine, probably into +/sbin. This utility program is run on the target machine to +activate the kernel hooks for the debugger. It is invoked as follows: + + gdbstart [-s speed] [-t tty-dev] + defaults: /dev/ttyS0 with speed unmodified by gdbstart + +Don't run the program just yet. We'll get to that in a bit. + +Decide on which tty port you want the machines to communicate, then +cable them up back-to-back using the null modem. COM1 is /dev/ttyS0 +and COM2 is /dev/ttyS1.
+ +On the DEVELOPMENT machine, create a file called .gdbinit in the +directory /usr/src/linux. An example .gdbinit file looks like this: + +define rmt +set remotebaud 38400 +target remote /dev/ttyS0 +end + +Assuming that you added my gdbinit stuff to your .gdbinit, edit .gdbinit +and find the section that looks like this: + + define rmt + set remotebaud 38400 + target remote /dev/ttyS0 + end + +Change the "target" definition so that it specifies the tty port that +you intend to use. Change the "remotebaud" definition to match the +data rate that you are going to use for the com line. + +On the TARGET machine I find it helpful to create shell script file +named "debug" in the root home directory with the following contents: + + gdbstart -s 38400 -t /dev/ttyS0 < + EOF + +This runs the gdbstart program and gives it the carriage return that +it prompts for. This sets the data rate from the target machine's side. + +You are now ready to try it out. + +On your TARGET machine, freshly rebooted with your gdbstub-equipped +kernel, type "debug" in the root home directory. The system will appear +to hang with some messages on the screen from the debug stub. What +it is doing is waiting for contact from the development machine. + +On your DEVELOPMENT machine, cd /usr/src/linux and enter "gdb vmlinux". +When gdb gets the symbols loaded and prompts you, enter "rmt" (that's +the macro from the .gdbinit file that you just edited). If everything +is working correctly you should see gdb print out a few lines indicating +that a breakpoint has been taken. It will actually show a line of +code in the target kernel inside the gdbstub activation code. + +The gdb interaction should look something like this: + + linux-dev:/usr/src/linux# gdb vmlinux + GDB is free software and you are welcome to distribute copies of it + under certain conditions; type "show copying" to see the conditions. + There is absolutely no warranty for GDB; type "show warranty" for details. 
+ GDB 4.15.1 (i486-slackware-linux), + Copyright 1995 Free Software Foundation, Inc... + (gdb) rmt + breakpoint () at i386-stub.c:750 + 750 } + (gdb) + + +You can now use whatever gdb commands you like to set breakpoints. +Enter "continue" to start your target machine executing again. At this +point the target system will run at full speed until it encounters +your breakpoint or gets a segment violation in the kernel, or whatever. + + +Triggering gdbstub at Kernel Boot Time +====================================== + +The gdbstub patch now has the ability for gdb to connect to the kernel during +bootup (as opposed to waiting for the system to come all the way up and then +running the gdbstart program on the target machine). This new functionality was +added by Scott Foehner at SGI. + +To force a kernel that has been compiled with gdbstub to pause during the boot +process and wait for a connection from gdb, the parameter "gdb" should be passed +to the kernel. This can be done by typing "gdb" after the name of the kernel +on the LILO command line. The patch defaults to use ttyS1 at a baud rate of +38400. These parameters can be changed by using "gdbttyS=<port number>" and +"gdbbaud=<baud rate>" on the command line. + +Example: + +LILO boot: linux gdb gdbttyS=1 gdbbaud=38400 + +Note that this command is entered on the TARGET machine as it is booting +the kernel that was compiled on the DEVELOPMENT machine. + +An alternate approach is to place a line in the /etc/lilo.conf file on +your TARGET machine. Under the heading for the kernel that you intend +to boot, place a line that looks like this: + + append = "gdb gdbttyS=1 gdbbaud=38400" + +This will cause the kernel to enter the gdbstub automatically at boot +time. + +BE SURE to run "lilo" after changing the /etc/lilo.conf file. + + +The "gdbstart" Program +===================== + +This utility program is used to set up the com port and data rate +for the connection from the target system to the development system.
+Its usage has been described above. + +This version of the patch uses the same tty ioctl for kernel versions +2.0.30 onwards. Thus, the gdbstart utility does not need to be re-compiled +to install the patch in a later version of the kernel. The ioctl added +to the kernel for this purpose is far enough "off the end" of existing +ioctls (as of 2.1.120) that it should not interfere with any new kernel +tty ioctls for quite some time (famous last words). + +The source for the gdbstart program resides in the arch/i386/kernel directory. + + +Debugging hints +=============== + +You can break into the target machine at any time from the development +machine by typing ^C. If the target machine has interrupts enabled +this will stop it in the kernel and enter the debugger. + +There is unfortunately no way of breaking into the kernel if it is +in a loop with interrupts disabled, so if this happens to you then +you need to place exploratory breakpoints or printk's into the kernel +to find out where it is looping. + +There is a copy of an e-mail in the kgdb distribution directory which +describes how to create an NMI on an ISA bus machine using a paper +clip. I have a sophisticated version of this made by wiring a push +button switch into a PC104/ISA bus adapter card. The adapter card +nicely furnishes wire wrap pins for all the ISA bus signals. + +When you are done debugging the kernel on the target machine it is +a good idea to leave it in a running state. This makes reboots +faster, bypassing the fsck. So do a gdb "continue" as the last gdb +command if this is possible. To terminate gdb itself on the development +machine and leave the target machine running, type ^Z to suspend gdb +and then kill it with "kill %1" or something similar. + +If gdbstub Does Not Work +======================== + +If it doesn't work, you will have to troubleshoot it. Do the easy things +first like double checking your cabling and data rates. 
You might +try some non-kernel based programs to see if the back-to-back connection +works properly. Just something simple like cat /etc/hosts >/dev/ttyS0 +on one machine and cat /dev/ttyS0 on the other will tell you if you +can send data from one machine to the other. There is no point in tearing +out your hair in the kernel if the line doesn't work. + +All of the real action takes place in the file +/usr/src/linux/arch/i386/kernel/gdbstub.c. That is the code on the target +machine that interacts with gdb on the development machine. In gdb you can +turn on a debug switch with the following command: + + set remotedebug + +This will print out the protocol messages that gdb is exchanging with +the target machine. + +Another place to look is /usr/src/linux/drivers/char/gdbserial.c +That is the code that talks to the serial port on the target side. +There might be a problem there. + +If you are really desperate you can use printk debugging in the +gdbstub code in the target kernel until you get it working. In particular, +there is a global variable in /usr/src/linux/arch/i386/kernel/gdbstub.c +named "remote_debug". Compile your kernel with this set to 1, rather +than 0 and the debug stub will print out lots of stuff as it does +what it does. + + +Debugging Loadable Modules +========================== + +This technique comes courtesy of Edouard Parmelan + + +When you run gdb, enter the command + +source gdbinit-modules + +This will read in a file of gdb macros that was installed in your +kernel source directory when kgdb was installed. This file implements +the following commands: + +mod-list + Lists the loaded modules in the form + +mod-print-symbols + Prints all the symbols in the indicated module. + +mod-add-symbols + Loads the symbols from the object file and associates them + with the indicated module. + +After you have loaded the module that you want to debug, use the command +mod-list to find the address of your module.
Then use that +address in the mod-add-symbols command to load your module's symbols. +From that point onward you can debug your module as if it were a part +of the kernel. + +The file gdbinit-modules also contains a command named mod-add-lis as +an example of how to construct a command of your own to load your +favorite module. The idea is to "can" the pathname of the module +in the command so you don't have to type so much. + +Threads +======= + +Each process in a target machine is seen as a gdb thread. gdb thread related +commands (info threads, thread n) can be used. + +ia-32 hardware breakpoints +========================== + +gdb stub contains support for hardware breakpoints using debugging features +of ia-32(x86) processors. These breakpoints do not need code modification. +They use debugging registers. 4 hardware breakpoints are available in ia-32 +processors. + +Each hardware breakpoint can be of one of the following three types. +1. Execution breakpoint - An Execution breakpoint is triggered when code at the + breakpoint address is executed. + + As a limited number of hardware breakpoints are available, it is advisable + to use software breakpoints ( break command ) instead of execution + hardware breakpoints, unless modification of code is to be avoided. + +2. Write breakpoint - A write breakpoint is triggered when memory location at the + breakpoint address is written. + + A write breakpoint can be placed for data of variable length. Length of a write + breakpoint indicates length of the datatype to be watched. Length is 1 + for 1 byte data, 2 for 2 byte data, 3 for 4 byte data. + +3. Access breakpoint - An access breakpoint is triggered when memory location at + the breakpoint address is either read or written. + + Access breakpoints also have lengths similar to write breakpoints. + +IO breakpoints in ia-32 are not supported.
+ +Since gdb stub at present does not use the protocol used by gdb for hardware +breakpoints, hardware breakpoints are accessed through gdb macros. gdb macros +for hardware breakpoints are described below. + +hwebrk - Places an execution breakpoint + hwebrk breakpointno address +hwwbrk - Places a write breakpoint + hwwbrk breakpointno length address +hwabrk - Places an access breakpoint + hwabrk breakpointno length address +hwrmbrk - Removes a breakpoint + hwrmbrk breakpointno +exinfo - Tells whether a software or hardware breakpoint has occurred. + Prints number of the hardware breakpoint if a hardware breakpoint has + occurred. + +Arguments required by these commands are as follows +breakpointno - 0 to 3 +length - 1 to 3 +address - Memory location in hex digits ( without 0x ) e.g c015e9bc + +MP support +========== + +When a breakpoint occurs or user issues a break ( Ctrl + C ) to gdb client, +all the processors are forced to enter the debugger. Current thread +corresponds to the thread running on the processor where breakpoint occurred. +Threads running on other processor(s) appear similar to other non running +threads in the 'info threads' output. + +ia-32 hardware debugging registers on all processors are set to same values. +Hence any hardware breakpoints may occur on any processor. + +gdb troubleshooting +=================== + +1. gdb hangs +Kill it. Restart gdb. Connect to target machine. + +2. gdb cannot connect to target machine (after killing a gdb and restarting +another) +If the target machine was not inside debugger when you killed gdb, gdb cannot +connect because the target machine won't respond. +In this case echo "Ctrl+C"(ascii 3) in the serial line. +e.g. echo -e "\003" > /dev/ttyS1 +This forces that target machine into debugger after which you can connect. + +3. gdb cannot connect even after echoing Ctrl+C into serial line +Try changing serial line settings min to 1 and time to 0 +e.g.
stty min 1 time 0 < /dev/ttyS1 +Try echoing again + +check serial line speed and set it to correct value if required +e.g. stty ispeed 115200 ospeed 115200 < /dev/ttyS1 + +Final Items +=========== + +I picked up this code from Dave Grothe and enhanced it. + +If you make some really cool modification to this stuff, or if you +fix a bug, please let me know. + +Amit S. Kale + + +(First kgdb by David Grothe ) + +(modified by Tigran Aivazian ) + Putting gdbstub into the kernel config menu. + +(modified by Scott Foehner ) + Hooks for entering gdbstub at boot time. + +(modified by Amit S. Kale ) + Threads, ia-32 hw debugging, mp support, console support, + nmi watchdog handling. diff -urpN -X /home/fletch/.diff.exclude 00-virgin/Documentation/sysrq.txt 90-mjb/Documentation/sysrq.txt --- 00-virgin/Documentation/sysrq.txt Thu Jan 2 22:04:57 2003 +++ 90-mjb/Documentation/sysrq.txt Sat Feb 1 22:09:06 2003 @@ -73,6 +73,8 @@ On other - If you know of the key combos 'l' - Send a SIGKILL to all processes, INCLUDING init. (Your system will be non-functional after this.) +'g' - Enter the kernel debugger (if configured and supported). + 'h' - Will display help ( actually any other key than those listed above will display help. 
but 'h' is easy to remember :-) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/Makefile 90-mjb/Makefile --- 00-virgin/Makefile Fri Jan 17 09:18:19 2003 +++ 90-mjb/Makefile Sun Feb 2 13:19:32 2003 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 5 SUBLEVEL = 59 -EXTRAVERSION = +EXTRAVERSION = -mjb3 # *DOCUMENTATION* # To see a list of typical targets execute "make help" @@ -47,7 +47,7 @@ TOPDIR := $(CURDIR) HOSTCC = gcc HOSTCXX = g++ -HOSTCFLAGS = -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer +HOSTCFLAGS = -Wall -Wstrict-prototypes -O2 HOSTCXXFLAGS = -O2 CROSS_COMPILE = @@ -260,8 +260,8 @@ ifdef CONFIG_MODULES export EXPORT_FLAGS := -DEXPORT_SYMTAB endif -ifndef CONFIG_FRAME_POINTER -CFLAGS += -fomit-frame-pointer +ifdef CONFIG_X86_REMOTE_DEBUG +CFLAGS += -g endif # diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/alpha/kernel/time.c 90-mjb/arch/alpha/kernel/time.c --- 00-virgin/arch/alpha/kernel/time.c Thu Jan 2 22:04:58 2003 +++ 90-mjb/arch/alpha/kernel/time.c Sun Feb 2 13:19:26 2003 @@ -51,7 +51,7 @@ u64 jiffies_64; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long wall_jiffies; /* kernel/timer.c */ static int set_rtc_mmss(unsigned long); @@ -106,7 +106,7 @@ void timer_interrupt(int irq, void *dev, alpha_do_profile(regs->pc); #endif - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); /* * Calculate how many ticks have passed since the last update, @@ -138,7 +138,7 @@ void timer_interrupt(int irq, void *dev, state.last_rtc_update = xtime.tv_sec - (tmp ? 
600 : 0); } - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } void @@ -410,18 +410,19 @@ time_init(void) void do_gettimeofday(struct timeval *tv) { - unsigned long sec, usec, lost, flags; + unsigned long sec, usec, lost, seq; unsigned long delta_cycles, delta_usec, partial_tick; - read_lock_irqsave(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); - delta_cycles = rpcc() - state.last_time; - sec = xtime.tv_sec; - usec = (xtime.tv_nsec / 1000); - partial_tick = state.partial_tick; - lost = jiffies - wall_jiffies; + delta_cycles = rpcc() - state.last_time; + sec = xtime.tv_sec; + usec = (xtime.tv_nsec / 1000); + partial_tick = state.partial_tick; + lost = jiffies - wall_jiffies; - read_unlock_irqrestore(&xtime_lock, flags); + } while (seq != fr_read_end(&xtime_lock)); #ifdef CONFIG_SMP /* Until and unless we figure out how to get cpu cycle counters @@ -463,7 +464,7 @@ do_settimeofday(struct timeval *tv) unsigned long delta_usec; long sec, usec; - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* The offset that is added into time in do_gettimeofday above must be subtracted out here to keep a coherent view of the @@ -494,7 +495,7 @@ do_settimeofday(struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/arm/kernel/time.c 90-mjb/arch/arm/kernel/time.c --- 00-virgin/arch/arm/kernel/time.c Thu Jan 2 22:04:58 2003 +++ 90-mjb/arch/arm/kernel/time.c Sun Feb 2 13:19:26 2003 @@ -34,7 +34,7 @@ u64 jiffies_64; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long wall_jiffies; /* this needs a better home */ @@ -147,19 +147,20 @@ static void do_leds(void) void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec, lost; - read_lock_irqsave(&xtime_lock, flags); - usec = gettimeoffset(); - - lost = jiffies - 
wall_jiffies; - if (lost) - usec += lost * USECS_PER_JIFFY; - - sec = xtime.tv_sec; - usec += xtime.tv_nsec / 1000; - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + usec = gettimeoffset(); + + lost = jiffies - wall_jiffies; + if (lost) + usec += lost * USECS_PER_JIFFY; + + sec = xtime.tv_sec; + usec += xtime.tv_nsec / 1000; + } while (seq != fr_read_end(&xtime_lock)); /* usec may have gone up a lot: be safe */ while (usec >= 1000000) { @@ -173,7 +174,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* * This is revolting. We need to set "xtime" correctly. However, the * value in this location is the value at the most recent update of @@ -194,7 +195,7 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } static struct irqaction timer_irq = { diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/Kconfig 90-mjb/arch/i386/Kconfig --- 00-virgin/arch/i386/Kconfig Fri Jan 17 09:18:19 2003 +++ 90-mjb/arch/i386/Kconfig Sat Feb 1 22:21:15 2003 @@ -328,11 +328,6 @@ config X86_ALIGNMENT_16 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 default y -config X86_TSC - bool - depends on MWINCHIP3D || MWINCHIP2 || MCRUSOE || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 - default y - config X86_GOOD_APIC bool depends on MK7 || MPENTIUM4 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 @@ -368,6 +363,11 @@ config X86_SSE2 depends on MK8 || MPENTIUM4 default y +config X86_CMOV + bool + depends on M686 || MPENTIUMII || MPENTIUMIII || MPENTIUM4 || MK8 || MCRUSOE + default y + config HUGETLB_PAGE bool "Huge TLB Page Support" help @@ -474,7 +474,7 @@ config 
NR_CPUS # Common NUMA Features config NUMA bool "Numa Memory Allocation Support" - depends on X86_NUMAQ + depends on (X86_NUMAQ || X86_SUMMIT) config DISCONTIGMEM bool @@ -486,6 +486,11 @@ config HAVE_ARCH_BOOTMEM_NODE depends on NUMA default y +config X86_TSC + bool + depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8) && !X86_NUMAQ + default y + config X86_MCE bool "Machine Check Exception" ---help--- @@ -660,6 +665,44 @@ config HIGHMEM64G endchoice +choice + help + On i386, a process can only virtually address 4GB of memory. This + lets you select how much of that virtual space you would like to + devote to userspace, and how much to the kernel. + + Some userspace programs would like to address as much as possible and + have few demands of the kernel other than it get out of the way. These + users may opt to use the 3.5GB option to give their userspace program + as much room as possible. Due to alignment issues imposed by PAE, + the "3.5GB" option is unavailable if "64GB" high memory support is + enabled. + + Other users (especially those who use PAE) may be running out of + ZONE_NORMAL memory. Those users may benefit from increasing the + kernel's virtual address space size by taking it away from userspace, + which may not need all of its space. An indicator that this is + happening is when /proc/meminfo's "LowFree:" is a small percentage of + "LowTotal:" while "HighFree:" is very large. + + If unsure, say "3GB" + prompt "User address space size" + default 1GB + +config 05GB + bool "3.5 GB" + depends on !HIGHMEM64G + +config 1GB + bool "3 GB" + +config 2GB + bool "2 GB" + +config 3GB + bool "1 GB" +endchoice + config HIGHMEM bool depends on HIGHMEM64G || HIGHMEM4G @@ -679,6 +722,16 @@ config HIGHPTE low memory. Setting this option will put user-space page table entries in high memory.
+config 4K_STACK + bool "Use smaller 4k per-task stacks" + help + This option will shrink the kernel's per-task stack from 8k to + 4k. This will greatly increase your chance of overflowing it. + But, if you use the per-cpu interrupt stacks as well, your chances + go way down. Also try the CONFIG_X86_STACK_CHECK overflow + detection. It is much more reliable than the currently in-kernel + version. + config MATH_EMULATION bool "Math emulation" ---help--- @@ -738,6 +791,25 @@ config MTRR See for more information. +choice + help + This is unrelated to your processor's speed. This variable alters + how often the system is asked to generate timer interrupts. A larger + value can lead to a more responsive system, but also causes extra + overhead from the increased number of context switches. + + If in doubt, leave it at the default of 1000. + + prompt "Kernel HZ" + default 1000HZ + +config 100HZ + bool "100 Hz" + +config 1000HZ + bool "1000 Hz" +endchoice + config HAVE_DEC_LOCK bool depends on (SMP || PREEMPT) && X86_CMPXCHG @@ -1557,6 +1629,17 @@ config DEBUG_SLAB allocation as well as poisoning memory on free to catch use of freed memory. +config X86_REMOTE_DEBUG + bool "KGDB: Remote (serial) kernel debugging with gdb" + +config KGDB_THREAD + bool "KGDB: Thread analysis" + depends on X86_REMOTE_DEBUG + +config GDB_CONSOLE + bool "KGDB: Console messages through gdb" + depends on X86_REMOTE_DEBUG + config DEBUG_IOVIRT bool "Memory mapped I/O debugging" depends on DEBUG_KERNEL @@ -1582,6 +1665,26 @@ config MAGIC_SYSRQ keys are documented in . Don't say Y unless you really know what this hack does. +config EARLY_PRINTK + bool "Early console support" + default n + depends on DEBUG_KERNEL + help + Write kernel log output directly into the VGA buffer or serial port. + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. 
For normal operation + it is not recommended because it looks ugly and doesn't cooperate + with klogd/syslogd or the X server. You should normally say N here, + unless you want to debug such a crash. + + Syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. + config DEBUG_SPINLOCK bool "Spinlock debugging" depends on DEBUG_KERNEL @@ -1612,12 +1715,33 @@ config DEBUG_SPINLOCK_SLEEP noisy if they are called with a spinlock held. config FRAME_POINTER - bool "Compile the kernel with frame pointers" + bool + default y if X86_REMOTE_DEBUG + default n if !X86_REMOTE_DEBUG help If you say Y here the resulting kernel image will be slightly larger and slower, but it will give very useful debugging information. If you don't debug the kernel, you can say N, but we may not be able to solve problems without frame pointers. + +config X86_STACK_CHECK + bool "Detect stack overflows" + depends on FRAME_POINTER + help + Say Y here to have the kernel attempt to detect when the per-task + kernel stack overflows. This is much more robust checking than + the above overflow check, which will only occasionally detect + an overflow. The level of guarantee here is much greater. + + Some older versions of gcc don't handle the -p option correctly. + Kernprof is affected by the same problem, which is described here: + http://oss.sgi.com/projects/kernprof/faq.html#Q9 + + Basically, if you get oopses in __free_pages_ok during boot when + you have this turned on, you need to fix gcc. The Redhat 2.96 + version and gcc-3.x seem to work.
+ + If not debugging a stack overflow problem, say N config X86_EXTRA_IRQS bool diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/Makefile 90-mjb/arch/i386/Makefile --- 00-virgin/arch/i386/Makefile Fri Jan 17 09:18:19 2003 +++ 90-mjb/arch/i386/Makefile Sun Feb 2 13:16:19 2003 @@ -76,6 +76,10 @@ mcore-$(CONFIG_X86_SUMMIT) := mach-defa # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default +ifdef CONFIG_X86_STACK_CHECK +CFLAGS += -p +endif + HEAD := arch/i386/kernel/head.o arch/i386/kernel/init_task.o libs-y += arch/i386/lib/ @@ -89,6 +93,7 @@ drivers-$(CONFIG_OPROFILE) += arch/i386 CFLAGS += $(mflags-y) AFLAGS += $(mflags-y) +AFLAGS_vmlinux.lds.o += -imacros $(TOPDIR)/include/asm-i386/page.h boot := arch/i386/boot diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/boot/compressed/misc.c 90-mjb/arch/i386/boot/compressed/misc.c --- 00-virgin/arch/i386/boot/compressed/misc.c Thu Jan 2 22:04:58 2003 +++ 90-mjb/arch/i386/boot/compressed/misc.c Sat Feb 1 22:18:26 2003 @@ -377,3 +377,7 @@ asmlinkage int decompress_kernel(struct if (high_loaded) close_output_buffer_if_we_run_high(mv); return high_loaded; } + +/* We don't actually check for stack overflows this early. 
*/ +__asm__(".globl mcount ; mcount: ret\n"); + diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/Makefile 90-mjb/arch/i386/kernel/Makefile --- 00-virgin/arch/i386/kernel/Makefile Thu Jan 2 22:04:58 2003 +++ 90-mjb/arch/i386/kernel/Makefile Sun Feb 2 13:18:31 2003 @@ -17,6 +17,7 @@ obj-$(CONFIG_MCA) += mca.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_X86_REMOTE_DEBUG) += gdbstub.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_ACPI) += acpi.o obj-$(CONFIG_ACPI_SLEEP) += acpi_wakeup.o @@ -31,6 +32,17 @@ obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_EDD) += edd.o obj-$(CONFIG_MODULES) += module.o obj-y += sysenter.o +ifdef CONFIG_NUMA +obj-$(CONFIG_X86_SUMMIT) += srat.o +endif + +ifdef CONFIG_X86_REMOTE_DEBUG +GDBSTART=gdbstart +GDBCLEAN= -rm -f gdbstart /sbin/gdbstart +else +GDBSTART= +GDBCLEAN= +endif EXTRA_AFLAGS := -traditional diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/apic.c 90-mjb/arch/i386/kernel/apic.c --- 00-virgin/arch/i386/kernel/apic.c Thu Jan 2 22:04:58 2003 +++ 90-mjb/arch/i386/kernel/apic.c Sat Feb 1 22:14:22 2003 @@ -1038,7 +1038,8 @@ inline void smp_local_timer_interrupt(st * interrupt as well. Thus we cannot inline the local irq ... ] */ -void smp_apic_timer_interrupt(struct pt_regs regs) +struct pt_regs * smp_apic_timer_interrupt(struct pt_regs* regs) __attribute__((regparm(1))); +struct pt_regs * smp_apic_timer_interrupt(struct pt_regs* regs) { int cpu = smp_processor_id(); @@ -1058,14 +1059,16 @@ void smp_apic_timer_interrupt(struct pt_ * interrupt lock, which is the WrongThing (tm) to do. 
*/ irq_enter(); - smp_local_timer_interrupt(&regs); + smp_local_timer_interrupt(regs); irq_exit(); + return regs; } /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -asmlinkage void smp_spurious_interrupt(void) +struct pt_regs * smp_spurious_interrupt(struct pt_regs* regs) __attribute__((regparm(1))); +struct pt_regs * smp_spurious_interrupt(struct pt_regs* regs) { unsigned long v; @@ -1083,13 +1086,15 @@ asmlinkage void smp_spurious_interrupt(v printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", smp_processor_id()); irq_exit(); + return regs; } /* * This interrupt should never happen with our APIC/SMP architecture */ -asmlinkage void smp_error_interrupt(void) +struct pt_regs * smp_error_interrupt(struct pt_regs* regs) __attribute__((regparm(1))); +struct pt_regs * smp_error_interrupt(struct pt_regs* regs) { unsigned long v, v1; @@ -1114,6 +1119,7 @@ asmlinkage void smp_error_interrupt(void printk (KERN_INFO "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); irq_exit(); + return regs; } /* diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/apm.c 90-mjb/arch/i386/kernel/apm.c --- 00-virgin/arch/i386/kernel/apm.c Thu Jan 9 19:15:56 2003 +++ 90-mjb/arch/i386/kernel/apm.c Sun Feb 2 13:19:25 2003 @@ -227,7 +227,7 @@ #include -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern spinlock_t i8253_lock; extern unsigned long get_cmos_time(void); extern void machine_real_restart(unsigned char *, int); @@ -1264,7 +1264,7 @@ static int suspend(int vetoable) printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n"); } /* serialize with the timer interrupt */ - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* protect against access to timer chip registers */ spin_lock(&i8253_lock); @@ -1276,7 +1276,7 @@ static int suspend(int vetoable) ignore_normal_resume = 1; spin_unlock(&i8253_lock); - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); if
(err == APM_NO_ERROR) err = APM_SUCCESS; @@ -1301,10 +1301,10 @@ static void standby(void) int err; /* serialize with the timer interrupt */ - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* If needed, notify drivers here */ get_time_diff(); - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); err = set_system_power_state(APM_STATE_STANDBY); if ((err != APM_SUCCESS) && (err != APM_NO_ERROR)) @@ -1393,9 +1393,9 @@ static void check_events(void) ignore_bounce = 1; if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); set_time(); - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); pm_send_all(PM_RESUME, (void *)0); queue_event(event, NULL); } @@ -1410,9 +1410,9 @@ static void check_events(void) break; case APM_UPDATE_TIME: - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); set_time(); - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); break; case APM_CRITICAL_SUSPEND: diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/cpu/mcheck/p4.c 90-mjb/arch/i386/kernel/cpu/mcheck/p4.c --- 00-virgin/arch/i386/kernel/cpu/mcheck/p4.c Thu Jan 2 22:04:58 2003 +++ 90-mjb/arch/i386/kernel/cpu/mcheck/p4.c Sat Feb 1 22:14:22 2003 @@ -61,7 +61,8 @@ static void intel_thermal_interrupt(stru /* Thermal interrupt handler for this CPU setup */ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; -asmlinkage void smp_thermal_interrupt(struct pt_regs regs) +struct pt_regs * smp_thermal_interrupt(struct pt_regs *regs) __attribute__((regparm(1))); +struct pt_regs* smp_thermal_interrupt(struct pt_regs* regs) { irq_enter(); - vendor_thermal_interrupt(&regs); + vendor_thermal_interrupt(regs); /* regs is already a pointer now */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/entry.S 90-mjb/arch/i386/kernel/entry.S --- 00-virgin/arch/i386/kernel/entry.S Fri Jan 17 09:18:20 2003 +++ 90-mjb/arch/i386/kernel/entry.S Sat Feb 1
22:18:26 2003 @@ -138,8 +138,6 @@ VM_MASK = 0x00020000 .long 1b,2b; \ .previous - - ENTRY(lcall7) pushfl # We get a different stack layout with call # gates, which has to be cleaned up later.. @@ -155,7 +153,7 @@ do_lcall: movl %eax,EFLAGS(%ebp) # movl %edx,EIP(%ebp) # Now we move them to their "normal" places movl %ecx,CS(%ebp) # - andl $-8192, %ebp # GET_THREAD_INFO + GET_THREAD_INFO_WITH_ESP(%ebp) # GET_THREAD_INFO movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution domain call *4(%edx) # Call the lcall7 handler for the domain addl $4, %esp @@ -218,7 +216,7 @@ need_resched: jz restore_all movl $PREEMPT_ACTIVE,TI_PRE_COUNT(%ebp) sti - call schedule + call user_schedule movl $0,TI_PRE_COUNT(%ebp) cli jmp need_resched @@ -300,7 +298,7 @@ work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: - call schedule + call user_schedule cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret @@ -388,17 +386,76 @@ ENTRY(irq_entries_start) vector=vector+1 .endr + +# lets play optimizing compiler... +#ifdef CONFIG_X86_CMOV +#define COND_MOVE cmovnz %esi,%esp; +#else +#define COND_MOVE \ + jz 1f; \ + mov %esi,%esp; \ +1: +#endif + +# These macros will switch you to, and from a per-cpu interrupt stack +# They take the pt_regs arg and move it from the normal place on the +# stack to %eax. Any handler function can retrieve it using regparm(1). +# The handlers are expected to return the stack to switch back to in +# the same register. +# +# This means that the irq handlers need to return their arg +# +# SWITCH_TO_IRQSTACK clobbers %ebx, %ecx, %edx, %esi +# old stack in %eax + +#define SWITCH_TO_IRQSTACK \ + GET_THREAD_INFO(%ebx); \ + movl TI_IRQ_STACK(%ebx),%ecx; \ + movl TI_TASK(%ebx),%edx; \ + movl %esp,%eax; \ + \ + /* %ecx+THREAD_SIZE is next stack -4 keeps us in the right one */\ + leal (THREAD_SIZE-4)(%ecx),%esi; \ + \ + /* is there a valid irq_stack? 
*/ \ + testl %ecx,%ecx; \ + COND_MOVE; \ + \ + /* update the task pointer in the irq stack */ \ + GET_THREAD_INFO(%esi); \ + movl %edx,TI_TASK(%esi); \ + \ + /* update the preempt count in the irq stack */ \ + movl TI_PRE_COUNT(%ebx),%ecx; \ + movl %ecx,TI_PRE_COUNT(%esi); + +# copy flags from the irq stack back into the task's thread_info +# %esi is saved over the irq handler call and contains the irq stack +# thread_info pointer +# %eax was returned from the handler, as described above +# %ebx contains the original thread_info pointer + +#define RESTORE_FROM_IRQSTACK \ + movl %eax,%esp; \ + movl TI_FLAGS(%esi),%eax; \ + movl $0,TI_FLAGS(%esi); \ + LOCK orl %eax,TI_FLAGS(%ebx); + ALIGN common_interrupt: SAVE_ALL + SWITCH_TO_IRQSTACK call do_IRQ + RESTORE_FROM_IRQSTACK jmp ret_from_intr #define BUILD_INTERRUPT(name, nr) \ ENTRY(name) \ - pushl $nr-256; \ + pushl $nr-256; \ SAVE_ALL \ - call smp_/**/name; \ + SWITCH_TO_IRQSTACK \ + call smp_/**/name; \ + RESTORE_FROM_IRQSTACK \ jmp ret_from_intr; /* The include is where all of the SMP etc. 
interrupts come from */ @@ -504,6 +561,31 @@ ENTRY(double_fault) pushl $do_double_fault jmp error_code +#ifdef CONFIG_KGDB_THREAD +ENTRY(kern_schedule) + pushl %ebp + movl %esp, %ebp + pushl %ss + pushl %ebp + pushfl + pushl %cs + pushl 4(%ebp) + pushl %eax + pushl %es + pushl %ds + pushl %eax + pushl (%ebp) + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + call kern_do_schedule + movl %ebp, %esp + pop %ebp + ret +#endif + ENTRY(invalid_TSS) pushl $do_invalid_TSS jmp error_code @@ -539,6 +621,61 @@ ENTRY(spurious_interrupt_bug) pushl $0 pushl $do_spurious_interrupt_bug jmp error_code + + +#ifdef CONFIG_X86_STACK_CHECK +.data + .globl stack_overflowed +stack_overflowed: + .long 0 +.text + +ENTRY(mcount) + push %eax + movl $(THREAD_SIZE - 1),%eax + andl %esp,%eax + cmpl $STACK_WARN,%eax /* more than half the stack is used*/ + jle 1f +2: + popl %eax + ret +1: + lock; btsl $0,stack_overflowed + jc 2b + + # switch to overflow stack + movl %esp,%eax + movl $(stack_overflow_stack + THREAD_SIZE - 4),%esp + + pushf + cli + pushl %eax + + # push eip then esp of error for stack_overflow_panic + pushl 4(%eax) + pushl %eax + + # update the task pointer and cpu in the overflow stack's thread_info. + GET_THREAD_INFO_WITH_ESP(%eax) + movl TI_TASK(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_TASK + movl TI_CPU(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_CPU + + call stack_overflow + + # pop off call arguments + addl $8,%esp + + popl %eax + popf + movl %eax,%esp + popl %eax + movl $0,stack_overflowed + ret + +#warning stack check enabled +#endif .data ENTRY(sys_call_table) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/gdbstart.c 90-mjb/arch/i386/kernel/gdbstart.c --- 00-virgin/arch/i386/kernel/gdbstart.c Wed Dec 31 16:00:00 1969 +++ 90-mjb/arch/i386/kernel/gdbstart.c Sat Feb 1 22:09:06 2003 @@ -0,0 +1,147 @@ +/* + * This program opens a tty file and issues the GDB stub activating + * ioctl on it. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +char *tty_name = "/dev/ttyS0" ; /* COM1 port */ +int speed = 9600 ; /* default speed */ +struct termios save_ts ; /* original term struct */ + +void print_usage(void) +{ + printf("gdbstub [-s speed] [-t tty-dev]\n") ; + printf(" defaults: /dev/ttyS0 with speed unmodified by this program\n"); + +} /* print_usage */ + +void tty_err(char *msg) +{ + char buf[100] ; + + strcpy(buf, msg) ; + strcat(buf, ": ") ; + strcat(buf, tty_name) ; + perror(buf) ; + exit(1) ; + +} /* tty_err */ + + +void setup_term(int fd) +{ + struct termios ts ; + int speed_code ; + + if (tcgetattr(fd, &ts) < 0) tty_err("tcgetattr") ; + + save_ts = ts ; + switch (speed) + { + case 4800: + speed_code = B4800 ; + break ; + case 9600: + speed_code = B9600 ; + break ; + case 19200: + speed_code = B19200 ; + break ; + case 38400: + speed_code = B38400 ; + break ; + case 57600: + speed_code = B57600 ; + break ; + case 115200: + speed_code = B115200 ; + break ; + case 230400: + speed_code = B230400 ; + break ; + default: + printf("Invalid speed: %d\n", speed) ; + exit(1) ; + } + + ts.c_cflag = CS8 | CREAD | CLOCAL ; + if (cfsetospeed(&ts, speed_code) < 0) tty_err("cfsetospeed") ; + if (cfsetispeed(&ts, speed_code) < 0) tty_err("cfsetispeed") ; + + if (tcsetattr(fd, TCSANOW, &ts) < 0) tty_err("tcsetattr") ; + +} /* setup_term */ + +int main(int argc, char **argv) +{ + int opt ; + int fil ; + int rslt ; + + while ((opt = getopt(argc, argv, "hs:t:")) > 0) + { + switch (opt) + { + case 's': + speed = atol(optarg) ; + break ; + case 't': + tty_name = optarg ; + break ; + case ':': + printf("Invalid option\n") ; + break ; + case '?': + case 'h': + default: + print_usage() ; + return 1; + } + } + + fil = open(tty_name, O_RDWR) ; + if (fil < 0) + { + perror(tty_name) ; + return 1; + } + + + setup_term(fil) ; + + /* + * When we issue this ioctl, control will not return until + * the debugger running on 
the remote host machine says "go". + */ + printf("\nAbout to activate GDB stub in the kernel on %s\n", tty_name) ; + printf("Hit CR to continue, kill program to abort -- ") ; + getchar() ; + sync() ; + rslt = ioctl(fil, TIOCGDB, 0) ; + if (rslt < 0) + { + perror("TIOCGDB ioctl") ; + return 1; + } + + printf("\nGDB stub successfully activated\n") ; + + for (;;) + { + pause() ; + } + + if (tcsetattr(fil, TCSANOW, &save_ts) < 0) tty_err("tcsetattr") ; + + exit(0); +} /* main */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/gdbstub.c 90-mjb/arch/i386/kernel/gdbstub.c --- 00-virgin/arch/i386/kernel/gdbstub.c Wed Dec 31 16:00:00 1969 +++ 90-mjb/arch/i386/kernel/gdbstub.c Sat Feb 1 22:09:06 2003 @@ -0,0 +1,1208 @@ +/* + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +/* + * Copyright (C) 2000-2001 VERITAS Software Corporation. + */ +/**************************************************************************** + * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ + * + * Module name: remcom.c $ + * Revision: 1.34 $ + * Date: 91/03/09 12:29:49 $ + * Contributor: Lake Stevens Instrument Division$ + * + * Description: low level support for gdb debugger. $ + * + * Considerations: only works on target hardware $ + * + * Written by: Glenn Engel $ + * Updated by: Amit Kale + * ModuleState: Experimental $ + * + * NOTES: See Below $ + * + * Modified for 386 by Jim Kingdon, Cygnus Support. 
+ * Original kgdb, compatibility with 2.1.xx kernel by David Grothe + * Integrated into 2.2.5 kernel by Tigran Aivazian + * thread support, + * support for multiple processors, + * support for ia-32(x86) hardware debugging, + * Console support, + * handling nmi watchdog + * Amit S. Kale ( akale@veritas.com ) + * + * + * To enable debugger support, two things need to happen. One, a + * call to set_debug_traps() is necessary in order to allow any breakpoints + * or error conditions to be properly intercepted and reported to gdb. + * Two, a breakpoint needs to be generated to begin communication. This + * is most easily accomplished by a call to breakpoint(). Breakpoint() + * simulates a breakpoint by executing an int 3. + * + ************* + * + * The following gdb commands are supported: + * + * command function Return value + * + * g return the value of the CPU registers hex data or ENN + * G set the value of the CPU registers OK or ENN + * + * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN + * MAA..AA,LLLL: Write LLLL bytes at address AA..AA OK or ENN + * + * c Resume at current address SNN ( signal NN) + * cAA..AA Continue at address AA..AA SNN + * + * s Step one instruction SNN + * sAA..AA Step one instruction from AA..AA SNN + * + * k kill + * + * ? What was the last sigval ? SNN (signal NN) + * + * All commands and responses are sent with a packet which includes a + * checksum. A packet consists of + * + * $<packet info>#<checksum>. + * + * where + * <packet info> :: <characters representing the command or response> + * <checksum> :: < two hex digits computed as modulo 256 sum of <packetinfo>> + * + * When a packet is received, it is first acknowledged with either '+' or '-'. + * '+' indicates a successful transfer. '-' indicates a failed transfer.
+ * + * Example: + * + * Host: Reply: + * $m0,10#2a +$00010203040506070809101112131415#42 + * + ****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for linux pt_regs struct */ +#include +#ifdef CONFIG_GDB_CONSOLE +#include +#endif +#include + +/************************************************************************ + * + * external low-level support routines + */ +typedef void (*Function) (void); /* pointer to a function */ + +/* Thread reference */ +typedef unsigned char threadref[8]; + +extern int putDebugChar(int); /* write a single character */ +extern int getDebugChar(void); /* read and return a single char */ + +extern int pid_max; + +/************************************************************************/ +/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ +/* at least NUMREGBYTES*2 are needed for register packets */ +/* Longer buffer is needed to list all threads */ +#define BUFMAX 1024 + +static char initialized; /* boolean flag. != 0 means we've been initialized */ + +static const char hexchars[] = "0123456789abcdef"; + +/* Number of bytes of registers. */ +#define NUMREGBYTES 64 +/* + * Note that this register image is in a different order than + * the register image that Linux produces at interrupt time. + * + * Linux's register image is defined by struct pt_regs in ptrace.h. + * Just why GDB uses a different order is a historical mystery. 
+ */ +enum regnames { _EAX, /* 0 */ + _ECX, /* 1 */ + _EDX, /* 2 */ + _EBX, /* 3 */ + _ESP, /* 4 */ + _EBP, /* 5 */ + _ESI, /* 6 */ + _EDI, /* 7 */ + _PC /* 8 also known as eip */ , + _PS /* 9 also known as eflags */ , + _CS, /* 10 */ + _SS, /* 11 */ + _DS, /* 12 */ + _ES, /* 13 */ + _FS, /* 14 */ + _GS +}; /* 15 */ + +/*************************** ASSEMBLY CODE MACROS *************************/ +/* */ + +#define BREAKPOINT() asm(" int $3"); + +/* Put the error code here just in case the user cares. */ +int gdb_i386errcode; +/* Likewise, the vector number here (since GDB only gets the signal + number through the usual means, and that's not very specific). */ +int gdb_i386vector = -1; + +static spinlock_t slavecpulocks[KGDB_MAX_NO_CPUS]; +volatile int procindebug[KGDB_MAX_NO_CPUS]; + +#ifdef CONFIG_SMP +spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +spinlock_t kgdb_nmispinlock = SPIN_LOCK_UNLOCKED; +#else +unsigned kgdb_spinlock = 0; +unsigned kgdb_nmispinlock = 0; +#endif + +static void +kgdb_usercode(void) +{ +} + +int +hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return (ch - 'a' + 10); + if ((ch >= '0') && (ch <= '9')) + return (ch - '0'); + if ((ch >= 'A') && (ch <= 'F')) + return (ch - 'A' + 10); + return (-1); +} + +/* scan for the sequence $<data>#<checksum>; buffer must hold BUFMAX bytes */ +void +getpacket(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int i; + int count; + char ch; + + do { + /* wait around for the start character, ignore all other characters */ + while ((ch = (getDebugChar() & 0x7f)) != '$') ; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* now, read until a # or end of buffer is found (one byte is kept for the NUL) */ + while (count < BUFMAX - 1) { + ch = getDebugChar() & 0x7f; + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(getDebugChar() & 0x7f) << 4; + xmitcsum += hex(getDebugChar() & 0x7f); + + if (checksum != xmitcsum) + putDebugChar('-'); /* failed checksum */ +
else { + putDebugChar('+'); /* successful transfer */ + /* if a sequence char is present, reply the sequence ID */ + if (buffer[2] == ':') { + putDebugChar(buffer[0]); + putDebugChar(buffer[1]); + /* remove sequence chars from buffer */ + count = strlen(buffer); + for (i = 3; i <= count; i++) + buffer[i - 3] = buffer[i]; + } + } + } + } while (checksum != xmitcsum); + +} + +/* send the packet in buffer. */ + +void +putpacket(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* $<packet info>#<checksum>. */ + do { + putDebugChar('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + if (!putDebugChar(ch)) + return; + checksum += ch; + count += 1; + } + + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + + } while ((getDebugChar() & 0x7f) != '+'); + +} + +static char remcomInBuffer[BUFMAX]; +static char remcomOutBuffer[BUFMAX]; +static short error; + +static void +regs_to_gdb_regs(int *gdb_regs, struct pt_regs *regs) +{ + gdb_regs[_EAX] = regs->eax; + gdb_regs[_EBX] = regs->ebx; + gdb_regs[_ECX] = regs->ecx; + gdb_regs[_EDX] = regs->edx; + gdb_regs[_ESI] = regs->esi; + gdb_regs[_EDI] = regs->edi; + gdb_regs[_EBP] = regs->ebp; + gdb_regs[_DS] = regs->xds; + gdb_regs[_ES] = regs->xes; + gdb_regs[_PS] = regs->eflags; + gdb_regs[_CS] = regs->xcs; + gdb_regs[_PC] = regs->eip; + gdb_regs[_ESP] = (int) (&regs->esp); + gdb_regs[_SS] = __KERNEL_DS; + gdb_regs[_FS] = 0xFFFF; + gdb_regs[_GS] = 0xFFFF; +} /* regs_to_gdb_regs */ + +static void +gdb_regs_to_regs(int *gdb_regs, struct pt_regs *regs) +{ + regs->eax = gdb_regs[_EAX]; + regs->ebx = gdb_regs[_EBX]; + regs->ecx = gdb_regs[_ECX]; + regs->edx = gdb_regs[_EDX]; + regs->esi = gdb_regs[_ESI]; + regs->edi = gdb_regs[_EDI]; + regs->ebp = gdb_regs[_EBP]; + regs->xds = gdb_regs[_DS]; + regs->xes = gdb_regs[_ES]; + regs->eflags = gdb_regs[_PS]; + regs->xcs = gdb_regs[_CS]; + regs->eip = gdb_regs[_PC]; +#if 0 /* can't change these */ + regs->esp = gdb_regs[_ESP]; +
regs->xss = gdb_regs[_SS]; + regs->fs = gdb_regs[_FS]; + regs->gs = gdb_regs[_GS]; +#endif + +} /* gdb_regs_to_regs */ + +/* Indicate to caller of mem2hex or hex2mem that there has been an + error. */ +static volatile int kgdb_memerr = 0; +volatile int kgdb_memerr_expected = 0; +static volatile int kgdb_memerr_cnt = 0; +static int garbage_loc = -1; + +int +get_char(char *addr) +{ + return *addr; +} + +void +set_char(char *addr, int val) +{ + *addr = val; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* return a pointer to the last char put in buf (null) */ +/* If MAY_FAULT is non-zero, then we should set kgdb_memerr in response to + a fault; if zero treat a fault like any other fault in the stub. */ +char * +mem2hex(char *mem, char *buf, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + } + for (i = 0; i < count; i++) { + + ch = get_char(mem++); + + if (may_fault && kgdb_memerr) { + *buf = 0; /* truncate buffer */ + return (buf); + } + *buf++ = hexchars[ch >> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (may_fault) + kgdb_memerr_expected = 0; + return (buf); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return a pointer to the character AFTER the last byte written */ +char * +hex2mem(char *buf, char *mem, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + } + for (i = 0; i < count; i++) { + ch = hex(*buf++) << 4; + ch = ch + hex(*buf++); + set_char(mem++, ch); + + if (may_fault && kgdb_memerr) { + return (mem); + } + } + if (may_fault) + kgdb_memerr_expected = 0; + return (mem); +} + +/**********************************************/ +/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ +/* RETURN NUMBER OF CHARS PROCESSED */ +/**********************************************/ +int +hexToInt(char **ptr, int *intValue) +{ + int 
numChars = 0; + int hexValue; + + *intValue = 0; + + while (**ptr) { + hexValue = hex(**ptr); + if (hexValue >= 0) { + *intValue = (*intValue << 4) | hexValue; + numChars++; + } else + break; + + (*ptr)++; + } + + return (numChars); +} + +#ifdef CONFIG_KGDB_THREAD +static int +stubhex(int ch) +{ + if (ch >= 'a' && ch <= 'f') + return ch - 'a' + 10; + if (ch >= '0' && ch <= '9') + return ch - '0'; + if (ch >= 'A' && ch <= 'F') + return ch - 'A' + 10; + return -1; +} + +static int +stub_unpack_int(char *buff, int fieldlength) +{ + int nibble; + int retval = 0; + + while (fieldlength) { + nibble = stubhex(*buff++); + retval |= nibble; + fieldlength--; + if (fieldlength) + retval = retval << 4; + } + return retval; +} +#endif + +static char * +pack_hex_byte(char *pkt, int byte) +{ + *pkt++ = hexchars[(byte >> 4) & 0xf]; + *pkt++ = hexchars[(byte & 0xf)]; + return pkt; +} + +#define BUF_THREAD_ID_SIZE 16 + +#ifdef CONFIG_KGDB_THREAD +static char * +pack_threadid(char *pkt, threadref * id) +{ + char *limit; + unsigned char *altid; + + altid = (unsigned char *) id; + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *altid++); + return pkt; +} + +static char * +unpack_byte(char *buf, int *value) +{ + *value = stub_unpack_int(buf, 2); + return buf + 2; +} + +static char * +unpack_threadid(char *inbuf, threadref * id) +{ + char *altref; + char *limit = inbuf + BUF_THREAD_ID_SIZE; + int x, y; + + altref = (char *) id; + + while (inbuf < limit) { + x = stubhex(*inbuf++); + y = stubhex(*inbuf++); + *altref++ = (x << 4) | y; + } + return inbuf; +} +#endif + +void +int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; + } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} + +#ifdef CONFIG_KGDB_THREAD +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + 
unsigned char *scan; + + scan = (unsigned char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} + +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } +#if 0 + thread = init_tasks[0]; + do { + if (thread->pid == pid) { + return thread; + } + thread = thread->next_task; + } while (thread != init_tasks[0]); +#endif + return NULL; +} +#endif + +struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned addr; +} breakinfo[4] = { { +enabled:0}, { +enabled:0}, { +enabled:0}, { +enabled:0}}; + +void +correct_hw_break(void) +{ + int breakno; + int correctit; + int breakbit; + unsigned dr7; + + asm volatile ("movl %%db7, %0\n":"=r" (dr7) + :); + do { + unsigned addr0, addr1, addr2, addr3; + asm volatile ("movl %%db0, %0\n" + "movl %%db1, %1\n" + "movl %%db2, %2\n" + "movl %%db3, %3\n":"=r" (addr0), "=r"(addr1), + "=r"(addr2), "=r"(addr3):); + } while (0); + correctit = 0; + for (breakno = 0; breakno < 4; breakno++) { /* all four breakinfo[] slots, DR0-DR3 */ + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + dr7 &= ~(0xf0000U << (breakno << 2)); + dr7 |= (((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << 16) << + (breakno << 2); + switch (breakno) { + case 0: + asm volatile ("movl %0, %%dr0\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 1: + asm volatile ("movl %0, %%dr1\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 2: + asm volatile ("movl %0, %%dr2\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 3: + asm volatile ("movl %0, %%dr3\n"::"r" + (breakinfo[breakno].addr)); + break; + } + } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000U << (breakno << 2)); + } + } + if (correctit) { + asm volatile ("movl %0, %%db7\n"::"r" (dr7)); + } +} + +int
+remove_hw_break(unsigned breakno) +{ + if (!breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 0; + return 0; +} + +int +set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr) +{ + if (breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 1; + breakinfo[breakno].type = type; + breakinfo[breakno].len = len; + breakinfo[breakno].addr = addr; + return 0; +} + +void +gdb_wait(void *arg) +{ + unsigned flags; + int processor; + + local_irq_save(flags); + processor = smp_processor_id(); + procindebug[processor] = 1; + current->thread.kgdbregs = arg; + spin_lock(slavecpulocks + processor); + correct_hw_break(); + procindebug[processor] = 0; + local_irq_restore(flags); +} + +void +printexceptioninfo(int exceptionNo, int errorcode, char *buffer) +{ + unsigned dr6; + int i; + switch (exceptionNo) { + case 1: /* debug exception */ + break; + case 3: /* breakpoint */ + sprintf(buffer, "Software breakpoint"); + return; + default: + sprintf(buffer, "Details not available"); + return; + } + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (dr6 & 0x4000) { + sprintf(buffer, "Single step"); + return; + } + for (i = 0; i < 4; ++i) { + if (dr6 & (1 << i)) { + sprintf(buffer, "Hardware breakpoint %d", i); + return; + } + } + sprintf(buffer, "Unknown trap"); + return; +} + +/* + * This function does all command procesing for interfacing to gdb. + * + * NOTE: The INT nn instruction leaves the state of the interrupt + * enable flag UNCHANGED. That means that when this routine + * is entered via a breakpoint (INT 3) instruction from code + * that has interrupts enabled, then interrupts will STILL BE + * enabled when this routine is entered. The first thing that + * we do here is disable interrupts so as to prevent recursive + * entries and bothersome serial interrupts while we are + * trying to run the serial port in polled mode. 
+ * + * For kernel version 2.1.xx the cli() actually gets a spin lock so + * it is always necessary to do a restore_flags before returning + * so as to let go of that lock. + */ +int +handle_exception(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs) +{ + struct task_struct *usethread = NULL; + int addr, length; + int breakno, breaktype; + char *ptr; + int newPC; + unsigned long flags = ~0UL; + int gdb_regs[NUMREGBYTES / 4]; + int i; + int dr6; + int reboot = 0; +#ifdef CONFIG_KGDB_THREAD + int nothreads; + int maxthreads; + int threadid; + threadref thref; + struct task_struct *thread = NULL; +#endif +#define regs (*linux_regs) + + /* + * If the entry is not from the kernel then return to the Linux + * trap handler and let it process the interrupt normally. + */ + if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->xcs)) { + return (0); + } + + if (kgdb_memerr_expected) { + /* + * This fault occured because of the get_char or set_char + * routines. These two routines use either eax of edx to + * indirectly reference the location in memory that they + * are working with. For a page fault, when we return + * the instruction will be retried, so we have to make + * sure that these registers point to valid memory. 
+ */ + kgdb_memerr = 1; /* set mem error flag */ + kgdb_memerr_expected = 0; + kgdb_memerr_cnt++; /* helps in debugging */ + regs.eax = (long) &garbage_loc; /* make valid address */ + regs.edx = (long) &garbage_loc; /* make valid address */ + return (0); + } +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_nmispinlock)) +#else + if (!kgdb_nmispinlock) +#endif + { + + /* Get kgdb spinlock */ +#ifdef CONFIG_SMP + _raw_spin_lock(&kgdb_spinlock); +#else + kgdb_spinlock = 1; +#endif + + local_irq_save(flags); + + /* Disable hardware debugging while we are in kgdb */ + __asm__("movl %0,%%db7": /* no output */ + :"r"(0)); + + for (i = 0; i < NR_CPUS; i++) { + spin_lock_init(&slavecpulocks[i]); + _raw_spin_lock(&slavecpulocks[i]); + } + + if (num_online_cpus() > 1) { + /* Force other cpus in debugger */ + if (smp_call_function(gdb_wait, NULL, 0, 99) != 0) { + return (1); + } + } + + procindebug[smp_processor_id()] = 1; + } + + gdb_i386vector = exceptionVector; + gdb_i386errcode = err_code; + + /* reply to host that an exception has occurred */ + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + + putpacket(remcomOutBuffer); + + while (1 == 1) { + error = 0; + remcomOutBuffer[0] = 0; + getpacket(remcomInBuffer); + switch (remcomInBuffer[0]) { + case '?': + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + break; + case 'g': /* return the value of the CPU registers */ + if (!usethread || usethread == current) { + regs_to_gdb_regs(gdb_regs, ®s); + } else { + memset(gdb_regs, 0, NUMREGBYTES); + if (usethread->thread.kgdbregs) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + get_char((char *) usethread->thread. + kgdbregs); + kgdb_memerr_expected = 0; + if (kgdb_memerr) { + gdb_regs[_PC] = + (int) kgdb_usercode; + } else { + regs_to_gdb_regs(gdb_regs, + usethread-> + thread. 
+ kgdbregs); + } + } else { + gdb_regs[_PC] = (int) kgdb_usercode; + } + } + mem2hex((char *) gdb_regs, remcomOutBuffer, NUMREGBYTES, + 0); + break; + case 'G': /* set the value of the CPU registers - return OK */ + hex2mem(&remcomInBuffer[1], (char *) gdb_regs, + NUMREGBYTES, 0); + if (!usethread || usethread == current) { + gdb_regs_to_regs(gdb_regs, &regs); + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "E00"); + } + break; + + /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + case 'm': + /* TRY TO READ %x,%x. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) + if (*(ptr++) == ',') + if (hexToInt(&ptr, &length)) { + ptr = 0; + mem2hex((char *) addr, + remcomOutBuffer, length, + 1); + if (kgdb_memerr) { + strcpy(remcomOutBuffer, + "E03"); + } + } + + if (ptr) { + strcpy(remcomOutBuffer, "E01"); + } + break; + + /* MAA..AA,LLLL: Write LLLL bytes at address AA.AA return OK */ + case 'M': + /* TRY TO READ '%x,%x:'. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) + if (*(ptr++) == ',') + if (hexToInt(&ptr, &length)) + if (*(ptr++) == ':') { + hex2mem(ptr, + (char *) addr, + length, 1); + + if (kgdb_memerr) { + strcpy + (remcomOutBuffer, + "E03"); + } else { + strcpy + (remcomOutBuffer, + "OK"); + } + + ptr = 0; + } + if (ptr) { + strcpy(remcomOutBuffer, "E02"); + } + break; + + /* cAA..AA Continue at address AA..AA(optional) */ + /* sAA..AA Step one instruction from AA..AA(optional) */ + case 'c': + case 's': +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_nmispinlock)) +#else + if (kgdb_nmispinlock) +#endif + { + strcpy(remcomOutBuffer, "E01"); + break; + } + + /* try to read optional parameter, pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) { + regs.eip = addr; + } + + newPC = regs.eip; + + /* clear the trace bit */ + regs.eflags &= 0xfffffeff; + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') + regs.eflags |=
0x100; + + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (!(dr6 & 0x4000)) { + for (breakno = 0; breakno < 4; ++breakno) { + if (dr6 & (1 << breakno)) { + if (breakinfo[breakno].type == + 0) { + /* Set restore flag */ + regs.eflags |= 0x10000; + break; + } + } + } + } + correct_hw_break(); + asm volatile ("movl %0, %%db6\n"::"r" (0)); + for (i = 0; i < NR_CPUS; i++) { + _raw_spin_unlock(&slavecpulocks[i]); + } + + procindebug[smp_processor_id()] = 0; + /* Release kgdb spinlock */ +#ifdef CONFIG_SMP + _raw_spin_unlock(&kgdb_spinlock); +#else + kgdb_spinlock = 0; +#endif + if (flags != ~0UL) + local_irq_restore(flags); + return (0); + + /* kill the program */ + case 'k': + break; + + /* query */ + case 'q': + switch (remcomInBuffer[1]) { +#ifdef CONFIG_KGDB_THREAD + case 'L': + /* List threads */ + unpack_byte(remcomInBuffer + 3, &maxthreads); + unpack_threadid(remcomInBuffer + 5, &thref); + + remcomOutBuffer[0] = 'q'; + remcomOutBuffer[1] = 'M'; + remcomOutBuffer[4] = '0'; + pack_threadid(remcomOutBuffer + 5, &thref); + + threadid = threadref_to_int(&thref); + for (nothreads = 0; + nothreads < maxthreads + && threadid < pid_max; threadid++) { + thread = getthread(threadid); + if (thread) { + int_to_threadref(&thref, + threadid); + pack_threadid(remcomOutBuffer + + 21 + + nothreads * 16, + &thref); + nothreads++; + } + } + if (threadid == pid_max) { + remcomOutBuffer[4] = '1'; + } + pack_hex_byte(remcomOutBuffer + 2, nothreads); + remcomOutBuffer[21 + nothreads * 16] = '\0'; + break; + + case 'C': + /* Current thread id */ + remcomOutBuffer[0] = 'Q'; + remcomOutBuffer[1] = 'C'; + threadid = current->pid; + int_to_threadref(&thref, threadid); + pack_threadid(remcomOutBuffer + 2, &thref); + remcomOutBuffer[18] = '\0'; + break; +#endif + + case 'E': + /* Print exception info */ + printexceptioninfo(exceptionVector, err_code, + remcomOutBuffer); + break; + } + break; + +#ifdef CONFIG_KGDB_THREAD + /* task related */ + case 'H': + switch (remcomInBuffer[1]) { + 
case 'g': + ptr = &remcomInBuffer[2]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (!thread) { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + break; + } + usethread = thread; + /* follow through */ + case 'c': + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + break; + } + break; + + /* Query thread status */ + case 'T': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (thread) { + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + } else { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + } + break; +#endif + + case 'r': + reboot = 1; + strcpy(remcomOutBuffer, "OK"); + break; + case 'Y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + ptr++; + hexToInt(&ptr, &breaktype); + ptr++; + hexToInt(&ptr, &length); + ptr++; + hexToInt(&ptr, &addr); + if (set_hw_break + (breakno & 0x3, breaktype & 0x3, length & 0x3, addr) + == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + /* Remove hardware breakpoint */ + case 'y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + if (remove_hw_break(breakno & 0x3) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + } /* switch */ + + /* reply to the request */ + putpacket(remcomOutBuffer); + if (reboot == 1) { + static long no_idt[2]; + __asm__ __volatile__("lidt %0"::"m"(no_idt)); + __asm__ __volatile__("int3"); + } + } +} + +/* this function is used to set up exception handlers for tracing and + breakpoints */ +void +set_debug_traps(void) +{ + /* + * linux_debug_hook is defined in traps.c. We store a pointer + * to our own exception handler into it. + */ + linux_debug_hook = handle_exception; + + /* + * In case GDB is started before us, ack any packets (presumably + * "$?#xx") sitting there. 
*/ + putDebugChar('+'); + + initialized = 1; +} + +/* This function will generate a breakpoint exception. It is used at the + beginning of a program to sync up with a debugger and can be used + otherwise as a quick means to stop program execution and "break" into + the debugger. */ + +void +breakpoint(void) +{ + if (initialized) + BREAKPOINT(); +} + +#ifdef CONFIG_GDB_CONSOLE +char gdbconbuf[BUFMAX]; + +void +gdb_console_write(struct console *co, const char *s, unsigned count) +{ + int i; + int wcount; + char *bufptr; + + if (!gdb_initialized) { + return; + } + gdbconbuf[0] = 'O'; + bufptr = gdbconbuf + 1; + while (count > 0) { + if ((count << 1) > (BUFMAX - 2)) { + wcount = (BUFMAX - 2) >> 1; + } else { + wcount = count; + } + count -= wcount; + for (i = 0; i < wcount; i++) { + bufptr = pack_hex_byte(bufptr, s[i]); + } + *bufptr = '\0'; + s += wcount; + + putpacket(gdbconbuf); + + } +} +#endif +static int __init +kgdb_opt_gdb(char *dummy) +{ + gdb_enter = 1; + return 1; +} +static int __init +kgdb_opt_gdbttyS(char *str) +{ + gdb_ttyS = simple_strtoul(str, NULL, 10); + return 1; +} +static int __init +kgdb_opt_gdbbaud(char *str) +{ + gdb_baud = simple_strtoul(str, NULL, 10); + return 1; +} + +/* + * Sequence of these lines has to be maintained because gdb option is a prefix + * of the other two options + */ + +__setup("gdbttyS=", kgdb_opt_gdbttyS); +__setup("gdbbaud=", kgdb_opt_gdbbaud); +__setup("gdb", kgdb_opt_gdb); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/head.S 90-mjb/arch/i386/kernel/head.S --- 00-virgin/arch/i386/kernel/head.S Thu Jan 2 22:04:58 2003 +++ 90-mjb/arch/i386/kernel/head.S Sat Feb 1 22:11:38 2003 @@ -16,6 +16,7 @@ #include #include #include +#include #define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F @@ -309,7 +310,7 @@ rp_sidt: ret ENTRY(stack_start) - .long init_thread_union+8192 + .long init_thread_union+THREAD_SIZE .long __BOOT_DS /* This is the default interrupt "handler" :-) */ diff -urpN -X 
/home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/i386_ksyms.c 90-mjb/arch/i386/kernel/i386_ksyms.c --- 00-virgin/arch/i386/kernel/i386_ksyms.c Mon Jan 13 21:09:20 2003 +++ 90-mjb/arch/i386/kernel/i386_ksyms.c Sun Feb 2 13:19:31 2003 @@ -67,7 +67,6 @@ EXPORT_SYMBOL(EISA_bus); EXPORT_SYMBOL(MCA_bus); #ifdef CONFIG_DISCONTIGMEM EXPORT_SYMBOL(node_data); -EXPORT_SYMBOL(pfn_to_nid); #endif #ifdef CONFIG_X86_NUMAQ EXPORT_SYMBOL(xquad_portio); @@ -146,6 +145,20 @@ EXPORT_SYMBOL(smp_num_siblings); EXPORT_SYMBOL(cpu_sibling_map); #endif +#ifdef CONFIG_X86_REMOTE_DEBUG +void __this_fixmap_does_not_exist(void) +{ + BUG(); +} +EXPORT_SYMBOL(__this_fixmap_does_not_exist); + +void __br_lock_usage_bug(void) +{ + BUG(); +} +EXPORT_SYMBOL(__br_lock_usage_bug); +#endif + #ifdef CONFIG_SMP EXPORT_SYMBOL(cpu_data); EXPORT_SYMBOL(cpu_online_map); @@ -213,4 +226,9 @@ EXPORT_SYMBOL(kmap_atomic_to_page); #ifdef CONFIG_EDD_MODULE EXPORT_SYMBOL(edd); EXPORT_SYMBOL(eddnr); +#endif + +#ifdef CONFIG_X86_STACK_CHECK +extern void mcount(void); +EXPORT_SYMBOL(mcount); #endif diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/init_task.c 90-mjb/arch/i386/kernel/init_task.c --- 00-virgin/arch/i386/kernel/init_task.c Sun Nov 17 20:29:56 2002 +++ 90-mjb/arch/i386/kernel/init_task.c Sat Feb 1 22:18:26 2003 @@ -13,6 +13,14 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); struct mm_struct init_mm = INIT_MM(init_mm); +union thread_union init_irq_union + __attribute__((__section__(".data.init_task"))); + +#ifdef CONFIG_X86_STACK_CHECK +union thread_union stack_overflow_stack + __attribute__((__section__(".data.init_task"))); +#endif + /* * Initial thread structure. 
* diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/io_apic.c 90-mjb/arch/i386/kernel/io_apic.c --- 00-virgin/arch/i386/kernel/io_apic.c Fri Jan 17 09:18:20 2003 +++ 90-mjb/arch/i386/kernel/io_apic.c Sun Feb 2 13:19:27 2003 @@ -116,40 +116,84 @@ static void __init replace_pin_at_irq(un } } -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_pin_list *entry = irq_2_pin + irq; \ - \ - for (;;) { \ - unsigned int reg; \ - pin = entry->pin; \ - if (pin == -1) \ - break; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ - if (!entry->next) \ - break; \ - entry = irq_2_pin + entry->next; \ - } \ - FINAL; \ -} - -#define DO_ACTION(name,R,ACTION, FINAL) \ - \ - static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION, FINAL) - -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ -DO_ACTION( __mask_and_edge, 0, = (reg & 0xffff7fff) | 0x00010000, ) - /* mask = 1, trigger = 0 */ -DO_ACTION( __unmask_and_level, 0, = (reg & 0xfffeffff) | 0x00008000, ) - /* mask = 0, trigger = 1 */ +/* mask = 1 */ +static void __mask_IO_APIC_irq (unsigned int irq) +{ + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + for (;;) { + unsigned int reg; + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + io_apic_modify(entry->apic, 0x10 + pin*2, reg |= 0x00010000); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + io_apic_sync(entry->apic); +} + +/* mask = 0 */ +static void __unmask_IO_APIC_irq (unsigned int irq) +{ + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + for (;;) { + unsigned int reg; + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + io_apic_modify(entry->apic, 0x10 + pin*2, reg &= 0xfffeffff); + if (!entry->next) + break; + entry 
= irq_2_pin + entry->next; + } +} + +/* mask = 1, trigger = 0 */ +static void __mask_and_edge_IO_APIC_irq (unsigned int irq) +{ + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + for (;;) { + unsigned int reg; + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg = (reg & 0xffff7fff) | 0x00010000; + io_apic_modify(entry->apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +/* mask = 0, trigger = 1 */ +static void __unmask_and_level_IO_APIC_irq (unsigned int irq) +{ + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + for (;;) { + unsigned int reg; + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg = (reg & 0xfffeffff) | 0x00008000; + io_apic_modify(entry->apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} static void mask_IO_APIC_irq (unsigned int irq) { @@ -197,13 +241,23 @@ static void clear_IO_APIC (void) static void set_ioapic_affinity (unsigned int irq, unsigned long mask) { unsigned long flags; + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; /* * Only the first 8 bits are valid. 
*/ mask = mask << 24; spin_lock_irqsave(&ioapic_lock, flags); - __DO_ACTION(1, = mask, ) + for (;;) { + pin = entry->pin; + if (pin == -1) + break; + io_apic_write(entry->apic, 0x10 + 1 + pin*2, mask); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1131,7 +1185,7 @@ void disable_IO_APIC(void) * * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 */ - +#ifndef CONFIG_X86_NUMAQ static void __init setup_ioapic_ids_from_mpc (void) { struct IO_APIC_reg_00 reg_00; @@ -1225,6 +1279,9 @@ static void __init setup_ioapic_ids_from printk(" ok.\n"); } } +#else /* !CONFIG_X86_NUMAQ */ +static void __init setup_ioapic_ids_from_mpc(void) { } +#endif /* CONFIG_X86_NUMAQ */ /* * There is a nasty bug in some older SMP boards, their mptable lies diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/irq.c 90-mjb/arch/i386/kernel/irq.c --- 00-virgin/arch/i386/kernel/irq.c Sun Nov 17 20:29:22 2002 +++ 90-mjb/arch/i386/kernel/irq.c Sat Feb 1 22:11:39 2003 @@ -311,7 +311,8 @@ void enable_irq(unsigned int irq) * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage unsigned int do_IRQ(struct pt_regs regs) +struct pt_regs *do_IRQ(struct pt_regs *regs) __attribute__((regparm(1))); +struct pt_regs *do_IRQ(struct pt_regs *regs) { /* * We ack quickly, we don't want the irq controller @@ -323,7 +324,7 @@ asmlinkage unsigned int do_IRQ(struct pt * 0 return value means that this irq is already being * handled by some other CPU. 
(or is disabled) */ - int irq = regs.orig_eax & 0xff; /* high bits used in ret_from_ code */ + int irq = regs->orig_eax & 0xff; /* high bits used in ret_from_ code */ int cpu = smp_processor_id(); irq_desc_t *desc = irq_desc + irq; struct irqaction * action; @@ -388,7 +389,7 @@ asmlinkage unsigned int do_IRQ(struct pt */ for (;;) { spin_unlock(&desc->lock); - handle_IRQ_event(irq, &regs, action); + handle_IRQ_event(irq, regs, action); spin_lock(&desc->lock); if (likely(!(desc->status & IRQ_PENDING))) @@ -407,7 +408,7 @@ out: irq_exit(); - return 1; + return regs; } /** diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/mpparse.c 90-mjb/arch/i386/kernel/mpparse.c --- 00-virgin/arch/i386/kernel/mpparse.c Fri Jan 17 09:18:20 2003 +++ 90-mjb/arch/i386/kernel/mpparse.c Sat Feb 1 22:04:51 2003 @@ -110,7 +110,7 @@ void __init MP_processor_info (struct mp if (!(m->mpc_cpuflag & CPU_ENABLED)) return; - apicid = mpc_apic_id(m, translation_table[mpc_record]->trans_quad); + apicid = mpc_apic_id(m, translation_table[mpc_record]); if (m->mpc_featureflag&(1<<0)) Dprintk(" Floating point unit present.\n"); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/nmi.c 90-mjb/arch/i386/kernel/nmi.c --- 00-virgin/arch/i386/kernel/nmi.c Thu Jan 2 22:04:58 2003 +++ 90-mjb/arch/i386/kernel/nmi.c Sun Feb 2 13:19:27 2003 @@ -20,11 +20,26 @@ #include #include #include +#include #include #include #include +#ifdef CONFIG_X86_REMOTE_DEBUG +extern gdb_debug_hook * linux_debug_hook; +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ + { \ + if (linux_debug_hook != (gdb_debug_hook *) NULL && !user_mode(regs)) \ + { \ + (*linux_debug_hook)(trapnr, signr, error_code, regs) ; \ + after; \ + } \ + } +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + unsigned int nmi_watchdog = NMI_NONE; static unsigned int nmi_hz = HZ; unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ @@ -63,8 +78,6 @@ extern void
show_registers(struct pt_reg CRU_ESCR0 (with any non-null event selector) through a complemented max threshold. [IA32-Vol3, Section 14.9.9] */ #define MSR_P4_IQ_COUNTER0 0x30C -#define MSR_P4_IQ_CCCR0 0x36C -#define MSR_P4_CRU_ESCR0 0x3B8 #define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) #define P4_NMI_IQ_CCCR0 \ (P4_CCCR_OVF_PMI|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ @@ -363,12 +376,59 @@ void nmi_watchdog_tick (struct pt_regs * sum = irq_stat[cpu].apic_timer_irqs; if (last_irq_sums[cpu] == sum) { +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_spinlock)) +#else + if (kgdb_spinlock) +#endif + { + /* We are inside kgdb, this isn't a stuck cpu */ + alert_counter[cpu] = 0; + } else { +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_nmispinlock)) +#else + if (kgdb_nmispinlock) +#endif + { + if (!procindebug[cpu]) { + procindebug[cpu] = 1; + current->thread.kgdbregs = regs; + while (1) { + /* nothing */ + } + } + return; + } + } +#endif /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... 
*/ alert_counter[cpu]++; if (alert_counter[cpu] == 5*nmi_hz) { +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_SMP + if (spin_trylock(&kgdb_nmispinlock)) +#else + kgdb_nmispinlock = 1; +#endif + { + procindebug[cpu] = 1; + CHK_REMOTE_DEBUG(2,SIGBUS,0,regs,) + } +#ifdef CONFIG_SMP + else { + procindebug[cpu] = 1; + current->thread.kgdbregs = regs; + while (1) { + /* nothing */ + } + } +#endif +#endif spin_lock(&nmi_print_lock); /* * We are in trouble anyway, lets at least try diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/numaq.c 90-mjb/arch/i386/kernel/numaq.c --- 00-virgin/arch/i386/kernel/numaq.c Sun Nov 17 20:29:51 2002 +++ 90-mjb/arch/i386/kernel/numaq.c Sun Feb 2 13:19:31 2003 @@ -27,6 +27,7 @@ #include #include #include +#include #include /* These are needed before the pgdat's are created */ @@ -82,19 +83,7 @@ static void __init smp_dump_qct(void) * physnode_map[8- ] = -1; */ int physnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1}; - -#define PFN_TO_ELEMENT(pfn) (pfn / PAGES_PER_ELEMENT) -#define PA_TO_ELEMENT(pa) (PFN_TO_ELEMENT(pa >> PAGE_SHIFT)) - -int pfn_to_nid(unsigned long pfn) -{ - int nid = physnode_map[PFN_TO_ELEMENT(pfn)]; - - if (nid == -1) - BUG(); /* address is not present */ - - return nid; -} +EXPORT_SYMBOL(physnode_map); /* * for each node mark the regions diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/process.c 90-mjb/arch/i386/kernel/process.c --- 00-virgin/arch/i386/kernel/process.c Thu Jan 9 19:15:56 2003 +++ 90-mjb/arch/i386/kernel/process.c Sat Feb 1 22:18:26 2003 @@ -159,7 +159,25 @@ static int __init idle_setup (char *str) __setup("idle=", idle_setup); -void show_regs(struct pt_regs * regs) +void stack_overflow(unsigned long esp, unsigned long eip) +{ + int panicing = ((esp&(THREAD_SIZE-1)) <= STACK_PANIC); + + printk( "esp: 0x%lx masked: 0x%lx STACK_PANIC:0x%x %d %d\n", + esp, (esp&(THREAD_SIZE-1)), STACK_PANIC, (((esp&(THREAD_SIZE-1)) <= STACK_PANIC)), panicing ); + + if 
(panicing) + print_symbol("stack overflow from %s\n", eip); + else + print_symbol("excessive stack use from %s\n", eip); + printk("esp: %p\n", (void*)esp); + show_trace((void*)esp); + + if (panicing) + panic("stack overflow\n"); +} + +asmlinkage void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; @@ -432,6 +450,7 @@ void __switch_to(struct task_struct *pre /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ + next_p->thread_info->irq_stack = prev_p->thread_info->irq_stack; unlazy_fpu(prev_p); /* diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/smp.c 90-mjb/arch/i386/kernel/smp.c --- 00-virgin/arch/i386/kernel/smp.c Fri Jan 17 09:18:20 2003 +++ 90-mjb/arch/i386/kernel/smp.c Sat Feb 1 22:14:22 2003 @@ -305,7 +305,8 @@ static inline void leave_mm (unsigned lo * 2) Leave the mm if we are in the lazy tlb mode. */ -asmlinkage void smp_invalidate_interrupt (void) +struct pt_regs *smp_invalidate_interrupt(struct pt_regs *regs) __attribute__((regparm(1))); +struct pt_regs *smp_invalidate_interrupt(struct pt_regs *regs) { unsigned long cpu; @@ -336,6 +337,7 @@ asmlinkage void smp_invalidate_interrupt out: put_cpu_no_resched(); + return regs; } static void flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, @@ -516,10 +518,17 @@ int smp_call_function (void (*func) (voi { struct call_data_struct data; int cpus = num_online_cpus()-1; + int count = 0; + int gdb; - if (!cpus) + if (cpus <= 0) return 0; + gdb = 0; + if (wait == 99) { + wait = 0; + gdb = 1; + } data.func = func; data.info = info; atomic_set(&data.started, 0); @@ -534,12 +543,27 @@ int smp_call_function (void (*func) (voi send_IPI_allbutself(CALL_FUNCTION_VECTOR); /* Wait for response */ - while (atomic_read(&data.started) != cpus) + while (atomic_read(&data.started) != cpus) { + if (gdb) { + if (count++ == 2000000) { + printk("%s: timeout\n", __FUNCTION__); + break; + } + if (count == 1000000) { + printk("looks 
bad\n"); + printk("cpus=%d, started=%d\n", cpus, + atomic_read(&data.started)); + } + if (count > 1000000) + udelay(1); + } barrier(); + } if (wait) while (atomic_read(&data.finished) != cpus) barrier(); + spin_unlock(&call_lock); return 0; @@ -576,14 +600,19 @@ void smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. */ -asmlinkage void smp_reschedule_interrupt(void) + +asmlinkage struct pt_regs * smp_reschedule_interrupt(struct pt_regs *regs) __attribute__((regparm(1))); +struct pt_regs * smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); + return regs; } -asmlinkage void smp_call_function_interrupt(void) + +asmlinkage struct pt_regs * smp_call_function_interrupt(struct pt_regs *regs) __attribute__((regparm(1))); +struct pt_regs * smp_call_function_interrupt(struct pt_regs *regs) { - void (*func) (void *info) = call_data->func; + void (*func) (void *info, struct pt_regs *) = (void (*)(void *, struct pt_regs*))call_data->func; void *info = call_data->info; int wait = call_data->wait; @@ -598,12 +627,13 @@ asmlinkage void smp_call_function_interr * At this point the info structure may be out of scope unless wait==1 */ irq_enter(); - (*func)(info); + (*func)(info, regs); irq_exit(); if (wait) { mb(); atomic_inc(&call_data->finished); } + return regs; } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/smpboot.c 90-mjb/arch/i386/kernel/smpboot.c --- 00-virgin/arch/i386/kernel/smpboot.c Fri Jan 17 09:18:20 2003 +++ 90-mjb/arch/i386/kernel/smpboot.c Sat Feb 1 22:11:39 2003 @@ -62,7 +62,7 @@ int smp_num_siblings = 1; int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */ /* Bitmask of currently online CPUs */ -unsigned long cpu_online_map; +unsigned long cpu_online_map = 1; static volatile unsigned long cpu_callin_map; volatile unsigned long cpu_callout_map; @@ -71,6 +71,11 @@ static unsigned long smp_commenced_mask; /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 
cpu_data[NR_CPUS] __cacheline_aligned; +/* Per CPU interrupt stacks */ +extern union thread_union init_irq_union; +union thread_union *irq_stacks[NR_CPUS] __cacheline_aligned = + { &init_irq_union, }; + /* Set when the idlers are all forked */ int smp_threads_ready; @@ -772,6 +777,28 @@ wakeup_secondary_cpu(int phys_apicid, un } #endif /* WAKE_SECONDARY_VIA_INIT */ +static void __init setup_irq_stack(struct task_struct *p, int cpu) +{ + unsigned long stk; + + stk = __get_free_pages(GFP_KERNEL, THREAD_ORDER); + if (!stk) + panic("I can't seem to allocate my irq stack. Oh well, giving up."); + + irq_stacks[cpu] = (void *)stk; + memset(irq_stacks[cpu], 0, THREAD_SIZE); + irq_stacks[cpu]->thread_info.cpu = cpu; + irq_stacks[cpu]->thread_info.preempt_count = 1; + /* interrupts are not preemptable */ + p->thread_info->irq_stack = &irq_stacks[cpu]->thread_info; + + /* If we want to make the irq stack more than one unit + * deep, we can chain then off of the irq_stack pointer + * here. + */ +} + + extern unsigned long cpu_initialized; static int __init do_boot_cpu(int apicid) @@ -795,6 +822,8 @@ static int __init do_boot_cpu(int apicid idle = fork_by_hand(); if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); + + setup_irq_stack(idle, cpu); /* * We remove it from the pidhash and the runqueue diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/srat.c 90-mjb/arch/i386/kernel/srat.c --- 00-virgin/arch/i386/kernel/srat.c Wed Dec 31 16:00:00 1969 +++ 90-mjb/arch/i386/kernel/srat.c Sat Feb 1 22:21:21 2003 @@ -0,0 +1,471 @@ + /* + * This code is taken from 64bit discontig mem support. + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to + */ + +#include +#include +#include +#include +#include +#include +#include + + +#define SRAT_DEBUG + +#define NUM_KLUDGE_PAGES 4 /* Size of page descriptor kludge */ +#define PAGE_KLUDGE_START ((u32 *)empty_zero_page - NUM_KLUDGE_PAGES) + + +/* + * proximity macros and definitions + */ +#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ +#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ +#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) +#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) +#define MAX_PXM_DOMAINS 256 /* 1 byte and no promises about values */ +/* bitmap length; _PXM is at most 255 */ +#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) +static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ + +struct node_memory_chunk_s node_memory_chunk[MAXCLUMPS]; +struct node_cpuid_s node_cpuid[NR_CPUS]; + +static int srat_num_cpus; /* number of cpus */ +static int num_memory_chunks; /* total number of memory chunks */ +static unsigned long zholes_size[MAX_NUMNODES]; + +unsigned long node_start_pfn[MAX_NUMNODES]; +unsigned long node_end_pfn[MAX_NUMNODES]; + +/* extern unsigned char acpi_checksum(void *buffer, int length); */ + +/* Identify which cnode a physical address resides on */ +int pa_to_nid(u64 paddr) +{ + int i; + struct node_memory_chunk_s *nmcp; + + /* We've got a sorted list. Binary search here? Do we care?? 
*/ + nmcp = node_memory_chunk; + for (i = num_memory_chunks; --i >= 0; nmcp++) + if (paddr >= nmcp->start_paddr && paddr <= nmcp->end_paddr) + return (int)nmcp->nid; + + return -1; +} + +int pfn_to_nid(unsigned long pfn) +{ + return pa_to_nid(((unsigned long long)pfn) << PAGE_SHIFT); +} + +/* Identify CPU proximity domains */ + +static void __init parse_cpu_affinity_structure(char *p) +{ + struct acpi_table_processor_affinity *cpu_affinity = + (struct acpi_table_processor_affinity *) p; + + if (!cpu_affinity->flags.enabled) + return; /* empty entry */ + + /* mark this node as "seen" in node bitmap */ + BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain); + + node_cpuid[srat_num_cpus].phys_id = cpu_affinity->apic_id; + /* nid should be overridden as logical node id later */ + node_cpuid[srat_num_cpus].pxm = cpu_affinity->proximity_domain; + srat_num_cpus++; + +#ifdef SRAT_DEBUG + printk("CPU 0x%02X in proximity domain 0x%02X\n", + cpu_affinity->apic_id, cpu_affinity->proximity_domain); +#endif +} + +/* + * Identify memory proximity domains and hot-remove capabilities. + * Fill node memory chunk list structure. + */ + +static void __init parse_memory_affinity_structure (char *sratp) +{ + struct acpi_table_memory_affinity *memory_affinity = + (struct acpi_table_memory_affinity *) sratp; + u64 paddr, size; + u8 pxm; + struct node_memory_chunk_s *p, *q, *pend; + + if (!memory_affinity->flags.enabled) + return; /* empty entry */ + + /* mark this node as "seen" in node bitmap */ + BMAP_SET(pxm_bitmap, memory_affinity->proximity_domain); + + /* calculate info for memory chunk structure */ + paddr = memory_affinity->base_addr_hi; + paddr = (paddr << 32) | memory_affinity->base_addr_lo; + size = memory_affinity->length_hi; + size = (size << 32) | memory_affinity->length_lo; + pxm = memory_affinity->proximity_domain; + + if (num_memory_chunks >= MAXCLUMPS) { + printk("Too many mem chunks in SRAT. 
Ignoring %lld MBytes at %llx\n", + size/(1024*1024), paddr); + return; + } + + /* Insertion sort based on base address */ + pend = &node_memory_chunk[num_memory_chunks]; + for (p = &node_memory_chunk[0]; p < pend; p++) { + if (paddr < p->start_paddr) + break; + } + if (p < pend) { + for (q = pend - 1; q >= p; q--) /* shift used entries only; starting at pend wrote one slot past the last free entry */ + *(q + 1) = *q; + } + p->start_paddr = paddr; + p->size = size; + p->end_paddr = paddr + size - 1; + p->pxm = pxm; + + num_memory_chunks++; + + +#ifdef SRAT_DEBUG + printk("Memory range 0x%llX to 0x%llX (type 0x%X) in proximity domain 0x%02X %s\n", + paddr, paddr + size - 1, + memory_affinity->memory_type, + memory_affinity->proximity_domain, + (memory_affinity->flags.hot_pluggable ? + "enabled and removable" : "enabled" ) ); +#endif +} + + +/* Parse the ACPI Static Resource Affinity Table */ +static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) +{ + u8 *start, *end, *p; + int i, j, nid; + u8 pxm_to_nid_map[MAX_PXM_DOMAINS];/* _PXM to logical node ID map */ + u8 nid_to_pxm_map[MAX_NUMNODES];/* logical node ID to _PXM map */ + + start = (u8 *)(&(sratp->reserved) + 1); /* skip header */ + p = start; + end = (u8 *)sratp + sratp->header.length; +printk("In acpi20_parse_srat: sratp=0x%p, start=0x%p, end=0x%p\n", sratp, start, end); + + memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */ + memset(node_memory_chunk, 0, sizeof(node_memory_chunk)); + memset(node_cpuid, 0, sizeof(node_cpuid)); + memset(zholes_size, 0, sizeof(zholes_size)); + + /* -1 in these maps means not available */ + memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map)); + memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map)); + + num_memory_chunks = 0; + while (p < end) { + switch (*p) { + case ACPI_SRAT_PROCESSOR_AFFINITY: + parse_cpu_affinity_structure(p); + break; + case ACPI_SRAT_MEMORY_AFFINITY: + parse_memory_affinity_structure(p); + break; + default: + printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]); + break; + } + p +=
p[1]; + if (p[1] == 0) { + printk("acpi20_parse_srat: Entry length value is zero;" + " can't parse any further!\n"); + break; + } + } +printk("SRAT scan complete\n"); + + /* Calculate total number of nodes in system from PXM bitmap and create + * a set of sequential node IDs starting at zero. (ACPI doesn't seem + * to specify the range of _PXM values.) + */ + numnodes = 0; /* init total nodes in system */ + for (i = 0; i < MAX_PXM_DOMAINS; i++) { + if (BMAP_TEST(pxm_bitmap, i)) { + pxm_to_nid_map[i] = numnodes; + nid_to_pxm_map[numnodes] = i; + node_set_online(numnodes); + ++numnodes; + } + } +printk("numnodes=%d\n", numnodes); + if (numnodes == 0) + BUG(); + + /* set cnode id in memory chunk structure */ + for (i = 0; i < num_memory_chunks; i++) + node_memory_chunk[i].nid = pxm_to_nid_map[node_memory_chunk[i].pxm]; + + /* set cnode id in cpu structure */ + for (i = 0; i < srat_num_cpus; i++) + node_cpuid[i].nid = pxm_to_nid_map[node_cpuid[i].pxm]; + + printk("pxm bitmap: "); + for (i = 0; i < sizeof(pxm_bitmap); i++) { + printk("%02X ", pxm_bitmap[i]); + } + printk("\n"); + printk("Number of logical nodes in system = %d\n", numnodes); + printk("Number of memory chunks in system = %d\n", num_memory_chunks); + + /* PAT NOTE: debug only */ + for (j = 0; j < num_memory_chunks; j++){ + printk("chunk %d nid %d start_paddr %16llx end_paddr %16llx size %16llx\n", + j, node_memory_chunk[j].nid, + node_memory_chunk[j].start_paddr, + node_memory_chunk[j].end_paddr, + node_memory_chunk[j].size); + } + printk("done with printing out the chunks\n"); + /* PAT NOTE: end debug only */ + +printk("Memory table:\n"); + + /*calculate start/size arrays*/ + for (nid = 0; nid < numnodes; nid++) { + u64 start_addr, size; + int been_here_before; + + start_addr = size = 0; + been_here_before = 0; + for (j = 0; j < num_memory_chunks; j++){ + if (node_memory_chunk[j].nid == nid) { + /* + * This should all be in pfns!!!! 
+ * + * (1) move assignment into node_start_pfn and node_end_pfn into this function + * if node_start_pfn[nid] < (node_memory_chunk[j].start_addr >> PAGE_SHIFT) + * we've identified a hole... + * (do we need to validate that it's a hole?) + * make sure it handles multiple holes... so add zholes_size to zholes_size + * zholes_size[nid] = zholes_size[nid] + (node_memory_chunk[j].start_addr - node_end_pfn[nid] + * node_end_pfn[nid] gets updated to start_addr + size + * need to make sure to fill in if it's the first time through this code. + */ + if (been_here_before == 0) { + printk("found chunk for nid %d\n", nid); + + start_addr = node_memory_chunk[j].start_paddr; + size = node_memory_chunk[j].size; + + node_start_pfn[nid] = (start_addr >> PAGE_SHIFT); + node_end_pfn[nid] = ((start_addr + size) >> PAGE_SHIFT); + + been_here_before = 1; + } else { + start_addr = node_memory_chunk[j].start_paddr; + size = node_memory_chunk[j].size; + + printk("HOLE: chunk %d nid %d start_paddr %16llx end_paddr %16llx size %16llx\n", + j, node_memory_chunk[j].nid, + node_memory_chunk[j].start_paddr, + node_memory_chunk[j].end_paddr, + node_memory_chunk[j].size); + + if (node_start_pfn[nid] < (start_addr >> PAGE_SHIFT)) { + printk("found a whole on nid %d, chunk %d\n", nid, j); + zholes_size[nid] = zholes_size[nid] + + ((start_addr >> PAGE_SHIFT) - node_end_pfn[nid]); + node_end_pfn[nid] = ((start_addr + size) >> PAGE_SHIFT); + } + } + printk("%s (%d): start_pfn = 0x%08lx end_pfn = %08lx\n", + __FUNCTION__, nid, node_start_pfn[nid], node_end_pfn[nid]); + printk("%s (%d): start=0x%llX size=0x%llX\n", + __FUNCTION__, nid, start_addr, size); + } + } + printk("%s (%d): start_pfn = 0x%08lx end_pfn = %08lx\n", + __FUNCTION__, nid, node_start_pfn[nid], node_end_pfn[nid]); + } + return 0; +} + + +#define kludge_to_virt(idx) (PAGE_SIZE * ((unsigned long)((u32 *)empty_zero_page - (u32 *)pg0) - NUM_KLUDGE_PAGES + (unsigned long)(idx)) ) + +#define pde_kludge(idx, phys) (PAGE_KLUDGE_START[idx] = 
((phys) & ~(PAGE_SIZE - 1)) | (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED)) + +/* + * Temporarily use the virtual area starting from PAGE_KLUDGE_START, + * to map the target physical address. By using this area, we can + * map up to NUM_KLUDGE_PAGES pages temporarily, i.e. until the next + * page_kludge() call. + */ +static __init void * page_kludge(unsigned long phys, unsigned long size) +{ + unsigned long base, offset, mapped_size; + int idx; + + offset = phys & (PAGE_SIZE - 1); + mapped_size = PAGE_SIZE - offset; + pde_kludge(0, phys); + base = kludge_to_virt(0); + __flush_tlb_one(base); + wbinvd(); + + printk("page_kludge(0x%lx, 0x%lx): idx=%d mapped at %lx\n", phys, size, + FIX_IO_APIC_BASE_END, base); + + /* + * Most cases can be covered by the below. + */ + idx = 0; + while (mapped_size < size) { + if (idx >= NUM_KLUDGE_PAGES) + return NULL; /* cannot handle this */ + phys += PAGE_SIZE; + pde_kludge(idx, phys); + __flush_tlb_one(kludge_to_virt(idx)); + mapped_size += PAGE_SIZE; + ++idx; + } + + return((void *)(base + offset)); +} + + +void __init get_memcfg_from_srat(void) +{ + struct acpi_table_header *header = NULL; + struct acpi_table_rsdp *rsdp = NULL; + struct acpi_table_rsdt *rsdt = NULL; + struct acpi_pointer * rsdp_address; + struct acpi_table_rsdt saved_rsdt; + int tables = 0; + int i = 0; + u32 pde_save[NUM_KLUDGE_PAGES]; + + acpi_find_root_pointer(ACPI_PHYSICAL_ADDRESSING, rsdp_address); + + if (rsdp_address->pointer_type == ACPI_PHYSICAL_POINTER) { + printk("%s: assigning address to rsdp\n", __FUNCTION__); + rsdp = (struct acpi_table_rsdp *)rsdp_address->pointer.physical; + } else { + printk("%s: rsdp_address is not a physical pointer\n", __FUNCTION__); + return; + } + if (!rsdp) { + printk("%s: Didn't find ACPI root!\n", __FUNCTION__); + return; + } + + printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision, + rsdp->oem_id); + + if (strncmp(rsdp->signature, RSDP_SIG,strlen(RSDP_SIG))) { + printk(KERN_WARNING "%s: 
RSDP table signature incorrect\n", __FUNCTION__); + return; + } + + printk("%s: calling page_kludge(0x%08X, %d)\n", __FUNCTION__, + rsdp->rsdt_address, sizeof(struct acpi_table_rsdt)); + rsdt = (struct acpi_table_rsdt *) + page_kludge(rsdp->rsdt_address, sizeof(struct acpi_table_rsdt)); + + if (!rsdt) { + printk(KERN_WARNING "%s: ACPI: Invalid root system description tables (RSDT)\n", __FUNCTION__); + return; + } + printk("%s: page_kludge returned 0x%08X\n", __FUNCTION__, (ulong)rsdt); + + header = & rsdt->header; + + if (strncmp(header->signature, RSDT_SIG, strlen(RSDT_SIG))) { + printk(KERN_WARNING "ACPI: RSDT signature incorrect\n"); + return; + } + + /* + * The number of tables is computed by taking the + * size of all entries (header size minus total + * size of RSDT) divided by the size of each entry + * (4-byte table pointers). + */ + tables = (header->length - sizeof(struct acpi_table_header)) / 4; +printk("tables = %d\n", tables); + + memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt)); + + if (saved_rsdt.header.length > sizeof(saved_rsdt)) { + printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n", saved_rsdt.header.length); + return; + } +printk("Begin table scan....\n"); + memcpy(pde_save, PAGE_KLUDGE_START, sizeof(pde_save)); + + for (i = 0; i < tables; i++) { + /* Map in header, then map in full table length. 
*/ + header = (struct acpi_table_header *) + page_kludge(saved_rsdt.entry[i], sizeof(struct acpi_table_header)); + if (!header) + break; + header = (struct acpi_table_header *) + page_kludge(saved_rsdt.entry[i], header->length); + if (!header) + break; + + if (strncmp((char *) &header->signature, "SRAT", 4)) + continue; +/* PATNOTE TRY THIS: acpi_table_compute_checksum() */ + /* if (acpi_checksum(header, header->length)) { */ + /* printk(KERN_WARNING "ACPI %s has invalid checksum\n", */ + /*XXX -john acpi_table_signatures[i]*/ /* i); */ + /* continue; */ + /* } */ + + acpi20_parse_srat((struct acpi_table_srat *)header); + goto out; + } + + printk("get_memcfg_from_srat: no SRAT found!\n"); + out: + /* Undo page kludge. */ + memcpy(PAGE_KLUDGE_START, pde_save, sizeof(pde_save)); + __flush_tlb(); + wbinvd(); +} + +unsigned long __init get_zholes_size(int nid) +{ + if((nid >= numnodes) | (nid >= MAX_NUMNODES)) + printk("%s: nid = %d is invalid. numnodes = %d", + __FUNCTION__, nid, numnodes); + return zholes_size[nid]; +} diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/time.c 90-mjb/arch/i386/kernel/time.c --- 00-virgin/arch/i386/kernel/time.c Tue Jan 14 10:06:13 2003 +++ 90-mjb/arch/i386/kernel/time.c Sun Feb 2 13:19:25 2003 @@ -70,7 +70,7 @@ u64 jiffies_64; unsigned long cpu_khz; /* Detected as we calibrate the TSC */ -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long wall_jiffies; spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; @@ -87,19 +87,21 @@ struct timer_opts* timer = &timer_none; */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec; - read_lock_irqsave(&xtime_lock, flags); - usec = timer->get_offset(); - { - unsigned long lost = jiffies - wall_jiffies; - if (lost) - usec += lost * (1000000 / HZ); - } - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + + usec = 
timer->get_offset(); + { + unsigned long lost = jiffies - wall_jiffies; + if (lost) + usec += lost * (1000000 / HZ); + } + sec = xtime.tv_sec; + usec += (xtime.tv_nsec / 1000); + } while (unlikely(seq != fr_read_end(&xtime_lock))); while (usec >= 1000000) { usec -= 1000000; @@ -112,7 +114,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* * This is revolting. We need to set "xtime" correctly. However, the * value in this location is the value at the most recent update of @@ -133,7 +135,7 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } /* @@ -265,6 +267,34 @@ static inline void do_timer_interrupt(in #endif } + +/* Lost tick detection and compensation */ +static inline void detect_lost_tick(void) +{ + /* read time since last interrupt */ + unsigned long delta = timer->get_offset(); + static unsigned long dbg_print; + + /* check if delta is greater then two ticks */ + if(delta >= 2*(1000000/HZ)){ + + /* only print debug info first 5 times */ + if(dbg_print < 5){ + printk(KERN_WARNING "\nWarning! Detected %lu micro-second" + " gap between interrupts.\n",delta); + printk(KERN_WARNING " Compensating for %lu lost ticks.\n", + delta/(1000000/HZ)-1); + /* dump trace info */ + show_trace(NULL); + dbg_print++; + } + /* calculate number of missed ticks */ + delta = delta/(1000000/HZ)-1; + jiffies += delta; + } + +} + /* * This is the same as the above, except we _also_ save the current * Time Stamp Counter value at the time of the timer interrupt, so that @@ -279,13 +309,14 @@ void timer_interrupt(int irq, void *dev_ * the irq version of write_lock because as just said we have irq * locally disabled. 
-arca */ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); + detect_lost_tick(); timer->mark_offset(); do_timer_interrupt(irq, NULL, regs); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/timers/Makefile 90-mjb/arch/i386/kernel/timers/Makefile --- 00-virgin/arch/i386/kernel/timers/Makefile Tue Jan 14 10:06:13 2003 +++ 90-mjb/arch/i386/kernel/timers/Makefile Sun Feb 2 13:19:14 2003 @@ -4,4 +4,4 @@ obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o -obj-$(CONFIG_X86_CYCLONE) += timer_cyclone.o +obj-$(CONFIG_X86_SUMMIT) += timer_cyclone.o diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/timers/timer.c 90-mjb/arch/i386/kernel/timers/timer.c --- 00-virgin/arch/i386/kernel/timers/timer.c Thu Jan 2 22:04:58 2003 +++ 90-mjb/arch/i386/kernel/timers/timer.c Sun Feb 2 13:19:14 2003 @@ -4,9 +4,14 @@ /* list of externed timers */ extern struct timer_opts timer_pit; extern struct timer_opts timer_tsc; - +#ifdef CONFIG_X86_SUMMIT +extern struct timer_opts timer_cyclone; +#endif /* list of timers, ordered by preference, NULL terminated */ static struct timer_opts* timers[] = { +#ifdef CONFIG_X86_SUMMIT + &timer_cyclone, +#endif &timer_tsc, &timer_pit, NULL, diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/timers/timer_cyclone.c 90-mjb/arch/i386/kernel/timers/timer_cyclone.c --- 00-virgin/arch/i386/kernel/timers/timer_cyclone.c Tue Jan 14 10:06:13 2003 +++ 90-mjb/arch/i386/kernel/timers/timer_cyclone.c Sun Feb 2 13:19:14 2003 @@ -17,7 +17,7 @@ #include extern spinlock_t i8253_lock; - +extern unsigned long fast_gettimeoffset_quotient; /* Number of usecs that the last interrupt was delayed */ static int delay_at_last_interrupt; @@ -142,6 +142,28 @@ static int init_cyclone(void) printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n"); cyclone_timer = 0; return -ENODEV; + } + } + + /* init fast_gettimeoffset_quotent and cpu_khz. 
+ * XXX - This should really be done elsewhere, + * and in a more generic fashion. -johnstul@us.ibm.com + */ + if (cpu_has_tsc) { + unsigned long tsc_quotient = calibrate_tsc(); + if (tsc_quotient) { + fast_gettimeoffset_quotient = tsc_quotient; + /* report CPU clock rate in Hz. + * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = + * clock/second. Our precision is about 100 ppm. + */ + { unsigned long eax=0, edx=1000; + __asm__("divl %2" + :"=a" (cpu_khz), "=d" (edx) + :"r" (tsc_quotient), + "0" (eax), "1" (edx)); + printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); + } } } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/timers/timer_tsc.c 90-mjb/arch/i386/kernel/timers/timer_tsc.c --- 00-virgin/arch/i386/kernel/timers/timer_tsc.c Tue Jan 14 10:06:13 2003 +++ 90-mjb/arch/i386/kernel/timers/timer_tsc.c Sun Feb 2 13:19:14 2003 @@ -130,7 +130,7 @@ static void delay_tsc(unsigned long loop #define CALIBRATE_LATCH (5 * LATCH) #define CALIBRATE_TIME (5 * 1000020/HZ) -static unsigned long __init calibrate_tsc(void) +unsigned long __init calibrate_tsc(void) { /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/traps.c 90-mjb/arch/i386/kernel/traps.c --- 00-virgin/arch/i386/kernel/traps.c Mon Jan 13 21:09:20 2003 +++ 90-mjb/arch/i386/kernel/traps.c Sat Feb 1 22:09:06 2003 @@ -50,6 +50,24 @@ #include #include +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + +#ifdef CONFIG_X86_REMOTE_DEBUG +gdb_debug_hook * linux_debug_hook; +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ + { \ + if (linux_debug_hook != (gdb_debug_hook *) NULL && !user_mode(regs)) \ + { \ + (*linux_debug_hook)(trapnr, signr, error_code, regs) ; \ + after; \ + } \ + } +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + asmlinkage int system_call(void); asmlinkage void lcall7(void); asmlinkage void lcall27(void); 
@@ -252,6 +270,7 @@ void die(const char * str, struct pt_reg bust_spinlocks(1); handle_BUG(regs); printk("%s: %04lx\n", str, err & 0xffff); + CHK_REMOTE_DEBUG(1,SIGTRAP,err,regs,) show_registers(regs); bust_spinlocks(0); spin_unlock_irq(&die_lock); @@ -312,6 +331,7 @@ static inline void do_trap(int trapnr, i #define DO_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,)\ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -329,7 +349,9 @@ asmlinkage void do_##name(struct pt_regs #define DO_VM86_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,return)\ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ + return; \ } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ @@ -374,8 +396,10 @@ gp_in_vm86: return; gp_in_kernel: - if (!fixup_exception(regs)) + if (!fixup_exception(regs)) { + CHK_REMOTE_DEBUG(13,SIGSEGV,error_code,regs,); die("general protection fault", regs, error_code); + } } static void mem_parity_error(unsigned char reason, struct pt_regs * regs) @@ -537,8 +561,10 @@ asmlinkage void do_debug(struct pt_regs * allowing programs to debug themselves without the ptrace() * interface. */ +#ifndef CONFIG_X86_REMOTE_DEBUG if ((regs->xcs & 3) == 0) goto clear_TF_reenable; +#endif if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) goto clear_TF; } @@ -550,11 +576,13 @@ asmlinkage void do_debug(struct pt_regs info.si_errno = 0; info.si_code = TRAP_BRKPT; - /* If this is a kernel mode trap, save the user PC on entry to - * the kernel, that's what the debugger can make sense of. - */ - info.si_addr = ((regs->xcs & 3) == 0) ? 
(void *)tsk->thread.eip : - (void *)regs->eip; + + /* If this is a kernel mode trap, we need to reset db7 to allow us + * to continue sanely */ + if ((regs->xcs & 3) == 0) + goto clear_dr7; + + info.si_addr = (void *)regs->eip; force_sig_info(SIGTRAP, &info, tsk); /* Disable additional traps. They'll be re-enabled when @@ -564,13 +592,16 @@ clear_dr7: __asm__("movl %0,%%db7" : /* no output */ : "r" (0)); + CHK_REMOTE_DEBUG(1,SIGTRAP,error_code,regs,) return; debug_vm86: handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); return; +#ifndef CONFIG_X86_REMOTE_DEBUG clear_TF_reenable: +#endif set_tsk_thread_flag(tsk, TIF_SINGLESTEP); clear_TF: regs->eflags &= ~TF_MASK; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/mm/discontig.c 90-mjb/arch/i386/mm/discontig.c --- 00-virgin/arch/i386/mm/discontig.c Sun Nov 17 20:29:47 2002 +++ 90-mjb/arch/i386/mm/discontig.c Sat Feb 1 22:21:15 2003 @@ -48,6 +48,14 @@ extern unsigned long max_low_pfn; extern unsigned long totalram_pages; extern unsigned long totalhigh_pages; +#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) + +unsigned long node_remap_start_pfn[MAX_NUMNODES]; +unsigned long node_remap_size[MAX_NUMNODES]; +unsigned long node_remap_offset[MAX_NUMNODES]; +void *node_remap_start_vaddr[MAX_NUMNODES]; +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); + /* * Find the highest page frame number we have available for the node */ @@ -65,12 +73,13 @@ static void __init find_max_pfn_node(int */ static void __init allocate_pgdat(int nid) { - unsigned long node_datasz; - - node_datasz = PFN_UP(sizeof(struct pglist_data)); - NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); - min_low_pfn += node_datasz; - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + if (nid) + NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; + else { + NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); + min_low_pfn += PFN_UP(sizeof(pg_data_t)); + 
memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + } } /* @@ -113,14 +122,6 @@ static void __init register_bootmem_low_ } } -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - -unsigned long node_remap_start_pfn[MAX_NUMNODES]; -unsigned long node_remap_size[MAX_NUMNODES]; -unsigned long node_remap_offset[MAX_NUMNODES]; -void *node_remap_start_vaddr[MAX_NUMNODES]; -extern void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - void __init remap_numa_kva(void) { void *vaddr; @@ -145,7 +146,7 @@ static unsigned long calculate_numa_rema for (nid = 1; nid < numnodes; nid++) { /* calculate the size of the mem_map needed in bytes */ size = (node_end_pfn[nid] - node_start_pfn[nid] + 1) - * sizeof(struct page); + * sizeof(struct page) + sizeof(pg_data_t); /* convert size to large (pmd size) pages, rounding up */ size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; /* now the roundup is correct, convert to PAGE_SIZE pages */ @@ -195,9 +196,9 @@ unsigned long __init setup_memory(void) printk("Low memory ends at vaddr %08lx\n", (ulong) pfn_to_kaddr(max_low_pfn)); for (nid = 0; nid < numnodes; nid++) { - allocate_pgdat(nid); node_remap_start_vaddr[nid] = pfn_to_kaddr( highstart_pfn - node_remap_offset[nid]); + allocate_pgdat(nid); printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, (ulong) node_remap_start_vaddr[nid], (ulong) pfn_to_kaddr(highstart_pfn @@ -251,13 +252,6 @@ unsigned long __init setup_memory(void) */ find_smp_config(); - /*insert other nodes into pgdat_list*/ - for (nid = 1; nid < numnodes; nid++){ - NODE_DATA(nid)->pgdat_next = pgdat_list; - pgdat_list = NODE_DATA(nid); - } - - #ifdef CONFIG_BLK_DEV_INITRD if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (system_max_low_pfn << PAGE_SHIFT)) { @@ -281,6 +275,19 @@ unsigned long __init setup_memory(void) void __init zone_sizes_init(void) { int nid; + unsigned long zholes_size; + + /* + * Insert nodes into pgdat_list backward so they appear in order. 
+ * Clobber node 0's links and NULL out pgdat_list before starting. + */ + pgdat_list = NULL; + for (nid = numnodes - 1; nid >= 0; nid--) { + if (nid) + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + NODE_DATA(nid)->pgdat_next = pgdat_list; + pgdat_list = NODE_DATA(nid); + } for (nid = 0; nid < numnodes; nid++) { unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; @@ -307,18 +314,24 @@ void __init zone_sizes_init(void) #endif } } + zholes_size = get_zholes_size(nid); /* * We let the lmem_map for node 0 be allocated from the * normal bootmem allocator, but other nodes come from the * remapped KVA area - mbligh */ - if (nid) + if (!nid) + free_area_init_node(nid, NODE_DATA(nid), 0, zones_size, + start, (unsigned long *) zholes_size); + else { + unsigned long lmem_map; + lmem_map = (unsigned long)node_remap_start_vaddr[nid]; + lmem_map += sizeof(pg_data_t) + PAGE_SIZE - 1; + lmem_map &= PAGE_MASK; free_area_init_node(nid, NODE_DATA(nid), - node_remap_start_vaddr[nid], zones_size, - start, 0); - else - free_area_init_node(nid, NODE_DATA(nid), 0, - zones_size, start, 0); + (struct page *)lmem_map, zones_size, + start, (unsigned long *) zholes_size); + } } return; } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/mm/fault.c 90-mjb/arch/i386/mm/fault.c --- 00-virgin/arch/i386/mm/fault.c Mon Jan 13 21:09:20 2003 +++ 90-mjb/arch/i386/mm/fault.c Sat Feb 1 22:09:06 2003 @@ -2,6 +2,11 @@ * linux/arch/i386/mm/fault.c * * Copyright (C) 1995 Linus Torvalds + * + * Change History + * + * Tigran Aivazian Remote debugging support. 
+ * */ #include @@ -20,6 +25,9 @@ #include #include /* For unblank_screen() */ #include +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif #include #include @@ -193,6 +201,15 @@ asmlinkage void do_page_fault(struct pt_ if (in_atomic() || !mm) goto no_context; +#ifdef CONFIG_X86_REMOTE_DEBUG + if (kgdb_memerr_expected) { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs) ; + return; /* return w/modified regs */ + } + } +#endif + down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -291,6 +308,19 @@ bad_area: force_sig_info(SIGSEGV, &info, tsk); return; } + +#ifdef CONFIG_X86_REMOTE_DEBUG + if (kgdb_memerr_expected) { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs); + return; /* Return with modified registers */ + } + } else { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs); + } + } +#endif #ifdef CONFIG_X86_F00F_BUG /* diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/mm/init.c 90-mjb/arch/i386/mm/init.c --- 00-virgin/arch/i386/mm/init.c Mon Jan 13 21:09:20 2003 +++ 90-mjb/arch/i386/mm/init.c Sun Feb 2 13:19:30 2003 @@ -508,20 +508,36 @@ void __init mem_init(void) #endif } -#if CONFIG_X86_PAE -struct kmem_cache_s *pae_pgd_cachep; +#include + +kmem_cache_t *pmd_cache; +kmem_cache_t *pgd_cache; + +void pmd_ctor(void *, kmem_cache_t *, unsigned long); +void pgd_ctor(void *, kmem_cache_t *, unsigned long); void __init pgtable_cache_init(void) { + if (PTRS_PER_PMD > 1) { + pmd_cache = kmem_cache_create("pae_pmd", + PTRS_PER_PMD*sizeof(pmd_t), + 0, + SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, + pmd_ctor, + NULL); + + if (!pmd_cache) + panic("pgtable_cache_init(): cannot create pmd cache"); + } + /* * PAE pgds must be 16-byte aligned: */ - pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); - if (!pae_pgd_cachep) - 
panic("init_pae(): Cannot alloc pae_pgd SLAB cache"); + pgd_cache = kmem_cache_create("pgd", PTRS_PER_PGD*sizeof(pgd_t), 0, + SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, pgd_ctor, NULL); + if (!pgd_cache) + panic("pgtable_cache_init(): Cannot create pgd cache"); } -#endif /* Put this after the callers, so that it cannot be inlined */ static int do_test_wp_bit(void) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/mm/pgtable.c 90-mjb/arch/i386/mm/pgtable.c --- 00-virgin/arch/i386/mm/pgtable.c Sun Nov 17 20:29:59 2002 +++ 90-mjb/arch/i386/mm/pgtable.c Sun Feb 2 13:19:30 2003 @@ -166,61 +166,60 @@ struct page *pte_alloc_one(struct mm_str return pte; } -#if CONFIG_X86_PAE +extern kmem_cache_t *pmd_cache; +extern kmem_cache_t *pgd_cache; -pgd_t *pgd_alloc(struct mm_struct *mm) +void pmd_ctor(void *__pmd, kmem_cache_t *pmd_cache, unsigned long flags) { - int i; - pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); - - if (pgd) { - for (i = 0; i < USER_PTRS_PER_PGD; i++) { - unsigned long pmd = __get_free_page(GFP_KERNEL); - if (!pmd) - goto out_oom; - clear_page(pmd); - set_pgd(pgd + i, __pgd(1 + __pa(pmd))); - } - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } - return pgd; -out_oom: - for (i--; i >= 0; i--) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); - return NULL; + clear_page(__pmd); } -void pgd_free(pgd_t *pgd) +void pgd_ctor(void *__pgd, kmem_cache_t *pgd_cache, unsigned long flags) { - int i; + pgd_t *pgd = __pgd; - for (i = 0; i < USER_PTRS_PER_PGD; i++) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); + if (PTRS_PER_PMD == 1) + memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + memcpy(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); } -#else - pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *pgd = 
(pgd_t *)__get_free_page(GFP_KERNEL); + int i; + pgd_t *pgd = kmem_cache_alloc(pgd_cache, SLAB_KERNEL); - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + if (PTRS_PER_PMD == 1) + return pgd; + else if (!pgd) + return NULL; + + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + pmd_t *pmd = kmem_cache_alloc(pmd_cache, SLAB_KERNEL); + if (!pmd) + goto out_oom; + set_pgd(pgd + i, __pgd(1 + __pa((unsigned long long)((unsigned long)pmd)))); } return pgd; + +out_oom: + for (i--; i >= 0; --i) + kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + kmem_cache_free(pgd_cache, (void *)pgd); + return NULL; } void pgd_free(pgd_t *pgd) { - free_page((unsigned long)pgd); -} + int i; -#endif /* CONFIG_X86_PAE */ + if (PTRS_PER_PMD > 1) { + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + set_pgd(pgd + i, __pgd(0)); + } + } + kmem_cache_free(pgd_cache, (void *)pgd); +} diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/oprofile/Makefile 90-mjb/arch/i386/oprofile/Makefile --- 00-virgin/arch/i386/oprofile/Makefile Sun Dec 1 09:59:46 2002 +++ 90-mjb/arch/i386/oprofile/Makefile Sun Feb 2 13:19:27 2003 @@ -7,4 +7,4 @@ DRIVER_OBJS = $(addprefix ../../../drive oprofile-y := $(DRIVER_OBJS) init.o timer_int.o oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \ - op_model_ppro.o + op_model_ppro.o op_model_p4.o diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/oprofile/nmi_int.c 90-mjb/arch/i386/oprofile/nmi_int.c --- 00-virgin/arch/i386/oprofile/nmi_int.c Thu Jan 2 22:04:59 2003 +++ 90-mjb/arch/i386/oprofile/nmi_int.c Sun Feb 2 13:19:27 2003 @@ -214,12 +214,61 @@ struct oprofile_operations nmi_ops = { .stop = nmi_stop }; + +#if !defined(CONFIG_X86_64) + +static int __init p4_init(enum oprofile_cpu * cpu) +{ + __u8 cpu_model = current_cpu_data.x86_model; 
+ + if (cpu_model > 3) + return 0; + +#ifndef CONFIG_SMP + *cpu = OPROFILE_CPU_P4; + model = &op_p4_spec; +#else + switch (smp_num_siblings) { + case 1: + *cpu = OPROFILE_CPU_P4; + model = &op_p4_spec; + return 1; + + case 2: + *cpu = OPROFILE_CPU_P4_HT2; + model = &op_p4_ht2_spec; + return 1; + } +#endif + + printk(KERN_INFO "oprofile: P4 HyperThreading detected with > 2 threads\n"); + printk(KERN_INFO "oprofile: Reverting to timer mode.\n"); + return 0; +} + + +static int __init ppro_init(enum oprofile_cpu * cpu) +{ + __u8 cpu_model = current_cpu_data.x86_model; + + if (cpu_model > 5) { + *cpu = OPROFILE_CPU_PIII; + } else if (cpu_model > 2) { + *cpu = OPROFILE_CPU_PII; + } else { + *cpu = OPROFILE_CPU_PPRO; + } + + model = &op_ppro_spec; + return 1; +} + +#endif /* !CONFIG_X86_64 */ int __init nmi_init(struct oprofile_operations ** ops, enum oprofile_cpu * cpu) { __u8 vendor = current_cpu_data.x86_vendor; __u8 family = current_cpu_data.x86; - __u8 cpu_model = current_cpu_data.x86_model; if (!cpu_has_apic) return 0; @@ -233,23 +282,26 @@ int __init nmi_init(struct oprofile_oper *cpu = OPROFILE_CPU_ATHLON; break; -#ifndef CONFIG_X86_64 +#if !defined(CONFIG_X86_64) case X86_VENDOR_INTEL: - /* Less than a P6-class processor */ - if (family != 6) - return 0; - - if (cpu_model > 5) { - *cpu = OPROFILE_CPU_PIII; - } else if (cpu_model > 2) { - *cpu = OPROFILE_CPU_PII; - } else { - *cpu = OPROFILE_CPU_PPRO; + switch (family) { + /* Pentium IV */ + case 0xf: + if (!p4_init(cpu)) + return 0; + break; + + /* A P6-class processor */ + case 6: + if (!ppro_init(cpu)) + return 0; + break; + + default: + return 0; } - - model = &op_ppro_spec; break; -#endif +#endif /* !CONFIG_X86_64 */ default: return 0; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/oprofile/op_counter.h 90-mjb/arch/i386/oprofile/op_counter.h --- 00-virgin/arch/i386/oprofile/op_counter.h Sun Nov 17 20:29:46 2002 +++ 90-mjb/arch/i386/oprofile/op_counter.h Sun Feb 2 13:19:27 2003 @@ -10,7 +10,7 @@ 
#ifndef OP_COUNTER_H #define OP_COUNTER_H -#define OP_MAX_COUNTER 4 +#define OP_MAX_COUNTER 8 /* Per-perfctr configuration as set via * oprofilefs. diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/oprofile/op_model_p4.c 90-mjb/arch/i386/oprofile/op_model_p4.c --- 00-virgin/arch/i386/oprofile/op_model_p4.c Wed Dec 31 16:00:00 1969 +++ 90-mjb/arch/i386/oprofile/op_model_p4.c Sun Feb 2 13:19:27 2003 @@ -0,0 +1,670 @@ +/** + * @file op_model_p4.c + * P4 model-specific MSR operations + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author Graydon Hoare + */ + +#include +#include +#include +#include +#include +#include + +#include "op_x86_model.h" +#include "op_counter.h" + +#define NUM_EVENTS 39 + +#define NUM_COUNTERS_NON_HT 8 +#define NUM_ESCRS_NON_HT 45 +#define NUM_CCCRS_NON_HT 18 +#define NUM_CONTROLS_NON_HT (NUM_ESCRS_NON_HT + NUM_CCCRS_NON_HT) + +#define NUM_COUNTERS_HT2 4 +#define NUM_ESCRS_HT2 23 +#define NUM_CCCRS_HT2 9 +#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) + +static unsigned int num_counters = NUM_COUNTERS_NON_HT; +static unsigned int num_cccrs = NUM_CCCRS_NON_HT; + + +/* this has to be checked dynamically since the + hyper-threadedness of a chip is discovered at + kernel boot-time. */ +static inline void setup_num_counters(void) +{ +#ifdef CONFIG_SMP + if (smp_num_siblings == 2) { + num_counters = NUM_COUNTERS_HT2; + num_cccrs = NUM_CCCRS_HT2; + } +#endif +} + + +/* tables to simulate simplified hardware view of p4 registers */ +struct p4_counter_binding { + int virt_counter; + int counter_address; + int cccr_address; +}; + +struct p4_event_binding { + int escr_select; /* value to put in CCCR */ + int event_select; /* value to put in ESCR */ + struct { + int virt_counter; /* for this counter... */ + int escr_address; /* use this ESCR */ + } bindings[2]; +}; + +/* nb: these CTR_* defines are a duplicate of defines in + libop/op_events.c. 
*/ + + +#define CTR_BPU_0 (1 << 0) +#define CTR_MS_0 (1 << 1) +#define CTR_FLAME_0 (1 << 2) +#define CTR_IQ_4 (1 << 3) +#define CTR_BPU_2 (1 << 4) +#define CTR_MS_2 (1 << 5) +#define CTR_FLAME_2 (1 << 6) +#define CTR_IQ_5 (1 << 7) + +static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { + { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, + { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, + { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, + { CTR_IQ_4, MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_CCCR4 }, + { CTR_BPU_2, MSR_P4_BPU_PERFCTR2, MSR_P4_BPU_CCCR2 }, + { CTR_MS_2, MSR_P4_MS_PERFCTR2, MSR_P4_MS_CCCR2 }, + { CTR_FLAME_2, MSR_P4_FLAME_PERFCTR2, MSR_P4_FLAME_CCCR2 }, + { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } +}; + +/* p4 event codes in libop/op_event.h are indices into this table. */ + +static struct p4_event_binding p4_events[NUM_EVENTS] = { + + { /* BRANCH_RETIRED */ + 0x05, 0x06, + { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, + {CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* MISPRED_BRANCH_RETIRED */ + 0x04, 0x03, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* TC_DELIVER_MODE */ + 0x01, 0x01, + { { CTR_MS_0, MSR_P4_TC_ESCR0}, + { CTR_MS_2, MSR_P4_TC_ESCR1} } + }, + + { /* BPU_FETCH_REQUEST */ + 0x00, 0x03, + { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, + { CTR_BPU_2, MSR_P4_BPU_ESCR1} } + }, + + { /* ITLB_REFERENCE */ + 0x03, 0x18, + { { CTR_BPU_0, MSR_P4_ITLB_ESCR0}, + { CTR_BPU_2, MSR_P4_ITLB_ESCR1} } + }, + + { /* MEMORY_CANCEL */ + 0x05, 0x02, + { { CTR_FLAME_0, MSR_P4_DAC_ESCR0}, + { CTR_FLAME_2, MSR_P4_DAC_ESCR1} } + }, + + { /* MEMORY_COMPLETE */ + 0x02, 0x08, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* LOAD_PORT_REPLAY */ + 0x02, 0x04, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* STORE_PORT_REPLAY */ + 0x02, 0x05, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* MOB_LOAD_REPLAY 
*/ + 0x02, 0x03, + { { CTR_BPU_0, MSR_P4_MOB_ESCR0}, + { CTR_BPU_2, MSR_P4_MOB_ESCR1} } + }, + + { /* PAGE_WALK_TYPE */ + 0x04, 0x01, + { { CTR_BPU_0, MSR_P4_PMH_ESCR0}, + { CTR_BPU_2, MSR_P4_PMH_ESCR1} } + }, + + { /* BSQ_CACHE_REFERENCE */ + 0x07, 0x0c, + { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, + { CTR_BPU_2, MSR_P4_BSU_ESCR1} } + }, + + { /* IOQ_ALLOCATION */ + 0x06, 0x03, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + {-1,-1} } + }, + + { /* IOQ_ACTIVE_ENTRIES */ + 0x06, 0x1a, + { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, + {-1,-1} } + }, + + { /* FSB_DATA_ACTIVITY */ + 0x06, 0x17, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + { CTR_BPU_2, MSR_P4_FSB_ESCR1} } + }, + + { /* BSQ_ALLOCATION */ + 0x07, 0x05, + { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, + {-1,-1} } + }, + + { /* BSQ_ACTIVE_ENTRIES */ + 0x07, 0x06, + { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, + {-1,-1} } + }, + + { /* X87_ASSIST */ + 0x05, 0x03, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* SSE_INPUT_ASSIST */ + 0x01, 0x34, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* PACKED_SP_UOP */ + 0x01, 0x08, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* PACKED_DP_UOP */ + 0x01, 0x0c, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* SCALAR_SP_UOP */ + 0x01, 0x0a, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* SCALAR_DP_UOP */ + 0x01, 0x0e, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* 64BIT_MMX_UOP */ + 0x01, 0x02, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* 128BIT_MMX_UOP */ + 0x01, 0x1a, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* X87_FP_UOP */ + 0x01, 0x04, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* X87_SIMD_MOVES_UOP */ + 0x01, 0x2e, + { { CTR_FLAME_0, 
MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* MACHINE_CLEAR */ + 0x05, 0x02, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* GLOBAL_POWER_EVENTS */ + 0x06, 0x13 /* manual says 0x05 */, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + { CTR_BPU_2, MSR_P4_FSB_ESCR1} } + }, + + { /* TC_MS_XFER */ + 0x00, 0x05, + { { CTR_MS_0, MSR_P4_MS_ESCR0}, + { CTR_MS_2, MSR_P4_MS_ESCR1} } + }, + + { /* UOP_QUEUE_WRITES */ + 0x00, 0x09, + { { CTR_MS_0, MSR_P4_MS_ESCR0}, + { CTR_MS_2, MSR_P4_MS_ESCR1} } + }, + + { /* FRONT_END_EVENT */ + 0x05, 0x08, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* EXECUTION_EVENT */ + 0x05, 0x0c, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* REPLAY_EVENT */ + 0x05, 0x09, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* INSTR_RETIRED */ + 0x04, 0x02, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* UOPS_RETIRED */ + 0x04, 0x01, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* UOP_TYPE */ + 0x02, 0x02, + { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, + { CTR_IQ_5, MSR_P4_RAT_ESCR1} } + }, + + { /* RETIRED_MISPRED_BRANCH_TYPE */ + 0x02, 0x05, + { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, + { CTR_MS_2, MSR_P4_TBPU_ESCR1} } + }, + + { /* RETIRED_BRANCH_TYPE */ + 0x02, 0x04, + { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, + { CTR_MS_2, MSR_P4_TBPU_ESCR1} } + } +}; + + +#define MISC_PMC_ENABLED_P(x) ((x) & 1 << 7) + +#define ESCR_RESERVED_BITS 0x80000003 +#define ESCR_CLEAR(escr) ((escr) &= ESCR_RESERVED_BITS) +#define ESCR_SET_USR_0(escr, usr) ((escr) |= (((usr) & 1) << 2)) +#define ESCR_SET_OS_0(escr, os) ((escr) |= (((os) & 1) << 3)) +#define ESCR_SET_USR_1(escr, usr) ((escr) |= (((usr) & 1))) +#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) +#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x1f) << 25)) +#define ESCR_SET_EVENT_MASK(escr, 
mask) ((escr) |= (((mask) & 0xffff) << 9)) +#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0); +#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0); + +#define CCCR_RESERVED_BITS 0x38030FFF +#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) +#define CCCR_SET_REQUIRED_BITS(cccr) ((cccr) |= 0x00030000) +#define CCCR_SET_ESCR_SELECT(cccr, sel) ((cccr) |= (((sel) & 0x07) << 13)) +#define CCCR_SET_PMI_OVF_0(cccr) ((cccr) |= (1<<26)) +#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) +#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) +#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) +#define CCCR_READ(low, high, i) do {rdmsr (p4_counters[(i)].cccr_address, (low), (high));} while (0); +#define CCCR_WRITE(low, high, i) do {wrmsr (p4_counters[(i)].cccr_address, (low), (high));} while (0); +#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) +#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) + +#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0); +#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0); +#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) + +/* these access the underlying cccrs 1-18, not the subset of 8 bound to "virtual counters" */ +#define RAW_CCCR_READ(low, high, i) do {rdmsr (MSR_P4_BPU_CCCR0 + (i), (low), (high));} while (0); +#define RAW_CCCR_WRITE(low, high, i) do {wrmsr (MSR_P4_BPU_CCCR0 + (i), (low), (high));} while (0); + + +/* this assigns a "stagger" to the current CPU, which is used throughout + the code in this module as an extra array offset, to select the "even" + or "odd" part of all the divided resources. */ +static inline unsigned int get_stagger(void) +{ +#ifdef CONFIG_SMP + int cpu; + if (smp_num_siblings > 1) { + cpu = smp_processor_id(); + return (cpu_sibling_map[cpu] > cpu) ? 
0 : 1; + } +#endif + return 0; +} + + +/* finally, mediate access to a real hardware counter + by passing a "virtual" counter number to this macro, + along with your stagger setting. */ +#define VIRT_CTR(stagger, i) ((i) + ((num_counters) * (stagger))) + +static unsigned long reset_value[NUM_COUNTERS_NON_HT]; + + +static void p4_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; + unsigned int addr, stag; + + setup_num_counters(); + stag = get_stagger(); + + /* the 8 counter registers we pay attention to */ + for (i = 0; i < num_counters; ++i) + msrs->counters.addrs[i] = + p4_counters[VIRT_CTR(stag, i)].counter_address; + + /* 18 CCCR registers */ + for (i=stag, addr = MSR_P4_BPU_CCCR0; + addr <= MSR_P4_IQ_CCCR5; ++i, addr += (1 + stag)) + msrs->controls.addrs[i] = addr; + + /* 43 ESCR registers */ + for (addr = MSR_P4_BSU_ESCR0; + addr <= MSR_P4_SSU_ESCR0; ++i, addr += (1 + stag)){ + msrs->controls.addrs[i] = addr; + } + + for (addr = MSR_P4_MS_ESCR0; + addr <= MSR_P4_TC_ESCR1; ++i, addr += (1 + stag)){ + msrs->controls.addrs[i] = addr; + } + + for (addr = MSR_P4_IX_ESCR0; + addr <= MSR_P4_CRU_ESCR3; ++i, addr += (1 + stag)){ + msrs->controls.addrs[i] = addr; + } + + /* there are 2 remaining non-contiguously located ESCRs */ + + if (num_counters == NUM_COUNTERS_NON_HT) { + /* standard non-HT CPUs handle both remaining ESCRs*/ + msrs->controls.addrs[i++] = MSR_P4_CRU_ESCR5; + msrs->controls.addrs[i++] = MSR_P4_CRU_ESCR4; + + } else if (stag == 0) { + /* HT CPUs give the first remainder to the even thread, as + the 32nd control register */ + msrs->controls.addrs[i++] = MSR_P4_CRU_ESCR4; + + } else { + /* and two copies of the second to the odd thread, + for the 31st and 32nd control registers */ + msrs->controls.addrs[i++] = MSR_P4_CRU_ESCR5; + msrs->controls.addrs[i++] = MSR_P4_CRU_ESCR5; + } +} + + +static void pmc_setup_one_p4_counter(unsigned int ctr) +{ + int i; + int const maxbind = 2; + unsigned int cccr = 0; + unsigned int escr = 0; + unsigned int 
high = 0; + unsigned int counter_bit; + struct p4_event_binding * ev = 0; + unsigned int stag; + + stag = get_stagger(); + + /* convert from counter *number* to counter *bit* */ + counter_bit = 1 << ctr; + + /* find our event binding structure. */ + if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { + printk(KERN_ERR + "oprofile: P4 event code 0x%lx out of range\n", + counter_config[ctr].event); + return; + } + + ev = &(p4_events[counter_config[ctr].event - 1]); + + for (i = 0; i < maxbind; i++) { + if (ev->bindings[i].virt_counter & counter_bit) { + + /* modify ESCR */ + ESCR_READ(escr, high, ev, i); + ESCR_CLEAR(escr); + if (stag == 0) { + ESCR_SET_USR_0(escr, counter_config[ctr].user); + ESCR_SET_OS_0(escr, counter_config[ctr].kernel); + } else { + ESCR_SET_USR_1(escr, counter_config[ctr].user); + ESCR_SET_OS_1(escr, counter_config[ctr].kernel); + } + ESCR_SET_EVENT_SELECT(escr, ev->event_select); + ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); + ESCR_WRITE(escr, high, ev, i); + + /* modify CCCR */ + CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); + CCCR_CLEAR(cccr); + CCCR_SET_REQUIRED_BITS(cccr); + CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); + if (stag == 0) { + CCCR_SET_PMI_OVF_0(cccr); + } else { + CCCR_SET_PMI_OVF_1(cccr); + } + CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); + return; + } + } +} + + +static void p4_setup_ctrs(struct op_msrs const * const msrs) +{ + unsigned int i; + unsigned int low, high; + unsigned int addr; + unsigned int stag; + + stag = get_stagger(); + + rdmsr(MSR_IA32_MISC_ENABLE, low, high); + if (! 
MISC_PMC_ENABLED_P(low)) { + printk(KERN_ERR "oprofile: P4 PMC not available\n"); + return; + } + + /* clear all cccrs (including those outside our concern) */ + for (i = stag ; i < num_cccrs ; i += (1 + stag)) { + RAW_CCCR_READ(low, high, i); + CCCR_CLEAR(low); + CCCR_SET_REQUIRED_BITS(low); + RAW_CCCR_WRITE(low, high, i); + } + + /* clear all escrs (including those outside our concern) */ + for (addr = MSR_P4_BSU_ESCR0 + stag; + addr <= MSR_P4_SSU_ESCR0; addr += (1 + stag)){ + wrmsr(addr, 0, 0); + } + + for (addr = MSR_P4_MS_ESCR0 + stag; + addr <= MSR_P4_TC_ESCR1; addr += (1 + stag)){ + wrmsr(addr, 0, 0); + } + + for (addr = MSR_P4_IX_ESCR0 + stag; + addr <= MSR_P4_CRU_ESCR3; addr += (1 + stag)){ + wrmsr(addr, 0, 0); + } + + if (num_counters == NUM_COUNTERS_NON_HT) { + wrmsr(MSR_P4_CRU_ESCR4, 0, 0); + wrmsr(MSR_P4_CRU_ESCR5, 0, 0); + } else if (stag == 0) { + wrmsr(MSR_P4_CRU_ESCR4, 0, 0); + } else { + wrmsr(MSR_P4_CRU_ESCR5, 0, 0); + } + + /* setup all counters */ + for (i = 0 ; i < num_counters ; ++i) { + if (counter_config[i].event) { + reset_value[i] = counter_config[i].count; + pmc_setup_one_p4_counter(i); + CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); + } else { + reset_value[i] = 0; + } + } +} + + +static int p4_check_ctrs(unsigned int const cpu, + struct op_msrs const * const msrs, + struct pt_regs * const regs) +{ + unsigned long ctr, low, high, stag, real; + int i; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + + if (!counter_config[i].event) + continue; + + /* + * there is some eccentricity in the hardware which + * requires that we perform 2 extra corrections: + * + * - check both the CCCR:OVF flag for overflow and the + * counter high bit for un-flagged overflows. + * + * - write the counter back twice to ensure it gets + * updated properly. 
+ * + * the former seems to be related to extra NMIs happening + * during the current NMI; the latter is reported as errata + * N15 in intel doc 249199-029, pentium 4 specification + * update, though their suggested work-around does not + * appear to solve the problem. + */ + + real = VIRT_CTR(stag, i); + + CCCR_READ(low, high, real); + CTR_READ(ctr, high, real); + if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { + oprofile_add_sample(regs->eip, i, cpu); + CTR_WRITE(reset_value[i], real); + CCCR_CLEAR_OVF(low); + CCCR_WRITE(low, high, real); + CTR_WRITE(reset_value[i], real); + /* P4 quirk: you have to re-unmask the apic vector */ + apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + return 1; + } + } + + /* P4 quirk: you have to re-unmask the apic vector */ + apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + return 0; +} + + +static void p4_start(struct op_msrs const * const msrs) +{ + unsigned int low, high, stag; + int i; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + if (!reset_value[i]) continue; + CCCR_READ(low, high, VIRT_CTR(stag, i)); + CCCR_SET_ENABLE(low); + CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + } +} + + +static void p4_stop(struct op_msrs const * const msrs) +{ + unsigned int low, high, stag; + int i; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + CCCR_READ(low, high, VIRT_CTR(stag, i)); + CCCR_SET_DISABLE(low); + CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + } +} + + +#ifdef CONFIG_SMP +struct op_x86_model_spec const op_p4_ht2_spec = { + .num_counters = NUM_COUNTERS_HT2, + .num_controls = NUM_CONTROLS_HT2, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + .check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop +}; +#endif + +struct op_x86_model_spec const op_p4_spec = { + .num_counters = NUM_COUNTERS_NON_HT, + .num_controls = NUM_CONTROLS_NON_HT, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + 
.check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop +}; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/oprofile/op_x86_model.h 90-mjb/arch/i386/oprofile/op_x86_model.h --- 00-virgin/arch/i386/oprofile/op_x86_model.h Sun Nov 17 20:29:28 2002 +++ 90-mjb/arch/i386/oprofile/op_x86_model.h Sun Feb 2 13:19:27 2003 @@ -11,8 +11,8 @@ #ifndef OP_X86_MODEL_H #define OP_X86_MODEL_H -/* will need re-working for Pentium IV */ -#define MAX_MSR 4 +/* Pentium IV needs all these */ +#define MAX_MSR 63 struct op_saved_msr { unsigned int high; @@ -47,6 +47,8 @@ struct op_x86_model_spec { }; extern struct op_x86_model_spec const op_ppro_spec; +extern struct op_x86_model_spec const op_p4_spec; +extern struct op_x86_model_spec const op_p4_ht2_spec; extern struct op_x86_model_spec const op_athlon_spec; #endif /* OP_X86_MODEL_H */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/pci/numa.c 90-mjb/arch/i386/pci/numa.c --- 00-virgin/arch/i386/pci/numa.c Thu Jan 9 19:15:56 2003 +++ 90-mjb/arch/i386/pci/numa.c Sun Feb 2 13:19:29 2003 @@ -17,7 +17,7 @@ static int __pci_conf1_mq_read (int seg, { unsigned long flags; - if (!value || (bus > 255) || (dev > 31) || (fn > 7) || (reg > 255)) + if (!value || (bus > MAX_MP_BUSSES) || (dev > 31) || (fn > 7) || (reg > 255)) return -EINVAL; spin_lock_irqsave(&pci_config_lock, flags); @@ -45,7 +45,7 @@ static int __pci_conf1_mq_write (int seg { unsigned long flags; - if ((bus > 255) || (dev > 31) || (fn > 7) || (reg > 255)) + if ((bus > MAX_MP_BUSSES) || (dev > 31) || (fn > 7) || (reg > 255)) return -EINVAL; spin_lock_irqsave(&pci_config_lock, flags); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/vmlinux.lds.S 90-mjb/arch/i386/vmlinux.lds.S --- 00-virgin/arch/i386/vmlinux.lds.S Fri Jan 17 09:18:20 2003 +++ 90-mjb/arch/i386/vmlinux.lds.S Sat Feb 1 22:04:44 2003 @@ -10,7 +10,7 @@ ENTRY(_start) jiffies = jiffies_64; SECTIONS { - . = 0xC0000000 + 0x100000; + . 
= __PAGE_OFFSET + 0x100000; /* read-only */ _text = .; /* Text and read-only data */ .text : { diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/ia64/kernel/time.c 90-mjb/arch/ia64/kernel/time.c --- 00-virgin/arch/ia64/kernel/time.c Sun Nov 17 20:29:28 2002 +++ 90-mjb/arch/ia64/kernel/time.c Sun Feb 2 13:19:25 2003 @@ -24,7 +24,7 @@ #include #include -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long wall_jiffies; extern unsigned long last_time_offset; @@ -89,7 +89,7 @@ gettimeoffset (void) void do_settimeofday (struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); { /* * This is revolting. We need to set "xtime" correctly. However, the value @@ -112,21 +112,21 @@ do_settimeofday (struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; } - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } void do_gettimeofday (struct timeval *tv) { - unsigned long flags, usec, sec, old; + unsigned long seq, usec, sec, old; - read_lock_irqsave(&xtime_lock, flags); - { + do { + seq = fr_read_begin(&xtime_lock); usec = gettimeoffset(); /* - * Ensure time never goes backwards, even when ITC on different CPUs are - * not perfectly synchronized. + * Ensure time never goes backwards, even when ITC on + * different CPUs are not perfectly synchronized. */ do { old = last_time_offset; @@ -138,8 +138,8 @@ do_gettimeofday (struct timeval *tv) sec = xtime.tv_sec; usec += xtime.tv_nsec / 1000; - } - read_unlock_irqrestore(&xtime_lock, flags); + } while (seq != fr_read_end(&xtime_lock)); + while (usec >= 1000000) { usec -= 1000000; @@ -182,10 +182,10 @@ timer_interrupt(int irq, void *dev_id, s * another CPU. We need to avoid to SMP race by acquiring the * xtime_lock. 
*/ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); do_timer(regs); local_cpu_data->itm_next = new_itm; - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } else local_cpu_data->itm_next = new_itm; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/m68k/kernel/time.c 90-mjb/arch/m68k/kernel/time.c --- 00-virgin/arch/m68k/kernel/time.c Sun Nov 17 20:29:29 2002 +++ 90-mjb/arch/m68k/kernel/time.c Sun Feb 2 13:19:26 2003 @@ -129,7 +129,7 @@ void time_init(void) mach_sched_init(timer_interrupt); } -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; /* * This version of gettimeofday has near microsecond resolution. @@ -137,17 +137,20 @@ extern rwlock_t xtime_lock; void do_gettimeofday(struct timeval *tv) { extern unsigned long wall_jiffies; - unsigned long flags; + unsigned long seq; unsigned long usec, sec, lost; - read_lock_irqsave(&xtime_lock, flags); - usec = mach_gettimeoffset(); - lost = jiffies - wall_jiffies; - if (lost) - usec += lost * (1000000/HZ); - sec = xtime.tv_sec; - usec += xtime.tv_nsec/1000; - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + + usec = mach_gettimeoffset(); + lost = jiffies - wall_jiffies; + if (lost) + usec += lost * (1000000/HZ); + sec = xtime.tv_sec; + usec += xtime.tv_nsec/1000; + } while (seq != fr_read_end(&xtime_lock)); + while (usec >= 1000000) { usec -= 1000000; @@ -162,7 +165,7 @@ void do_settimeofday(struct timeval *tv) { extern unsigned long wall_jiffies; - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* This is revolting. We need to set the xtime.tv_nsec * correctly. However, the value in this location is * is value at the last tick. 
@@ -183,5 +186,5 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/m68knommu/kernel/time.c 90-mjb/arch/m68knommu/kernel/time.c --- 00-virgin/arch/m68knommu/kernel/time.c Sun Nov 17 20:29:49 2002 +++ 90-mjb/arch/m68knommu/kernel/time.c Sun Feb 2 13:19:26 2003 @@ -126,21 +126,24 @@ void time_init(void) mach_sched_init(timer_interrupt); } -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; /* * This version of gettimeofday has near microsecond resolution. */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec; - read_lock_irqsave(&xtime_lock, flags); - usec = mach_gettimeoffset ? mach_gettimeoffset() : 0; - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + + usec = mach_gettimeoffset ? mach_gettimeoffset() : 0; + sec = xtime.tv_sec; + usec += (xtime.tv_nsec / 1000); + } while (seq != fr_read_end(&xtime_lock)); + while (usec >= 1000000) { usec -= 1000000; @@ -153,7 +156,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec * correctly. However, the value in this location is * is value at the last tick. 
@@ -174,5 +177,5 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips/au1000/common/time.c 90-mjb/arch/mips/au1000/common/time.c --- 00-virgin/arch/mips/au1000/common/time.c Sun Nov 17 20:29:31 2002 +++ 90-mjb/arch/mips/au1000/common/time.c Sun Feb 2 13:19:26 2003 @@ -44,7 +44,7 @@ unsigned long uart_baud_base; static unsigned long r4k_offset; /* Amount to increment compare reg each time */ static unsigned long r4k_cur; /* What counter should be at next timer irq */ -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; #define ALLINTS (IE_IRQ0 | IE_IRQ1 | IE_IRQ2 | IE_IRQ3 | IE_IRQ4 | IE_IRQ5) @@ -150,10 +150,10 @@ void __init time_init(void) set_cp0_status(ALLINTS); /* Read time from the RTC chipset. */ - write_lock_irqsave (&xtime_lock, flags); + fr_write_lock_irqsave (&xtime_lock, flags); xtime.tv_sec = get_mips_time(); xtime.tv_usec = 0; - write_unlock_irqrestore(&xtime_lock, flags); + fr_write_unlock_irqrestore(&xtime_lock, flags); } /* This is for machines which generate the exact clock. */ @@ -229,20 +229,23 @@ static unsigned long do_fast_gettimeoffs void do_gettimeofday(struct timeval *tv) { - unsigned int flags; + unsigned long seq; - read_lock_irqsave (&xtime_lock, flags); - *tv = xtime; - tv->tv_usec += do_fast_gettimeoffset(); + do { + seq = fr_read_begin(&xtime_lock); - /* - * xtime is atomically updated in timer_bh. jiffies - wall_jiffies - * is nonzero if the timer bottom half hasnt executed yet. - */ - if (jiffies - wall_jiffies) - tv->tv_usec += USECS_PER_JIFFY; + *tv = xtime; + tv->tv_usec += do_fast_gettimeoffset(); + + /* + * xtime is atomically updated in timer_bh. jiffies - wall_jiffies + * is nonzero if the timer bottom half hasnt executed yet. 
+ */ + if (jiffies - wall_jiffies) + tv->tv_usec += USECS_PER_JIFFY; + + } while (seq != fr_read_end(&xtime_lock)); - read_unlock_irqrestore (&xtime_lock, flags); if (tv->tv_usec >= 1000000) { tv->tv_usec -= 1000000; @@ -252,7 +255,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq (&xtime_lock); + fr_write_lock_irq (&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec correctly. * However, the value in this location is is value at the last tick. @@ -272,7 +275,7 @@ void do_settimeofday(struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq (&xtime_lock); + fr_write_unlock_irq (&xtime_lock); } /* diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips/baget/time.c 90-mjb/arch/mips/baget/time.c --- 00-virgin/arch/mips/baget/time.c Sun Nov 17 20:29:20 2002 +++ 90-mjb/arch/mips/baget/time.c Sun Feb 2 13:19:26 2003 @@ -23,7 +23,7 @@ #include -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; /* * To have precision clock, we need to fix available clock frequency @@ -79,20 +79,21 @@ void __init time_init(void) void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; - read_lock_irqsave (&xtime_lock, flags); - *tv = xtime; - read_unlock_irqrestore (&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + *tv = xtime; + } while (seq != fr_read_end(&xtime_lock)); } void do_settimeofday(struct timeval *tv) { - write_lock_irq (&xtime_lock); + fr_write_lock_irq (&xtime_lock); xtime = *tv; time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq (&xtime_lock); + fr_write_unlock_irq (&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips/dec/time.c 90-mjb/arch/mips/dec/time.c --- 00-virgin/arch/mips/dec/time.c Thu Jan 2 22:05:00 2003 +++ 90-mjb/arch/mips/dec/time.c Sun Feb 2 13:19:26 2003 
@@ -35,7 +35,7 @@ extern void (*board_time_init)(struct irqaction *irq); extern volatile unsigned long wall_jiffies; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; /* * Change this if you have some constant time drift @@ -210,20 +210,22 @@ static unsigned long (*do_gettimeoffset) */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; - read_lock_irqsave(&xtime_lock, flags); - *tv = xtime; - tv->tv_usec += do_gettimeoffset(); + do { + seq = fr_read_begin(&xtime_lock); + *tv = xtime; + tv->tv_usec += do_gettimeoffset(); - /* - * xtime is atomically updated in timer_bh. jiffies - wall_jiffies - * is nonzero if the timer bottom half hasnt executed yet. - */ - if (jiffies - wall_jiffies) - tv->tv_usec += USECS_PER_JIFFY; + /* + * xtime is atomically updated in timer_bh. jiffies - wall_jiffies + * is nonzero if the timer bottom half hasnt executed yet. + */ + if (jiffies - wall_jiffies) + tv->tv_usec += USECS_PER_JIFFY; + + } while (seq != fr_read_end(&xtime_lock)); - read_unlock_irqrestore(&xtime_lock, flags); if (tv->tv_usec >= 1000000) { tv->tv_usec -= 1000000; @@ -233,7 +235,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec * correctly. However, the value in this location is @@ -254,7 +256,7 @@ void do_settimeofday(struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } /* @@ -330,6 +332,7 @@ static inline void timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { volatile unsigned char dummy; + unsigned long seq; dummy = CMOS_READ(RTC_REG_C); /* ACK RTC Interrupt */ @@ -357,23 +360,27 @@ timer_interrupt(int irq, void *dev_id, s * CMOS clock accordingly every ~11 minutes. 
Set_rtc_mmss() has to be * called as close as possible to 500 ms before the new second starts. */ - read_lock(&xtime_lock); - if ((time_status & STA_UNSYNC) == 0 - && xtime.tv_sec > last_rtc_update + 660 - && xtime.tv_usec >= 500000 - tick / 2 - && xtime.tv_usec <= 500000 + tick / 2) { - if (set_rtc_mmss(xtime.tv_sec) == 0) - last_rtc_update = xtime.tv_sec; - else - /* do it again in 60 s */ - last_rtc_update = xtime.tv_sec - 600; - } + do { + seq = fr_read_begin(&xtime_lock); + + if ((time_status & STA_UNSYNC) == 0 + && xtime.tv_sec > last_rtc_update + 660 + && xtime.tv_usec >= 500000 - tick / 2 + && xtime.tv_usec <= 500000 + tick / 2) { + if (set_rtc_mmss(xtime.tv_sec) == 0) + last_rtc_update = xtime.tv_sec; + else + /* do it again in 60 s */ + last_rtc_update = xtime.tv_sec - 600; + } + } while (seq != fr_read_end(&xtime_lock)); + /* As we return to user mode fire off the other CPU schedulers.. this is basically because we don't yet share IRQ's around. This message is rigged to be safe on the 386 - basically it's a hack, so don't look closely for now.. 
*/ /*smp_message_pass(MSG_ALL_BUT_SELF, MSG_RESCHEDULE, 0L, 0); */ - read_unlock(&xtime_lock); + } static void r4k_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) @@ -470,10 +477,10 @@ void __init time_init(void) real_year = CMOS_READ(RTC_DEC_YEAR); year += real_year - 72 + 2000; - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); xtime.tv_sec = mktime(year, mon, day, hour, min, sec); xtime.tv_usec = 0; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); if (mips_cpu.options & MIPS_CPU_COUNTER) { write_32bit_cp0_register(CP0_COUNT, 0); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips/ite-boards/generic/time.c 90-mjb/arch/mips/ite-boards/generic/time.c --- 00-virgin/arch/mips/ite-boards/generic/time.c Sun Nov 17 20:29:32 2002 +++ 90-mjb/arch/mips/ite-boards/generic/time.c Sun Feb 2 13:19:26 2003 @@ -38,7 +38,7 @@ extern void enable_cpu_timer(void); extern volatile unsigned long wall_jiffies; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; unsigned long missed_heart_beats = 0; static long last_rtc_update = 0; @@ -119,6 +119,8 @@ static int set_rtc_mmss(unsigned long no */ void mips_timer_interrupt(struct pt_regs *regs) { + unsigned long seq; + if (r4k_offset == 0) goto null; @@ -133,18 +135,22 @@ void mips_timer_interrupt(struct pt_regs * within 500ms before the * next second starts, * thus the following code. 
*/ - read_lock(&xtime_lock); - if ((time_status & STA_UNSYNC) == 0 - && xtime.tv_sec > last_rtc_update + 660 - && xtime.tv_usec >= 500000 - (tick >> 1) - && xtime.tv_usec <= 500000 + (tick >> 1)) - if (set_rtc_mmss(xtime.tv_sec) == 0) - last_rtc_update = xtime.tv_sec; - else { - /* do it again in 60 s */ - last_rtc_update = xtime.tv_sec - 600; - } - read_unlock(&xtime_lock); + do { + seq = fr_read_begin(&xtime_lock); + + + if ((time_status & STA_UNSYNC) == 0 + && xtime.tv_sec > last_rtc_update + 660 + && xtime.tv_usec >= 500000 - (tick >> 1) + && xtime.tv_usec <= 500000 + (tick >> 1)) + if (set_rtc_mmss(xtime.tv_sec) == 0) + last_rtc_update = xtime.tv_sec; + else { + /* do it again in 60 s */ + last_rtc_update = xtime.tv_sec - 600; + } + + } while (seq != fr_read_end(&xtime_lock)); r4k_cur += r4k_offset; ack_r4ktimer(r4k_cur); @@ -247,10 +253,10 @@ void __init time_init(void) enable_cpu_timer(); /* Read time from the RTC chipset. */ - write_lock_irqsave (&xtime_lock, flags); + fr_write_lock_irqsave (&xtime_lock, flags); xtime.tv_sec = get_mips_time(); xtime.tv_usec = 0; - write_unlock_irqrestore(&xtime_lock, flags); + fr_write_unlock_irqrestore(&xtime_lock, flags); } /* This is for machines which generate the exact clock. */ @@ -332,20 +338,23 @@ static unsigned long do_fast_gettimeoffs void do_gettimeofday(struct timeval *tv) { - unsigned int flags; + unsigned int seq; - read_lock_irqsave (&xtime_lock, flags); - *tv = xtime; - tv->tv_usec += do_fast_gettimeoffset(); + do { + seq = fr_read_begin(&xtime_lock); - /* - * xtime is atomically updated in timer_bh. jiffies - wall_jiffies - * is nonzero if the timer bottom half hasnt executed yet. - */ - if (jiffies - wall_jiffies) - tv->tv_usec += USECS_PER_JIFFY; + *tv = xtime; + tv->tv_usec += do_fast_gettimeoffset(); + + /* + * xtime is atomically updated in timer_bh. + * jiffies - wall_jiffies + * is nonzero if the timer bottom half hasnt executed yet. 
+ */ + if (jiffies - wall_jiffies) + tv->tv_usec += USECS_PER_JIFFY; - read_unlock_irqrestore (&xtime_lock, flags); + } while (seq != fr_read_end(&xtime_lock)); if (tv->tv_usec >= 1000000) { tv->tv_usec -= 1000000; @@ -355,7 +364,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq (&xtime_lock); + fr_write_lock_irq (&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec correctly. * However, the value in this location is is value at the last tick. @@ -375,5 +384,5 @@ void do_settimeofday(struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq (&xtime_lock); + fr_write_unlock_irq (&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips/kernel/sysirix.c 90-mjb/arch/mips/kernel/sysirix.c --- 00-virgin/arch/mips/kernel/sysirix.c Thu Jan 2 22:05:00 2003 +++ 90-mjb/arch/mips/kernel/sysirix.c Sun Feb 2 13:19:26 2003 @@ -615,19 +615,19 @@ asmlinkage int irix_getgid(struct pt_reg return current->gid; } -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; asmlinkage int irix_stime(int value) { if (!capable(CAP_SYS_TIME)) return -EPERM; - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); xtime.tv_sec = value; xtime.tv_usec = 0; time_maxerror = MAXPHASE; time_esterror = MAXPHASE; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); return 0; } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips/kernel/time.c 90-mjb/arch/mips/kernel/time.c --- 00-virgin/arch/mips/kernel/time.c Sun Nov 17 20:29:32 2002 +++ 90-mjb/arch/mips/kernel/time.c Sun Feb 2 13:19:26 2003 @@ -37,7 +37,7 @@ u64 jiffies_64; /* * forward reference */ -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern volatile unsigned long wall_jiffies; /* @@ -62,20 +62,23 @@ int (*rtc_set_time)(unsigned long) = nul */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; - read_lock_irqsave 
(&xtime_lock, flags); - *tv = xtime; - tv->tv_usec += do_gettimeoffset(); + do { + seq = fr_read_begin(&xtime_lock); - /* - * xtime is atomically updated in timer_bh. jiffies - wall_jiffies - * is nonzero if the timer bottom half hasnt executed yet. - */ - if (jiffies - wall_jiffies) - tv->tv_usec += USECS_PER_JIFFY; + *tv = xtime; + tv->tv_usec += do_gettimeoffset(); + + /* + * xtime is atomically updated in timer_bh. + * jiffies - wall_jiffies + * is nonzero if the timer bottom half hasnt executed yet. + */ + if (jiffies - wall_jiffies) + tv->tv_usec += USECS_PER_JIFFY; + } while (seq != fr_read_end(&xtime_lock)); - read_unlock_irqrestore (&xtime_lock, flags); if (tv->tv_usec >= 1000000) { tv->tv_usec -= 1000000; @@ -85,7 +88,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq (&xtime_lock); + fr_write_lock_irq (&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec * correctly. However, the value in this location is @@ -105,7 +108,7 @@ void do_settimeofday(struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq (&xtime_lock); + fr_write_unlock_irq (&xtime_lock); } @@ -291,6 +294,8 @@ unsigned long calibrate_div64_gettimeoff */ void timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { + unsigned long seq; + if (mips_cpu.options & MIPS_CPU_COUNTER) { unsigned int count; @@ -340,19 +345,21 @@ void timer_interrupt(int irq, void *dev_ * CMOS clock accordingly every ~11 minutes. rtc_set_time() has to be * called as close as possible to 500 ms before the new second starts. 
*/ - read_lock (&xtime_lock); - if ((time_status & STA_UNSYNC) == 0 && - xtime.tv_sec > last_rtc_update + 660 && - xtime.tv_usec >= 500000 - ((unsigned) tick) / 2 && - xtime.tv_usec <= 500000 + ((unsigned) tick) / 2) { - if (rtc_set_time(xtime.tv_sec) == 0) { - last_rtc_update = xtime.tv_sec; - } else { - last_rtc_update = xtime.tv_sec - 600; - /* do it again in 60 s */ + do { + seq = fr_read_begin(&xtime_lock); + + if ((time_status & STA_UNSYNC) == 0 && + xtime.tv_sec > last_rtc_update + 660 && + xtime.tv_usec >= 500000 - ((unsigned) tick) / 2 && + xtime.tv_usec <= 500000 + ((unsigned) tick) / 2) { + if (rtc_set_time(xtime.tv_sec) == 0) { + last_rtc_update = xtime.tv_sec; + } else { + last_rtc_update = xtime.tv_sec - 600; + /* do it again in 60 s */ + } } - } - read_unlock (&xtime_lock); + } while (seq != fr_read_end(&xtime_lock)); /* * If jiffies has overflowed in this timer_interrupt we must diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips/mips-boards/generic/time.c 90-mjb/arch/mips/mips-boards/generic/time.c --- 00-virgin/arch/mips/mips-boards/generic/time.c Sun Nov 17 20:29:49 2002 +++ 90-mjb/arch/mips/mips-boards/generic/time.c Sun Feb 2 13:19:26 2003 @@ -45,7 +45,7 @@ unsigned long missed_heart_beats = 0; static unsigned long r4k_offset; /* Amount to increment compare reg each time */ static unsigned long r4k_cur; /* What counter should be at next timer irq */ -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; #define ALLINTS (IE_IRQ0 | IE_IRQ1 | IE_IRQ2 | IE_IRQ3 | IE_IRQ4 | IE_IRQ5) @@ -134,6 +134,7 @@ static int set_rtc_mmss(unsigned long no void mips_timer_interrupt(struct pt_regs *regs) { int irq = 7; + unsigned long seq; if (r4k_offset == 0) goto null; @@ -149,18 +150,21 @@ void mips_timer_interrupt(struct pt_regs * within 500ms before the * next second starts, * thus the following code. 
*/ - read_lock(&xtime_lock); - if ((time_status & STA_UNSYNC) == 0 - && xtime.tv_sec > last_rtc_update + 660 - && xtime.tv_usec >= 500000 - (tick >> 1) - && xtime.tv_usec <= 500000 + (tick >> 1)) - if (set_rtc_mmss(xtime.tv_sec) == 0) - last_rtc_update = xtime.tv_sec; - else - /* do it again in 60 s */ - last_rtc_update = xtime.tv_sec - 600; - read_unlock(&xtime_lock); + do { + seq = fr_read_begin(&xtime_lock); + + if ((time_status & STA_UNSYNC) == 0 + && xtime.tv_sec > last_rtc_update + 660 + && xtime.tv_usec >= 500000 - (tick >> 1) + && xtime.tv_usec <= 500000 + (tick >> 1)) + if (set_rtc_mmss(xtime.tv_sec) == 0) + last_rtc_update = xtime.tv_sec; + else + /* do it again in 60 s */ + last_rtc_update = xtime.tv_sec - 600; + } while (seq != fr_read_end(&xtime_lock)); + if ((timer_tick_count++ % HZ) == 0) { mips_display_message(&display_string[display_count++]); if (display_count == MAX_DISPLAY_COUNT) @@ -267,10 +271,10 @@ void __init time_init(void) change_cp0_status(ST0_IM, ALLINTS); /* Read time from the RTC chipset. */ - write_lock_irqsave (&xtime_lock, flags); + fr_write_lock_irqsave (&xtime_lock, flags); xtime.tv_sec = get_mips_time(); xtime.tv_usec = 0; - write_unlock_irqrestore(&xtime_lock, flags); + fr_write_unlock_irqrestore(&xtime_lock, flags); } /* This is for machines which generate the exact clock. */ @@ -363,20 +367,23 @@ static unsigned long do_fast_gettimeoffs void do_gettimeofday(struct timeval *tv) { - unsigned int flags; + unsigned long seq; + + do { + seq = fr_read_begin(&xtime_lock); - read_lock_irqsave (&xtime_lock, flags); - *tv = xtime; - tv->tv_usec += do_fast_gettimeoffset(); + *tv = xtime; + tv->tv_usec += do_fast_gettimeoffset(); - /* - * xtime is atomically updated in timer_bh. jiffies - wall_jiffies - * is nonzero if the timer bottom half hasnt executed yet. - */ - if (jiffies - wall_jiffies) - tv->tv_usec += USECS_PER_JIFFY; + /* + * xtime is atomically updated in timer_bh. 
+ * jiffies - wall_jiffies + * is nonzero if the timer bottom half hasnt executed yet. + */ + if (jiffies - wall_jiffies) + tv->tv_usec += USECS_PER_JIFFY; - read_unlock_irqrestore (&xtime_lock, flags); + } while (seq != fr_read_end(&xtime_lock)); if (tv->tv_usec >= 1000000) { tv->tv_usec -= 1000000; @@ -386,7 +393,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq (&xtime_lock); + fr_write_lock_irq (&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec correctly. * However, the value in this location is is value at the last tick. @@ -406,5 +413,5 @@ void do_settimeofday(struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq (&xtime_lock); + fr_write_unlock_irq (&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips/philips/nino/time.c 90-mjb/arch/mips/philips/nino/time.c --- 00-virgin/arch/mips/philips/nino/time.c Sun Nov 17 20:29:20 2002 +++ 90-mjb/arch/mips/philips/nino/time.c Sun Feb 2 13:19:26 2003 @@ -24,7 +24,7 @@ #include extern volatile unsigned long wall_jiffies; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; static struct timeval xbase; @@ -61,30 +61,31 @@ void inline readRTC(unsigned long *high, */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long high, low; - read_lock_irqsave(&xtime_lock, flags); - // 40 bit RTC, driven by 32khz source: - // +-----------+-----------------------------------------+ - // | HHHH.HHHH | LLLL.LLLL.LLLL.LLLL.LMMM.MMMM.MMMM.MMMM | - // +-----------+-----------------------------------------+ - readRTC(&high,&low); - tv->tv_sec = (high << 17) | (low >> 15); - tv->tv_usec = (low % 32768) * 1953 / 64; - tv->tv_sec += xbase.tv_sec; - tv->tv_usec += xbase.tv_usec; + do { + seq = fr_read_begin(&xtime_lock); - tv->tv_usec += do_gettimeoffset(); - - /* - * xtime is atomically updated in timer_bh. 
lost_ticks is - * nonzero if the timer bottom half hasnt executed yet. - */ - if (jiffies - wall_jiffies) - tv->tv_usec += USECS_PER_JIFFY; - - read_unlock_irqrestore(&xtime_lock, flags); + // 40 bit RTC, driven by 32khz source: + // +-----------+-----------------------------------------+ + // | HHHH.HHHH | LLLL.LLLL.LLLL.LLLL.LMMM.MMMM.MMMM.MMMM | + // +-----------+-----------------------------------------+ + readRTC(&high,&low); + tv->tv_sec = (high << 17) | (low >> 15); + tv->tv_usec = (low % 32768) * 1953 / 64; + tv->tv_sec += xbase.tv_sec; + tv->tv_usec += xbase.tv_usec; + + tv->tv_usec += do_gettimeoffset(); + + /* + * xtime is atomically updated in timer_bh. lost_ticks is + * nonzero if the timer bottom half hasnt executed yet. + */ + if (jiffies - wall_jiffies) + tv->tv_usec += USECS_PER_JIFFY; + } while (seq != fr_read_end(&xtime_lock)); if (tv->tv_usec >= 1000000) { tv->tv_usec -= 1000000; @@ -94,7 +95,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec * correctly. However, the value in this location is * is value at the last tick. 
@@ -118,7 +119,7 @@ void do_settimeofday(struct timeval *tv) time_state = TIME_BAD; time_maxerror = MAXPHASE; time_esterror = MAXPHASE; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } static int set_rtc_mmss(unsigned long nowtime) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips64/mips-boards/generic/time.c 90-mjb/arch/mips64/mips-boards/generic/time.c --- 00-virgin/arch/mips64/mips-boards/generic/time.c Sun Nov 17 20:29:27 2002 +++ 90-mjb/arch/mips64/mips-boards/generic/time.c Sun Feb 2 13:19:26 2003 @@ -44,7 +44,7 @@ unsigned long missed_heart_beats = 0; static unsigned long r4k_offset; /* Amount to increment compare reg each time */ static unsigned long r4k_cur; /* What counter should be at next timer irq */ -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; #define ALLINTS (IE_IRQ0 | IE_IRQ1 | IE_IRQ2 | IE_IRQ3 | IE_IRQ4 | IE_IRQ5) @@ -133,6 +133,7 @@ static int set_rtc_mmss(unsigned long no void mips_timer_interrupt(struct pt_regs *regs) { int irq = 7; + unsigned long seq; if (r4k_offset == 0) goto null; @@ -148,17 +149,20 @@ void mips_timer_interrupt(struct pt_regs * within 500ms before the * next second starts, * thus the following code. 
*/ - read_lock(&xtime_lock); - if ((time_status & STA_UNSYNC) == 0 - && xtime.tv_sec > last_rtc_update + 660 - && xtime.tv_usec >= 500000 - (tick >> 1) - && xtime.tv_usec <= 500000 + (tick >> 1)) - if (set_rtc_mmss(xtime.tv_sec) == 0) - last_rtc_update = xtime.tv_sec; - else - /* do it again in 60 s */ - last_rtc_update = xtime.tv_sec - 600; - read_unlock(&xtime_lock); + do { + seq = fr_read_begin(&xtime_lock); + + if ((time_status & STA_UNSYNC) == 0 + && xtime.tv_sec > last_rtc_update + 660 + && xtime.tv_usec >= 500000 - (tick >> 1) + && xtime.tv_usec <= 500000 + (tick >> 1)) + if (set_rtc_mmss(xtime.tv_sec) == 0) + last_rtc_update = xtime.tv_sec; + else + /* do it again in 60 s */ + last_rtc_update = xtime.tv_sec - 600; + } while (seq != fr_read_end(&xtime_lock)); + if ((timer_tick_count++ % HZ) == 0) { mips_display_message(&display_string[display_count++]); @@ -266,10 +270,10 @@ void __init time_init(void) set_cp0_status(ST0_IM, ALLINTS); /* Read time from the RTC chipset. */ - write_lock_irqsave (&xtime_lock, flags); + fr_write_lock_irqsave (&xtime_lock, flags); xtime.tv_sec = get_mips_time(); xtime.tv_usec = 0; - write_unlock_irqrestore(&xtime_lock, flags); + fr_write_unlock_irqrestore(&xtime_lock, flags); } /* This is for machines which generate the exact clock. */ @@ -352,20 +356,24 @@ static unsigned long do_fast_gettimeoffs void do_gettimeofday(struct timeval *tv) { - unsigned int flags; + unsigned long seq; - read_lock_irqsave (&xtime_lock, flags); - *tv = xtime; - tv->tv_usec += do_fast_gettimeoffset(); + do { + seq = fr_read_begin(&xtime_lock); - /* - * xtime is atomically updated in timer_bh. jiffies - wall_jiffies - * is nonzero if the timer bottom half hasnt executed yet. - */ - if (jiffies - wall_jiffies) - tv->tv_usec += USECS_PER_JIFFY; + *tv = xtime; + tv->tv_usec += do_fast_gettimeoffset(); + + /* + * xtime is atomically updated in timer_bh. + * jiffies - wall_jiffies + * is nonzero if the timer bottom half hasnt executed yet. 
+ */ + if (jiffies - wall_jiffies) + tv->tv_usec += USECS_PER_JIFFY; + + } while (seq != fr_read_end(&xtime_lock)); - read_unlock_irqrestore (&xtime_lock, flags); if (tv->tv_usec >= 1000000) { tv->tv_usec -= 1000000; @@ -375,7 +383,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq (&xtime_lock); + fr_write_lock_irq (&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec correctly. * However, the value in this location is is value at the last tick. @@ -395,5 +403,5 @@ void do_settimeofday(struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq (&xtime_lock); + fr_write_unlock_irq (&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips64/sgi-ip22/ip22-timer.c 90-mjb/arch/mips64/sgi-ip22/ip22-timer.c --- 00-virgin/arch/mips64/sgi-ip22/ip22-timer.c Sun Nov 17 20:29:46 2002 +++ 90-mjb/arch/mips64/sgi-ip22/ip22-timer.c Sun Feb 2 13:19:26 2003 @@ -32,7 +32,7 @@ static unsigned long r4k_offset; /* Amount to increment compare reg each time */ static unsigned long r4k_cur; /* What counter should be at next timer irq */ -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; static inline void ack_r4ktimer(unsigned long newval) { @@ -86,7 +86,7 @@ void indy_timer_interrupt(struct pt_regs unsigned long count; int irq = 7; - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); /* Ack timer and compute new compare. */ count = read_32bit_cp0_register(CP0_COUNT); /* This has races. 
*/ @@ -116,7 +116,7 @@ void indy_timer_interrupt(struct pt_regs /* do it again in 60s */ last_rtc_update = xtime.tv_sec - 600; } - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } static unsigned long dosample(volatile unsigned char *tcwp, @@ -224,10 +224,10 @@ void __init indy_timer_init(void) set_cp0_status(ST0_IM, ALLINTS); sti(); - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); xtime.tv_sec = get_indy_time(); /* Read time from RTC. */ xtime.tv_usec = 0; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } void indy_8254timer_irq(void) @@ -243,20 +243,21 @@ void indy_8254timer_irq(void) void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; - read_lock_irqsave(&xtime_lock, flags); - *tv = xtime; - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + *tv = xtime; + } while (seq != fr_read_end(&xtime_lock)); } void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); xtime = *tv; time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips64/sgi-ip27/ip27-timer.c 90-mjb/arch/mips64/sgi-ip27/ip27-timer.c --- 00-virgin/arch/mips64/sgi-ip27/ip27-timer.c Thu Jan 2 22:05:00 2003 +++ 90-mjb/arch/mips64/sgi-ip27/ip27-timer.c Sun Feb 2 13:19:26 2003 @@ -40,7 +40,7 @@ static unsigned long ct_cur[NR_CPUS]; /* What counter should be at next timer irq */ static long last_rtc_update; /* Last time the rtc clock got updated */ -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern volatile unsigned long wall_jiffies; @@ -94,7 +94,7 @@ void rt_timer_interrupt(struct pt_regs * int cpuA = ((cputoslice(cpu)) == 0); int irq = 7; /* XXX Assign number */ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); 
again: LOCAL_HUB_S(cpuA ? PI_RT_PEND_A : PI_RT_PEND_B, 0); /* Ack */ @@ -145,7 +145,7 @@ again: } } - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); if (softirq_pending(cpu)) do_softirq(); @@ -160,19 +160,21 @@ unsigned long inline do_gettimeoffset(vo void do_gettimeofday(struct timeval *tv) { - unsigned long flags; unsigned long usec, sec; + unsigned long seq; - read_lock_irqsave(&xtime_lock, flags); - usec = do_gettimeoffset(); - { - unsigned long lost = jiffies - wall_jiffies; - if (lost) - usec += lost * (1000000 / HZ); - } - sec = xtime.tv_sec; - usec += xtime.tv_usec; - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + + usec = do_gettimeoffset(); + { + unsigned long lost = jiffies - wall_jiffies; + if (lost) + usec += lost * (1000000 / HZ); + } + sec = xtime.tv_sec; + usec += xtime.tv_usec; + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; @@ -185,7 +187,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); tv->tv_usec -= do_gettimeoffset(); tv->tv_usec -= (jiffies - wall_jiffies) * (1000000 / HZ); @@ -199,7 +201,7 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } /* Includes for ioc3_init(). 
*/ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/parisc/kernel/sys_parisc32.c 90-mjb/arch/parisc/kernel/sys_parisc32.c --- 00-virgin/arch/parisc/kernel/sys_parisc32.c Tue Jan 14 10:06:14 2003 +++ 90-mjb/arch/parisc/kernel/sys_parisc32.c Sun Feb 2 13:19:26 2003 @@ -2428,22 +2428,25 @@ struct sysinfo32 { asmlinkage int sys32_sysinfo(struct sysinfo32 *info) { struct sysinfo val; + unsigned long seq; int err; - extern rwlock_t xtime_lock; + extern frlock_t xtime_lock; /* We don't need a memset here because we copy the * struct to userspace once element at a time. */ - read_lock_irq(&xtime_lock); - val.uptime = jiffies / HZ; + do { + seq = fr_read_begin(&xtime_lock); - val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + val.uptime = jiffies / HZ; + + val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); + val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); + val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); - val.procs = nr_threads; - read_unlock_irq(&xtime_lock); + val.procs = nr_threads; + } while (seq != fr_read_end(&xtime_lock)); si_meminfo(&val); si_swapinfo(&val); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/parisc/kernel/time.c 90-mjb/arch/parisc/kernel/time.c --- 00-virgin/arch/parisc/kernel/time.c Tue Jan 14 10:06:14 2003 +++ 90-mjb/arch/parisc/kernel/time.c Sun Feb 2 13:19:26 2003 @@ -36,7 +36,7 @@ u64 jiffies_64; /* xtime and wall_jiffies keep wall-clock time */ extern unsigned long wall_jiffies; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; static long clocktick; /* timer cycles per tick */ static long halftick; @@ -115,9 +115,9 @@ void timer_interrupt(int irq, void *dev_ smp_do_timer(regs); #endif if (cpu == 0) { - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); do_timer(regs); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } } @@ -172,16 +172,18 @@ gettimeoffset (void) void 
do_gettimeofday (struct timeval *tv) { - unsigned long flags, usec, sec; + unsigned long seq, usec, sec; - read_lock_irqsave(&xtime_lock, flags); - { - usec = gettimeoffset(); + do { + seq = fr_read_begin(&xtime_lock); + + { + usec = gettimeoffset(); - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - } - read_unlock_irqrestore(&xtime_lock, flags); + sec = xtime.tv_sec; + usec += (xtime.tv_nsec / 1000); + } + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; @@ -195,7 +197,7 @@ do_gettimeofday (struct timeval *tv) void do_settimeofday (struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); { /* * This is revolting. We need to set "xtime" @@ -219,7 +221,7 @@ do_settimeofday (struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; } - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } @@ -241,10 +243,10 @@ void __init time_init(void) mtctl(next_tick, 16); if(pdc_tod_read(&tod_data) == 0) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); xtime.tv_sec = tod_data.tod_sec; xtime.tv_nsec = tod_data.tod_usec * 1000; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } else { printk(KERN_ERR "Error reading tod clock\n"); xtime.tv_sec = 0; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/ppc/kernel/time.c 90-mjb/arch/ppc/kernel/time.c --- 00-virgin/arch/ppc/kernel/time.c Thu Jan 9 19:15:58 2003 +++ 90-mjb/arch/ppc/kernel/time.c Sun Feb 2 13:19:26 2003 @@ -76,7 +76,7 @@ extern struct timezone sys_tz; /* keep track of when we need to update the rtc */ time_t last_rtc_update; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; /* The decrementer counts down by 128 every 128ns on a 601. 
*/ #define DECREMENTER_COUNT_601 (1000000000 / HZ) @@ -161,7 +161,7 @@ void timer_interrupt(struct pt_regs * re continue; /* We are in an interrupt, no need to save/restore flags */ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); tb_last_stamp = jiffy_stamp; do_timer(regs); @@ -191,7 +191,7 @@ void timer_interrupt(struct pt_regs * re /* Try again one minute later */ last_rtc_update += 60; } - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } if ( !disarm_decr[smp_processor_id()] ) set_dec(next_dec); @@ -212,22 +212,23 @@ void timer_interrupt(struct pt_regs * re */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned delta, lost_ticks, usec, sec; - read_lock_irqsave(&xtime_lock, flags); - sec = xtime.tv_sec; - usec = (xtime.tv_nsec / 1000); - delta = tb_ticks_since(tb_last_stamp); + do { + seq = fr_read_begin(&xtime_lock); + sec = xtime.tv_sec; + usec = (xtime.tv_nsec / 1000); + delta = tb_ticks_since(tb_last_stamp); #ifdef CONFIG_SMP - /* As long as timebases are not in sync, gettimeofday can only - * have jiffy resolution on SMP. - */ - if (!smp_tb_synchronized) - delta = 0; + /* As long as timebases are not in sync, gettimeofday can only + * have jiffy resolution on SMP. + */ + if (!smp_tb_synchronized) + delta = 0; #endif /* CONFIG_SMP */ - lost_ticks = jiffies - wall_jiffies; - read_unlock_irqrestore(&xtime_lock, flags); + lost_ticks = jiffies - wall_jiffies; + } while (seq != fr_read_end(&xtime_lock)); usec += mulhwu(tb_to_us, tb_ticks_per_jiffy * lost_ticks + delta); while (usec >= 1000000) { @@ -243,7 +244,7 @@ void do_settimeofday(struct timeval *tv) unsigned long flags; int tb_delta, new_usec, new_sec; - write_lock_irqsave(&xtime_lock, flags); + fr_write_lock_irqsave(&xtime_lock, flags); /* Updating the RTC is not the job of this code. If the time is * stepped under NTP, the RTC will be update after STA_UNSYNC * is cleared. 
Tool like clock/hwclock either copy the RTC @@ -283,7 +284,7 @@ void do_settimeofday(struct timeval *tv) time_state = TIME_ERROR; /* p. 24, (a) */ time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irqrestore(&xtime_lock, flags); + fr_write_unlock_irqrestore(&xtime_lock, flags); } /* This function is only called on the boot processor */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/ppc/platforms/pmac_time.c 90-mjb/arch/ppc/platforms/pmac_time.c --- 00-virgin/arch/ppc/platforms/pmac_time.c Sun Nov 17 20:29:29 2002 +++ 90-mjb/arch/ppc/platforms/pmac_time.c Sun Feb 2 13:19:26 2003 @@ -29,7 +29,7 @@ #include #include -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; /* Apparently the RTC stores seconds since 1 Jan 1904 */ #define RTC_OFFSET 2082844800 @@ -218,16 +218,17 @@ time_sleep_notify(struct pmu_sleep_notif switch (when) { case PBOOK_SLEEP_NOW: - read_lock_irqsave(&xtime_lock, flags); - time_diff = xtime.tv_sec - pmac_get_rtc_time(); - read_unlock_irqrestore(&xtime_lock, flags); + do { + flags = fr_read_begin(&xtime_lock); + time_diff = xtime.tv_sec - pmac_get_rtc_time(); + } while (flags != fr_read_end(&xtime_lock)); /* flags holds the sequence snapshot; retry if a writer intervened */ break; case PBOOK_WAKE: - write_lock_irqsave(&xtime_lock, flags); + fr_write_lock_irqsave(&xtime_lock, flags); xtime.tv_sec = pmac_get_rtc_time() + time_diff; xtime.tv_nsec = 0; last_rtc_update = xtime.tv_sec; - write_unlock_irqrestore(&xtime_lock, flags); + fr_write_unlock_irqrestore(&xtime_lock, flags); break; } return PBOOK_SLEEP_OK; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/ppc64/kernel/time.c 90-mjb/arch/ppc64/kernel/time.c --- 00-virgin/arch/ppc64/kernel/time.c Thu Jan 9 19:16:00 2003 +++ 90-mjb/arch/ppc64/kernel/time.c Sun Feb 2 13:19:26 2003 @@ -69,7 +69,7 @@ u64 jiffies_64; /* keep track of when we need to update the rtc */ time_t last_rtc_update; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern int piranha_simulator; #ifdef CONFIG_PPC_ISERIES unsigned long
iSeries_recal_titan = 0; @@ -284,12 +284,12 @@ int timer_interrupt(struct pt_regs * reg smp_local_timer_interrupt(regs); #endif if (cpu == boot_cpuid) { - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); tb_last_stamp = lpaca->next_jiffy_update_tb; do_timer(regs); timer_sync_xtime( cur_tb ); timer_check_rtc(); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); if ( adjusting_time && (time_adjust == 0) ) ppc_adjtimex(); } @@ -348,7 +348,7 @@ void do_settimeofday(struct timeval *tv) long int tb_delta, new_usec, new_sec; unsigned long new_xsec; - write_lock_irqsave(&xtime_lock, flags); + fr_write_lock_irqsave(&xtime_lock, flags); /* Updating the RTC is not the job of this code. If the time is * stepped under NTP, the RTC will be update after STA_UNSYNC * is cleared. Tool like clock/hwclock either copy the RTC @@ -399,7 +399,7 @@ void do_settimeofday(struct timeval *tv) do_gtod.tb_orig_stamp = tb_last_stamp; } - write_unlock_irqrestore(&xtime_lock, flags); + fr_write_unlock_irqrestore(&xtime_lock, flags); } /* @@ -465,7 +465,7 @@ void __init time_init(void) #endif ppc_md.get_boot_time(&tm); - write_lock_irqsave(&xtime_lock, flags); + fr_write_lock_irqsave(&xtime_lock, flags); xtime.tv_sec = mktime(tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec); tb_last_stamp = get_tb(); @@ -484,7 +484,7 @@ void __init time_init(void) xtime.tv_nsec = 0; last_rtc_update = xtime.tv_sec; - write_unlock_irqrestore(&xtime_lock, flags); + fr_write_unlock_irqrestore(&xtime_lock, flags); /* Not exact, but the timer interrupt takes care of this */ set_dec(tb_ticks_per_jiffy); @@ -587,7 +587,7 @@ void ppc_adjtimex(void) new_tb_to_xs = divres.result_low; new_xsec = mulhdu( tb_ticks, new_tb_to_xs ); - write_lock_irqsave( &xtime_lock, flags ); + fr_write_lock_irqsave( &xtime_lock, flags ); old_xsec = mulhdu( tb_ticks, do_gtod.varp->tb_to_xs ); new_stamp_xsec = do_gtod.varp->stamp_xsec + old_xsec - new_xsec; @@ -609,7 +609,7 @@ void ppc_adjtimex(void) 
do_gtod.varp = temp_varp; do_gtod.var_idx = temp_idx; - write_unlock_irqrestore( &xtime_lock, flags ); + fr_write_unlock_irqrestore( &xtime_lock, flags ); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/s390/kernel/time.c 90-mjb/arch/s390/kernel/time.c --- 00-virgin/arch/s390/kernel/time.c Sun Dec 1 09:59:46 2002 +++ 90-mjb/arch/s390/kernel/time.c Sun Feb 2 13:19:26 2003 @@ -52,7 +52,7 @@ static ext_int_info_t ext_int_info_timer static uint64_t xtime_cc; static uint64_t init_timer_cc; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long wall_jiffies; void tod_to_timeval(__u64 todval, struct timespec *xtime) @@ -82,13 +82,15 @@ static inline unsigned long do_gettimeof */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec; - read_lock_irqsave(&xtime_lock, flags); - sec = xtime.tv_sec; - usec = xtime.tv_nsec / 1000 + do_gettimeoffset(); - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + + sec = xtime.tv_sec; + usec = xtime.tv_nsec / 1000 + do_gettimeoffset(); + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; @@ -102,7 +104,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* This is revolting. We need to set the xtime.tv_nsec * correctly. However, the value in this location is * is value at the last tick. @@ -122,7 +124,7 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } static inline __u32 div64_32(__u64 dividend, __u32 divisor) @@ -166,7 +168,7 @@ static void do_comparator_interrupt(stru * Do not rely on the boot cpu to do the calls to do_timer. * Spread it over all cpus instead. 
*/ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); if (S390_lowcore.jiffy_timer > xtime_cc) { __u32 xticks; @@ -181,7 +183,7 @@ static void do_comparator_interrupt(stru while (xticks--) do_timer(regs); } - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); while (ticks--) update_process_times(user_mode(regs)); #else diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/s390x/kernel/time.c 90-mjb/arch/s390x/kernel/time.c --- 00-virgin/arch/s390x/kernel/time.c Sun Dec 1 09:59:46 2002 +++ 90-mjb/arch/s390x/kernel/time.c Sun Feb 2 13:19:26 2003 @@ -51,7 +51,7 @@ static ext_int_info_t ext_int_info_timer static uint64_t xtime_cc; static uint64_t init_timer_cc; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long wall_jiffies; void tod_to_timeval(__u64 todval, struct timespec *xtime) @@ -77,13 +77,14 @@ static inline unsigned long do_gettimeof */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec; - read_lock_irqsave(&xtime_lock, flags); - sec = xtime.tv_sec; - usec = xtime.tv_nsec + do_gettimeoffset(); - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + sec = xtime.tv_sec; + usec = xtime.tv_nsec + do_gettimeoffset(); + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; @@ -97,7 +98,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec * correctly. However, the value in this location is * is value at the last tick. 
@@ -117,7 +118,7 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } /* @@ -152,7 +153,7 @@ static void do_comparator_interrupt(stru * Do not rely on the boot cpu to do the calls to do_timer. * Spread it over all cpus instead. */ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); if (S390_lowcore.jiffy_timer > xtime_cc) { __u32 xticks; @@ -167,7 +168,7 @@ static void do_comparator_interrupt(stru while (xticks--) do_timer(regs); } - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); while (ticks--) update_process_times(user_mode(regs)); #else diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/sh/kernel/time.c 90-mjb/arch/sh/kernel/time.c --- 00-virgin/arch/sh/kernel/time.c Sun Nov 17 20:29:20 2002 +++ 90-mjb/arch/sh/kernel/time.c Sun Feb 2 13:19:26 2003 @@ -72,7 +72,7 @@ u64 jiffies_64; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long wall_jiffies; #define TICK_SIZE tick @@ -127,19 +127,20 @@ static unsigned long do_gettimeoffset(vo void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec; - read_lock_irqsave(&xtime_lock, flags); - usec = do_gettimeoffset(); - { - unsigned long lost = jiffies - wall_jiffies; - if (lost) - usec += lost * (1000000 / HZ); - } - sec = xtime.tv_sec; - usec += xtime.tv_usec; - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + usec = do_gettimeoffset(); + { + unsigned long lost = jiffies - wall_jiffies; + if (lost) + usec += lost * (1000000 / HZ); + } + sec = xtime.tv_sec; + usec += xtime.tv_usec; + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; @@ -152,7 +153,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + 
fr_write_lock_irq(&xtime_lock); /* * This is revolting. We need to set "xtime" correctly. However, the * value in this location is the value at the most recent update of @@ -172,7 +173,7 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } /* last time the RTC clock got updated */ @@ -231,9 +232,9 @@ static void timer_interrupt(int irq, voi * the irq version of write_lock because as just said we have irq * locally disabled. -arca */ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); do_timer_interrupt(irq, NULL, regs); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } static unsigned int __init get_timer_frequency(void) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/sparc/kernel/pcic.c 90-mjb/arch/sparc/kernel/pcic.c --- 00-virgin/arch/sparc/kernel/pcic.c Sun Nov 17 20:29:46 2002 +++ 90-mjb/arch/sparc/kernel/pcic.c Sun Feb 2 13:19:26 2003 @@ -34,7 +34,7 @@ #include #include -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; #ifndef CONFIG_PCI @@ -739,10 +739,10 @@ static void pcic_clear_clock_irq(void) static void pcic_timer_handler (int irq, void *h, struct pt_regs *regs) { - write_lock(&xtime_lock); /* Dummy, to show that we remember */ + fr_write_lock(&xtime_lock); /* Dummy, to show that we remember */ pcic_clear_clock_irq(); do_timer(regs); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } #define USECS_PER_JIFFY 10000 /* We have 100HZ "standard" timer for sparc */ @@ -794,19 +794,20 @@ extern unsigned long wall_jiffies; static void pci_do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec; - read_lock_irqsave(&xtime_lock, flags); - usec = do_gettimeoffset(); - { - unsigned long lost = jiffies - wall_jiffies; - if (lost) - usec += lost * (1000000 / HZ); - } - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 
1000); - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + usec = do_gettimeoffset(); + { + unsigned long lost = jiffies - wall_jiffies; + if (lost) + usec += lost * (1000000 / HZ); + } + sec = xtime.tv_sec; + usec += (xtime.tv_nsec / 1000); + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/sparc/kernel/time.c 90-mjb/arch/sparc/kernel/time.c --- 00-virgin/arch/sparc/kernel/time.c Sun Nov 17 20:29:47 2002 +++ 90-mjb/arch/sparc/kernel/time.c Sun Feb 2 13:19:26 2003 @@ -42,7 +42,7 @@ #include #include -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long wall_jiffies; @@ -131,7 +131,7 @@ void timer_interrupt(int irq, void *dev_ #endif /* Protect counter clear so that do_gettimeoffset works */ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); #ifdef CONFIG_SUN4 if((idprom->id_machtype == (SM_SUN4 | SM_4_260)) || (idprom->id_machtype == (SM_SUN4 | SM_4_110))) { @@ -155,7 +155,7 @@ void timer_interrupt(int irq, void *dev_ else last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */ } - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } /* Kick start a stopped clock (procedure from the Sun NVRAM/hostid FAQ). 
*/ @@ -469,19 +469,20 @@ extern __inline__ unsigned long do_getti */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec; - read_lock_irqsave(&xtime_lock, flags); - usec = do_gettimeoffset(); - { - unsigned long lost = jiffies - wall_jiffies; - if (lost) - usec += lost * (1000000 / HZ); - } - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + usec = do_gettimeoffset(); + { + unsigned long lost = jiffies - wall_jiffies; + if (lost) + usec += lost * (1000000 / HZ); + } + sec = xtime.tv_sec; + usec += (xtime.tv_nsec / 1000); + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; @@ -494,9 +495,9 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); bus_do_settimeofday(tv); - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } static void sbus_do_settimeofday(struct timeval *tv) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/sparc64/kernel/rtrap.S 90-mjb/arch/sparc64/kernel/rtrap.S --- 00-virgin/arch/sparc64/kernel/rtrap.S Sun Nov 17 20:29:45 2002 +++ 90-mjb/arch/sparc64/kernel/rtrap.S Sat Feb 1 22:09:06 2003 @@ -33,7 +33,7 @@ __handle_softirq: ba,a,pt %xcc, __handle_softirq_continue nop __handle_preemption: - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate ba,pt %xcc, __handle_preemption_continue wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate @@ -48,7 +48,7 @@ __handle_user_windows: be,pt %xcc, 1f nop - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate ldx [%g6 + TI_FLAGS], %l0 @@ -92,7 +92,7 @@ __handle_perfctrs: be,pt %xcc, 1f nop - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate ldx [%g6 + TI_FLAGS], %l0 @@ -271,7 +271,7 @@ to_kernel: 
brnz %l5, kern_fpucheck sethi %hi(PREEMPT_ACTIVE), %l6 stw %l6, [%g6 + TI_PRE_COUNT] - call schedule + call user_schedule nop ba,pt %xcc, rtrap stw %g0, [%g6 + TI_PRE_COUNT] diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/sparc64/kernel/time.c 90-mjb/arch/sparc64/kernel/time.c --- 00-virgin/arch/sparc64/kernel/time.c Thu Jan 2 22:05:01 2003 +++ 90-mjb/arch/sparc64/kernel/time.c Sun Feb 2 13:19:26 2003 @@ -37,7 +37,7 @@ #include #include -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; spinlock_t mostek_lock = SPIN_LOCK_UNLOCKED; spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; @@ -134,7 +134,7 @@ static void timer_interrupt(int irq, voi { unsigned long ticks, pstate; - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); do { #ifndef CONFIG_SMP @@ -196,13 +196,13 @@ static void timer_interrupt(int irq, voi timer_check_rtc(); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } #ifdef CONFIG_SMP void timer_tick_interrupt(struct pt_regs *regs) { - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); do_timer(regs); @@ -225,7 +225,7 @@ void timer_tick_interrupt(struct pt_regs timer_check_rtc(); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } #endif @@ -665,7 +665,7 @@ void do_settimeofday(struct timeval *tv) if (this_is_starfire) return; - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* * This is revolting. We need to set "xtime" correctly. However, the * value in this location is the value at the most recent update of @@ -686,7 +686,7 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } /* Ok, my cute asm atomicity trick doesn't work anymore. 
@@ -695,19 +695,20 @@ void do_settimeofday(struct timeval *tv) */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec; - read_lock_irqsave(&xtime_lock, flags); - usec = do_gettimeoffset(); - { - unsigned long lost = jiffies - wall_jiffies; - if (lost) - usec += lost * (1000000 / HZ); - } - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + usec = do_gettimeoffset(); + { + unsigned long lost = jiffies - wall_jiffies; + if (lost) + usec += lost * (1000000 / HZ); + } + sec = xtime.tv_sec; + usec += (xtime.tv_nsec / 1000); + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/um/kernel/time_kern.c 90-mjb/arch/um/kernel/time_kern.c --- 00-virgin/arch/um/kernel/time_kern.c Thu Jan 2 22:05:02 2003 +++ 90-mjb/arch/um/kernel/time_kern.c Sun Feb 2 13:19:26 2003 @@ -21,7 +21,7 @@ u64 jiffies_64; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; int hz(void) { @@ -57,9 +57,9 @@ void boot_timer_handler(int sig) void um_timer(int irq, void *dev, struct pt_regs *regs) { do_timer(regs); - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); timer(); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } long um_time(int * tloc) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/v850/kernel/time.c 90-mjb/arch/v850/kernel/time.c --- 00-virgin/arch/v850/kernel/time.c Mon Dec 23 23:01:49 2002 +++ 90-mjb/arch/v850/kernel/time.c Sun Feb 2 13:19:26 2003 @@ -107,7 +107,7 @@ static void timer_interrupt (int irq, vo #endif /* 0 */ } -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; /* * This version of gettimeofday has near microsecond resolution. 
@@ -118,23 +118,25 @@ void do_gettimeofday (struct timeval *tv extern volatile unsigned long lost_ticks; unsigned long lost; #endif - unsigned long flags; unsigned long usec, sec; + unsigned long seq; + + do { + seq = fr_read_begin(&xtime_lock); - read_lock_irqsave (&xtime_lock, flags); #if 0 - usec = mach_gettimeoffset ? mach_gettimeoffset () : 0; + usec = mach_gettimeoffset ? mach_gettimeoffset () : 0; #else - usec = 0; + usec = 0; #endif #if 0 /* DAVIDM later if possible */ - lost = lost_ticks; - if (lost) - usec += lost * (1000000/HZ); + lost = lost_ticks; + if (lost) + usec += lost * (1000000/HZ); #endif - sec = xtime.tv_sec; - usec += xtime.tv_nsec / 1000; - read_unlock_irqrestore (&xtime_lock, flags); + sec = xtime.tv_sec; + usec += xtime.tv_nsec / 1000; + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; @@ -147,7 +149,7 @@ void do_gettimeofday (struct timeval *tv void do_settimeofday (struct timeval *tv) { - write_lock_irq (&xtime_lock); + fr_write_lock_irq (&xtime_lock); /* This is revolting. We need to set the xtime.tv_nsec * correctly. 
However, the value in this location is @@ -172,7 +174,7 @@ void do_settimeofday (struct timeval *tv time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq (&xtime_lock); + fr_write_unlock_irq (&xtime_lock); } static int timer_dev_id; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/x86_64/kernel/early_printk.c 90-mjb/arch/x86_64/kernel/early_printk.c --- 00-virgin/arch/x86_64/kernel/early_printk.c Sun Nov 17 20:29:50 2002 +++ 90-mjb/arch/x86_64/kernel/early_printk.c Wed Dec 31 16:00:00 1969 @@ -1,218 +0,0 @@ -#include -#include -#include -#include -#include - -/* Simple VGA output */ - -#define VGABASE 0xffffffff800b8000UL - -#define MAX_YPOS 25 -#define MAX_XPOS 80 - -static int current_ypos = 1, current_xpos = 0; - -static void early_vga_write(struct console *con, const char *str, unsigned n) -{ - char c; - int i, k, j; - - while ((c = *str++) != '\0' && n-- > 0) { - if (current_ypos >= MAX_YPOS) { - /* scroll 1 line up */ - for(k = 1, j = 0; k < MAX_YPOS; k++, j++) { - for(i = 0; i < MAX_XPOS; i++) { - writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), - VGABASE + 2*(MAX_XPOS*j + i)); - } - } - for(i = 0; i < MAX_XPOS; i++) { - writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); - } - current_ypos = MAX_YPOS-1; - } - if (c == '\n') { - current_xpos = 0; - current_ypos++; - } else if (c != '\r') { - writew(((0x7 << 8) | (unsigned short) c), - VGABASE + 2*(MAX_XPOS*current_ypos + current_xpos++)); - if (current_xpos >= MAX_XPOS) { - current_xpos = 0; - current_ypos++; - } - } - } -} - -static struct console early_vga_console = { - .name = "earlyvga", - .write = early_vga_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -/* Serial functions losely based on a similar package from Klaus P. 
Gerlicher */ - -int early_serial_base = 0x3f8; /* ttyS0 */ - -#define XMTRDY 0x20 - -#define DLAB 0x80 - -#define TXR 0 /* Transmit register (WRITE) */ -#define RXR 0 /* Receive register (READ) */ -#define IER 1 /* Interrupt Enable */ -#define IIR 2 /* Interrupt ID */ -#define FCR 2 /* FIFO control */ -#define LCR 3 /* Line control */ -#define MCR 4 /* Modem control */ -#define LSR 5 /* Line Status */ -#define MSR 6 /* Modem Status */ -#define DLL 0 /* Divisor Latch Low */ -#define DLH 1 /* Divisor latch High */ - -static int early_serial_putc(unsigned char ch) -{ - unsigned timeout = 0xffff; - while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) - rep_nop(); - outb(ch, early_serial_base + TXR); - return timeout ? 0 : -1; -} - -static void early_serial_write(struct console *con, const char *s, unsigned n) -{ - while (*s && n-- > 0) { - early_serial_putc(*s); - if (*s == '\n') - early_serial_putc('\r'); - s++; - } -} - -static __init void early_serial_init(char *opt) -{ - unsigned char c; - unsigned divisor, baud = 38400; - char *s, *e; - - if (*opt == ',') - ++opt; - - s = strsep(&opt, ","); - if (s != NULL) { - unsigned port; - if (!strncmp(s,"0x",2)) - early_serial_base = simple_strtoul(s, &e, 16); - else { - static int bases[] = { 0x3f8, 0x2f8 }; - if (!strncmp(s,"ttyS",4)) - s+=4; - port = simple_strtoul(s, &e, 10); - if (port > 1 || s == e) - port = 0; - early_serial_base = bases[port]; - } - } - - outb(0x3, early_serial_base + LCR); /* 8n1 */ - outb(0, early_serial_base + IER); /* no interrupt */ - outb(0, early_serial_base + FCR); /* no fifo */ - outb(0x3, early_serial_base + MCR); /* DTR + RTS */ - - s = strsep(&opt, ","); - if (s != NULL) { - baud = simple_strtoul(s, &e, 0); - if (baud == 0 || s == e) - baud = 38400; - } - - divisor = 115200 / baud; - c = inb(early_serial_base + LCR); - outb(c | DLAB, early_serial_base + LCR); - outb(divisor & 0xff, early_serial_base + DLL); - outb((divisor >> 8) & 0xff, early_serial_base + DLH); - outb(c & 
~DLAB, early_serial_base + LCR); -} - -static struct console early_serial_console = { - .name = "earlyser", - .write = early_serial_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -/* Direct interface for emergencies */ -struct console *early_console = &early_vga_console; -static int early_console_initialized = 0; - -void early_printk(const char *fmt, ...) -{ - char buf[512]; - int n; - va_list ap; - va_start(ap,fmt); - n = vsnprintf(buf,512,fmt,ap); - early_console->write(early_console,buf,n); - va_end(ap); -} - -static int keep_early; - -int __init setup_early_printk(char *opt) -{ - char *space; - char buf[256]; - - if (early_console_initialized) - return -1; - - strncpy(buf,opt,256); - buf[255] = 0; - space = strchr(buf, ' '); - if (space) - *space = 0; - - if (strstr(buf,"keep")) - keep_early = 1; - - if (!strncmp(buf, "serial", 6)) { - early_serial_init(buf + 6); - early_console = &early_serial_console; - } else if (!strncmp(buf, "ttyS", 4)) { - early_serial_init(buf); - early_console = &early_serial_console; - } else if (!strncmp(buf, "vga", 3)) { - early_console = &early_vga_console; - } else { - early_console = NULL; - return -1; - } - early_console_initialized = 1; - register_console(early_console); - return 0; -} - -void __init disable_early_printk(void) -{ - if (!early_console_initialized || !early_console) - return; - if (!keep_early) { - printk("disabling early console...\n"); - unregister_console(early_console); - early_console_initialized = 0; - } else { - printk("keeping early console.\n"); - } -} - -/* syntax: earlyprintk=vga - earlyprintk=serial[,ttySn[,baudrate]] - Append ,keep to not disable it when the real console takes over. - Only vga or serial at a time, not both. - Currently only ttyS0 and ttyS1 are supported. - Interaction with the standard serial driver is not very good. - The VGA output is eventually overwritten by the real console. 
*/ -__setup("earlyprintk=", setup_early_printk); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/x86_64/kernel/entry.S 90-mjb/arch/x86_64/kernel/entry.S --- 00-virgin/arch/x86_64/kernel/entry.S Fri Jan 17 09:18:25 2003 +++ 90-mjb/arch/x86_64/kernel/entry.S Sat Feb 1 22:09:06 2003 @@ -187,7 +187,7 @@ sysret_careful: jnc sysret_signal sti pushq %rdi - call schedule + call user_schedule popq %rdi jmp sysret_check @@ -256,7 +256,7 @@ int_careful: jnc int_very_careful sti pushq %rdi - call schedule + call user_schedule popq %rdi jmp int_with_check @@ -420,7 +420,7 @@ retint_careful: jnc retint_signal sti pushq %rdi - call schedule + call user_schedule popq %rdi GET_THREAD_INFO(%rcx) cli @@ -454,7 +454,7 @@ retint_kernel: jc retint_restore_args movl $PREEMPT_ACTIVE,threadinfo_preempt_count(%rcx) sti - call schedule + call user_schedule cli GET_THREAD_INFO(%rcx) movl $0,threadinfo_preempt_count(%rcx) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/x86_64/kernel/head64.c 90-mjb/arch/x86_64/kernel/head64.c --- 00-virgin/arch/x86_64/kernel/head64.c Sun Nov 17 20:29:32 2002 +++ 90-mjb/arch/x86_64/kernel/head64.c Sat Feb 1 22:00:06 2003 @@ -70,7 +70,7 @@ static void __init setup_boot_cpu_data(v boot_cpu_data.x86_mask = eax & 0xf; } -extern void start_kernel(void), pda_init(int), setup_early_printk(char *); +extern void start_kernel(void), pda_init(int); extern int disable_apic; void __init x86_64_start_kernel(char * real_mode_data) @@ -80,9 +80,6 @@ void __init x86_64_start_kernel(char * r clear_bss(); pda_init(0); copy_bootdata(real_mode_data); - s = strstr(saved_command_line, "earlyprintk="); - if (s != NULL) - setup_early_printk(s+12); #ifdef CONFIG_X86_IO_APIC if (strstr(saved_command_line, "disableapic")) disable_apic = 1; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/x86_64/kernel/time.c 90-mjb/arch/x86_64/kernel/time.c --- 00-virgin/arch/x86_64/kernel/time.c Thu Jan 2 22:05:03 2003 +++ 90-mjb/arch/x86_64/kernel/time.c Sun Feb 2 13:19:26 2003 @@ 
-27,7 +27,7 @@ u64 jiffies_64; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; unsigned int cpu_khz; /* TSC clocks / usec, not used here */ @@ -70,21 +70,22 @@ inline unsigned int do_gettimeoffset(voi void do_gettimeofday(struct timeval *tv) { - unsigned long flags, t; + unsigned long flags, t, seq; unsigned int sec, usec; - read_lock_irqsave(&xtime_lock, flags); - spin_lock(&time_offset_lock); + spin_lock_irqsave(&time_offset_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + + sec = xtime.tv_sec; + usec = xtime.tv_nsec / 1000; + + t = (jiffies - wall_jiffies) * (1000000L / HZ) + do_gettimeoffset(); + if (t > timeoffset) timeoffset = t; + usec += timeoffset; - sec = xtime.tv_sec; - usec = xtime.tv_nsec / 1000; - - t = (jiffies - wall_jiffies) * (1000000L / HZ) + do_gettimeoffset(); - if (t > timeoffset) timeoffset = t; - usec += timeoffset; - - spin_unlock(&time_offset_lock); - read_unlock_irqrestore(&xtime_lock, flags); + } while (seq != fr_read_end(&xtime_lock)); + spin_unlock_irqrestore(&time_offset_lock, flags); tv->tv_sec = sec + usec / 1000000; tv->tv_usec = usec % 1000000; @@ -98,7 +99,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); vxtime_lock(); tv->tv_usec -= do_gettimeoffset() + @@ -118,7 +119,7 @@ void do_settimeofday(struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } /* @@ -201,7 +202,7 @@ static void timer_interrupt(int irq, voi * variables, because both do_timer() and us change them -arca+vojtech */ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); vxtime_lock(); { @@ -250,7 +251,7 @@ static void timer_interrupt(int irq, voi } vxtime_unlock(); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } unsigned long get_cmos_time(void) diff -urpN -X 
/home/fletch/.diff.exclude 00-virgin/drivers/acpi/events/evevent.c 90-mjb/drivers/acpi/events/evevent.c --- 00-virgin/drivers/acpi/events/evevent.c Mon Jan 13 21:09:09 2003 +++ 90-mjb/drivers/acpi/events/evevent.c Sun Feb 2 13:19:12 2003 @@ -103,6 +103,10 @@ acpi_ev_handler_initialize ( ACPI_FUNCTION_TRACE ("ev_handler_initialize"); +#ifdef CONFIG_X86_SUMMIT +/*horrible horrible hack to avoid interrupt storm*/ +return_ACPI_STATUS (0); +#endif /* Install the SCI handler */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/drivers/char/Makefile 90-mjb/drivers/char/Makefile --- 00-virgin/drivers/char/Makefile Tue Jan 14 10:06:15 2003 +++ 90-mjb/drivers/char/Makefile Sat Feb 1 22:09:06 2003 @@ -32,6 +32,7 @@ obj-$(CONFIG_COMPUTONE) += ip2.o ip2main obj-$(CONFIG_RISCOM8) += riscom8.o obj-$(CONFIG_ISI) += isicom.o obj-$(CONFIG_ESPSERIAL) += esp.o +obj-$(CONFIG_X86_REMOTE_DEBUG) += gdbserial.o obj-$(CONFIG_SYNCLINK) += synclink.o obj-$(CONFIG_SYNCLINKMP) += synclinkmp.o obj-$(CONFIG_N_HDLC) += n_hdlc.o diff -urpN -X /home/fletch/.diff.exclude 00-virgin/drivers/char/gdbserial.c 90-mjb/drivers/char/gdbserial.c --- 00-virgin/drivers/char/gdbserial.c Wed Dec 31 16:00:00 1969 +++ 90-mjb/drivers/char/gdbserial.c Sat Feb 1 22:09:06 2003 @@ -0,0 +1,274 @@ +/* + * Serial interface GDB stub + * + * Written (hacked together) by David Grothe (dave@gcom.com) + * + * Modified by Scott Foehner (sfoehner@engr.sgi.com) to allow connect + * on boot-up + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#undef PRNT /* define for debug printing */ + +#define GDB_BUF_SIZE 512 /* power of 2, please */ + +static char gdb_buf[GDB_BUF_SIZE]; +static int gdb_buf_in_inx; +static atomic_t gdb_buf_in_cnt; +static int gdb_buf_out_inx; + +extern void set_debug_traps(void); /* GDB 
routine */ +extern int gdb_serial_setup(int ttyS, int baud, int *port, int *irq); +extern void shutdown_for_gdb(struct async_struct *info); + /* in serial.c */ + +int gdb_irq; +int gdb_port; +int gdb_ttyS = 1; /* Default: ttyS1 */ +int gdb_baud = 38400; +int gdb_enter = 0; /* Default: do not do gdb_hook on boot */ +int gdb_initialized = 0; + +static int initialized = -1; + +/* + * Get a byte from the hardware data buffer and return it + */ +static int +read_data_bfr(void) +{ + if (inb(gdb_port + UART_LSR) & UART_LSR_DR) + return (inb(gdb_port + UART_RX)); + + return (-1); + +} /* read_data_bfr */ + +/* + * Get a char if available, return -1 if nothing available. + * Empty the receive buffer first, then look at the interface hardware. + */ +static int +read_char(void) +{ + if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ + int chr; + + chr = gdb_buf[gdb_buf_out_inx++]; + gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); + atomic_dec(&gdb_buf_in_cnt); + return (chr); + } + + return (read_data_bfr()); /* read from hardware */ + +} /* read_char */ + +/* + * Wait until the interface can accept a char, then write it. + */ +static void +write_char(int chr) +{ + while (!(inb(gdb_port + UART_LSR) & UART_LSR_THRE)) ; + + outb(chr, gdb_port + UART_TX); + +} /* write_char */ + +/* + * This is the receiver interrupt routine for the GDB stub. + * It will receive a limited number of characters of input + * from the gdb host machine and save them up in a buffer. + * + * When the gdb stub routine getDebugChar() is called it + * draws characters out of the buffer until it is empty and + * then reads directly from the serial port. + * + * We do not attempt to write chars from the interrupt routine + * since the stubs do all of that via putDebugChar() which + * writes one byte after waiting for the interface to become + * ready. + * + * The debug stubs like to run with interrupts disabled since, + * after all, they run as a consequence of a breakpoint in + * the kernel. 
+ * + * Perhaps someone who knows more about the tty driver than I + * care to learn can make this work for any low level serial + * driver. + */ +static void +gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + int chr; + int iir; + + do { + chr = read_data_bfr(); + iir = inb(gdb_port + UART_IIR); +#ifdef PRNT + printk("gdb_interrupt: chr=%02x '%c' after read iir=%02x\n", + chr, chr > ' ' && chr < 0x7F ? chr : ' ', iir); +#endif + if (chr < 0) + continue; + + if (chr == 3) { /* Ctrl-C means remote interrupt */ + breakpoint(); + continue; + } + + if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { /* buffer overflow, clear it */ + gdb_buf_in_inx = 0; + atomic_set(&gdb_buf_in_cnt, 0); + gdb_buf_out_inx = 0; + break; + } + + gdb_buf[gdb_buf_in_inx++] = chr; + gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); + atomic_inc(&gdb_buf_in_cnt); + } + while (iir & UART_IIR_RDI); + +} /* gdb_interrupt */ + +/* + * Just a NULL routine for testing. + */ +void +gdb_null(void) +{ +} /* gdb_null */ + +extern int serial8250_init(void); + +int +gdb_hook(void) +{ + int retval; + +#ifdef CONFIG_SMP + if (NR_CPUS > KGDB_MAX_NO_CPUS) { + printk + ("kgdb: too many cpus. Cannot enable debugger with more than 8 cpus\n"); + return (-1); + } +#endif + + /* + * Call first time just to get the ser ptr + */ + + serial8250_init(); + + if (gdb_serial_setup(gdb_ttyS, gdb_baud, &gdb_port, &gdb_irq)) { + printk("gdb_serial_setup() error"); + return (-1); + } + + retval = request_irq(gdb_irq, + gdb_interrupt, SA_INTERRUPT, "GDB-stub", NULL); + if (retval == 0) + initialized = 1; + else { + initialized = 0; + printk("gdb_hook: request_irq(irq=%d) failed: %d\n", gdb_irq, + retval); + } + + /* + * Call GDB routine to setup the exception vectors for the debugger + */ + set_debug_traps(); + + /* + * Call the breakpoint() routine in GDB to start the debugging + * session. + */ + printk("Waiting for connection from remote gdb...
"); + breakpoint(); + gdb_null(); + + printk("Connected.\n"); + + gdb_initialized = 1; + return (0); + +} /* gdb_hook_interrupt2 */ + +/* + * getDebugChar + * + * This is a GDB stub routine. It waits for a character from the + * serial interface and then returns it. If there is no serial + * interface connection then it returns a bogus value which will + * almost certainly cause the system to hang. + */ +int +getDebugChar(void) +{ + volatile int chr; + +#ifdef PRNT + printk("getDebugChar: "); +#endif + + while ((chr = read_char()) < 0) + touch_nmi_watchdog(); + +#ifdef PRNT + printk("%c\n", chr > ' ' && chr < 0x7F ? chr : ' '); +#endif + return (chr); + +} /* getDebugChar */ + +/* + * putDebugChar + * + * This is a GDB stub routine. It waits until the interface is ready + * to transmit a char and then sends it. If there is no serial + * interface connection then it simply returns to its caller, having + * pretended to send the char. + */ +void +putDebugChar(int chr) +{ +#ifdef PRNT + printk("putDebugChar: chr=%02x '%c'\n", chr, + chr > ' ' && chr < 0x7F ? 
chr : ' '); +#endif + + write_char(chr); /* this routine will wait */ + +} /* putDebugChar */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/drivers/char/sysrq.c 90-mjb/drivers/char/sysrq.c --- 00-virgin/drivers/char/sysrq.c Thu Jan 2 22:05:04 2003 +++ 90-mjb/drivers/char/sysrq.c Sat Feb 1 22:09:06 2003 @@ -107,6 +107,18 @@ static struct sysrq_key_op sysrq_reboot_ .action_msg = "Resetting", }; +#ifdef CONFIG_X86_REMOTE_DEBUG +static void sysrq_handle_gdb(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) { + int gdb_hook(void); + gdb_hook(); +} +static struct sysrq_key_op sysrq_gdb_op = { + handler: sysrq_handle_gdb, + help_msg: "Gdb", + action_msg: "Entering debugger", +}; +#endif /* SYNC SYSRQ HANDLERS BLOCK */ @@ -357,7 +369,11 @@ static struct sysrq_key_op *sysrq_key_ta /* d */ NULL, /* e */ &sysrq_term_op, /* f */ NULL, +#ifdef CONFIG_X86_REMOTE_DEBUG +/* g */ &sysrq_gdb_op, +#else /* CONFIG_X86_REMOTE_DEBUG */ /* g */ NULL, +#endif /* CONFIG_X86_REMOTE_DEBUG */ /* h */ NULL, /* i */ &sysrq_kill_op, /* j */ NULL, diff -urpN -X /home/fletch/.diff.exclude 00-virgin/drivers/char/tty_io.c 90-mjb/drivers/char/tty_io.c --- 00-virgin/drivers/char/tty_io.c Fri Jan 17 09:18:26 2003 +++ 90-mjb/drivers/char/tty_io.c Sat Feb 1 22:09:06 2003 @@ -91,6 +91,9 @@ #include #include #include +#ifdef CONFIG_GDB_CONSOLE +#include +#endif #include #include @@ -2211,6 +2214,9 @@ void __init console_init(void) #endif #ifdef CONFIG_VT con_init(); +#endif +#ifdef CONFIG_GDB_CONSOLE + gdb_console_init(); #endif #ifdef CONFIG_AU1000_SERIAL_CONSOLE au1000_serial_console_init(); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/drivers/net/starfire.c 90-mjb/drivers/net/starfire.c --- 00-virgin/drivers/net/starfire.c Thu Jan 9 19:16:08 2003 +++ 90-mjb/drivers/net/starfire.c Sun Feb 2 13:19:28 2003 @@ -101,15 +101,35 @@ - Better stats and error handling (Ion Badulescu) - Use new pci_set_mwi() PCI API function (jgarzik) -TODO: - - implement tx_timeout() properly + LK1.3.7 (Ion 
Badulescu) + - minimal implementation of tx_timeout() + - correctly shutdown the Rx/Tx engines in netdev_close() + - added calls to netif_carrier_on/off + (patch from Stefan Rompf ) - VLAN support + + LK1.3.8 (Ion Badulescu) + - adjust DMA burst size on sparc64 + - 64-bit support + - reworked zerocopy support for 64-bit buffers + - working and usable interrupt mitigation/latency + - reduced Tx interrupt frequency for lower interrupt overhead + + LK1.3.9 (Ion Badulescu) + - bugfix for mcast filter + - enable the right kind of Tx interrupts (TxDMADone, not TxDone) + + LK1.4.0 (Ion Badulescu) + - NAPI support + +TODO: bugfixes (no bugs known as of right now) */ #define DRV_NAME "starfire" -#define DRV_VERSION "1.03+LK1.3.6" -#define DRV_RELDATE "March 7, 2002" +#define DRV_VERSION "1.03+LK1.4.0" +#define DRV_RELDATE "December 23, 2002" +#include #include #include #include @@ -118,7 +138,6 @@ TODO: #include #include #include -#include #include /* Processor type for cache alignment. */ #include #include @@ -128,17 +147,14 @@ TODO: * firmware files) does not allow one to redistribute them. Thus, we can't * include the firmware with this driver. * - * However, an end-user is allowed to download and use it, after - * converting it to C header files using starfire_firmware.pl. + * However, should a legal-to-use firmware become available, + * the driver developer would need only to obtain the firmware in the + * form of a C header file. * Once that's done, the #undef below must be changed into a #define * for this driver to really use the firmware. Note that Rx/Tx * hardware TCP checksumming is not possible without the firmware. * - * If Adaptec could allow redistribution of the firmware (even in binary - * format), life would become a lot easier. Unfortunately, I've lost my - * Adaptec contacts, so progress on this front is rather unlikely to - * occur. If anybody from Adaptec reads this and can help with this matter, - * please let me know... 
+ * WANTED: legal firmware to include with this GPL'd driver. */ #undef HAS_FIRMWARE /* @@ -157,11 +173,16 @@ TODO: #include "starfire_firmware.h" #endif /* HAS_FIRMWARE */ +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) +#define VLAN_SUPPORT +#endif + /* The user-configurable values. These may be modified when a driver module is loaded.*/ /* Used for tuning interrupt latency vs. overhead. */ -static int interrupt_mitigation; +static int intr_latency; +static int small_frames; static int debug = 1; /* 1 normal messages, 0 quiet .. 7 verbose. */ static int max_interrupt_work = 20; @@ -169,6 +190,12 @@ static int mtu; /* Maximum number of multicast addresses to filter (vs. rx-all-multicast). The Starfire has a 512 element hash table based on the Ethernet CRC. */ static int multicast_filter_limit = 512; +/* Whether to do TCP/UDP checksums in hardware */ +#ifdef HAS_FIRMWARE +static int enable_hw_cksum = 1; +#else +static int enable_hw_cksum = 0; +#endif #define PKT_BUF_SZ 1536 /* Size of each temporary Rx buffer.*/ /* @@ -181,7 +208,9 @@ static int multicast_filter_limit = 512; * packets as the starfire doesn't allow for misaligned DMAs ;-( * 23/10/2000 - Jes * - * The Alpha and the Sparc don't allow unaligned loads, either. -Ion + * The Alpha and the Sparc don't like unaligned loads, either. On Sparc64, + * at least, having unaligned frames leads to a rather serious performance + * penalty. -Ion */ #if defined(__ia64__) || defined(__alpha__) || defined(__sparc__) static int rx_copybreak = PKT_BUF_SZ; @@ -189,9 +218,17 @@ static int rx_copybreak = PKT_BUF_SZ; static int rx_copybreak /* = 0 */; #endif +/* PCI DMA burst size -- on sparc64 we want to force it to 64 bytes, on the others the default of 128 is fine. */ +#ifdef __sparc__ +#define DMA_BURST_SIZE 64 +#else +#define DMA_BURST_SIZE 128 +#endif + /* Used to pass the media type, etc. Both 'options[]' and 'full_duplex[]' exist for driver interoperability. 
The media type is usually passed in 'options[]'. + These variables are deprecated, use ethtool instead. -Ion */ #define MAX_UNITS 8 /* More are supported, limit only on options */ static int options[MAX_UNITS] = {0, }; @@ -201,33 +238,55 @@ static int full_duplex[MAX_UNITS] = {0, /* The "native" ring sizes are either 256 or 2048. However in some modes a descriptor may be marked to wrap the ring earlier. - The driver allocates a single page for each descriptor ring, constraining - the maximum size in an architecture-dependent way. */ #define RX_RING_SIZE 256 #define TX_RING_SIZE 32 /* The completion queues are fixed at 1024 entries i.e. 4K or 8KB. */ #define DONE_Q_SIZE 1024 +/* All queues must be aligned on a 256-byte boundary */ +#define QUEUE_ALIGN 256 + +#if RX_RING_SIZE > 256 +#define RX_Q_ENTRIES Rx2048QEntries +#else +#define RX_Q_ENTRIES Rx256QEntries +#endif /* Operational parameters that usually are not changed. */ /* Time in jiffies before concluding the transmitter is hung. */ #define TX_TIMEOUT (2 * HZ) -#ifdef ZEROCOPY -#if MAX_SKB_FRAGS <= 6 -#define MAX_STARFIRE_FRAGS 6 -#else /* MAX_STARFIRE_FRAGS > 6 */ -#warning This driver will not work with more than 6 skb fragments. -#warning Turning off zerocopy support. -#undef ZEROCOPY -#endif /* MAX_STARFIRE_FRAGS > 6 */ -#endif /* ZEROCOPY */ +/* + * This SUCKS. + * We need a much better method to determine if dma_addr_t is 64-bit. + */ +#if (defined(__i386__) && defined(CONFIG_HIGHMEM) && (LINUX_VERSION_CODE > 0x20500 || defined(CONFIG_HIGHMEM64G))) || defined(__x86_64__) || defined (__ia64__) || defined(__mips64__) || (defined(__mips__) && defined(CONFIG_HIGHMEM) && defined(CONFIG_64BIT_PHYS_ADDR)) +/* 64-bit dma_addr_t */ +#define ADDR_64BITS /* This chip uses 64 bit addresses. 
*/ +#define cpu_to_dma(x) cpu_to_le64(x) +#define dma_to_cpu(x) le64_to_cpu(x) +#define RX_DESC_Q_ADDR_SIZE RxDescQAddr64bit +#define TX_DESC_Q_ADDR_SIZE TxDescQAddr64bit +#define RX_COMPL_Q_ADDR_SIZE RxComplQAddr64bit +#define TX_COMPL_Q_ADDR_SIZE TxComplQAddr64bit +#define RX_DESC_ADDR_SIZE RxDescAddr64bit +#else /* 32-bit dma_addr_t */ +#define cpu_to_dma(x) cpu_to_le32(x) +#define dma_to_cpu(x) le32_to_cpu(x) +#define RX_DESC_Q_ADDR_SIZE RxDescQAddr32bit +#define TX_DESC_Q_ADDR_SIZE TxDescQAddr32bit +#define RX_COMPL_Q_ADDR_SIZE RxComplQAddr32bit +#define TX_COMPL_Q_ADDR_SIZE TxComplQAddr32bit +#define RX_DESC_ADDR_SIZE RxDescAddr32bit +#endif -#ifdef ZEROCOPY +#ifdef MAX_SKB_FRAGS #define skb_first_frag_len(skb) skb_headlen(skb) -#else /* not ZEROCOPY */ +#define skb_num_frags(skb) (skb_shinfo(skb)->nr_frags + 1) +#else /* not MAX_SKB_FRAGS */ #define skb_first_frag_len(skb) (skb->len) -#endif /* not ZEROCOPY */ +#define skb_num_frags(skb) 1 +#endif /* not MAX_SKB_FRAGS */ /* 2.2.x compatibility code */ #if LINUX_VERSION_CODE < 0x20300 @@ -236,9 +295,12 @@ static int full_duplex[MAX_UNITS] = {0, #else /* LINUX_VERSION_CODE > 0x20300 */ +#include #include #include +#include + #define COMPAT_MOD_INC_USE_COUNT #define COMPAT_MOD_DEC_USE_COUNT @@ -253,6 +315,40 @@ static int full_duplex[MAX_UNITS] = {0, #define PCI_SLOT_NAME(pci_dev) (pci_dev)->slot_name #endif /* LINUX_VERSION_CODE > 0x20300 */ + +#ifdef HAVE_NETDEV_POLL +#define init_poll(dev) \ + dev->poll = &netdev_poll; \ + dev->weight = max_interrupt_work; +#define netdev_rx(dev, ioaddr) \ +do { \ + u32 intr_enable; \ + if (netif_rx_schedule_prep(dev)) { \ + __netif_rx_schedule(dev); \ + intr_enable = readl(ioaddr + IntrEnable); \ + intr_enable &= ~(IntrRxDone | IntrRxEmpty); \ + writel(intr_enable, ioaddr + IntrEnable); \ + } else { \ + /* Paranoia check */ \ + intr_enable = readl(ioaddr + IntrEnable); \ + if (intr_enable & (IntrRxDone | IntrRxEmpty)) { \ + printk("%s: interrupt while in polling mode!\n", 
dev->name); \ + intr_enable &= ~(IntrRxDone | IntrRxEmpty); \ + writel(intr_enable, ioaddr + IntrEnable); \ + } \ + } \ +} while (0) +static int netdev_poll(struct net_device *dev, int *budget); +#else /* not HAVE_NETDEV_POLL */ +#define init_poll(dev) +#define netif_receive_skb(skb) netif_rx(skb) +#define vlan_hwaccel_receive_skb(skb, vlgrp, vlid) vlan_hwaccel_rx(skb, vlgrp, vlid) +#define netdev_rx(dev, ioaddr) \ +do { \ + int quota = np->dirty_rx + RX_RING_SIZE - np->cur_rx; \ + __netdev_rx(dev, &quota);\ +} while (0) +#endif /* not HAVE_NETDEV_POLL */ /* end of compatibility code */ @@ -269,15 +365,20 @@ MODULE_PARM(max_interrupt_work, "i"); MODULE_PARM(mtu, "i"); MODULE_PARM(debug, "i"); MODULE_PARM(rx_copybreak, "i"); -MODULE_PARM(interrupt_mitigation, "i"); +MODULE_PARM(intr_latency, "i"); +MODULE_PARM(small_frames, "i"); MODULE_PARM(options, "1-" __MODULE_STRING(MAX_UNITS) "i"); MODULE_PARM(full_duplex, "1-" __MODULE_STRING(MAX_UNITS) "i"); -MODULE_PARM_DESC(max_interrupt_work, "Starfire maximum events handled per interrupt"); -MODULE_PARM_DESC(mtu, "Starfire MTU (all boards)"); -MODULE_PARM_DESC(debug, "Starfire debug level (0-6)"); -MODULE_PARM_DESC(rx_copybreak, "Starfire copy breakpoint for copy-only-tiny-frames"); -MODULE_PARM_DESC(options, "Starfire: Bits 0-3: media type, bit 17: full duplex"); -MODULE_PARM_DESC(full_duplex, "Starfire full duplex setting(s) (1)"); +MODULE_PARM(enable_hw_cksum, "i"); +MODULE_PARM_DESC(max_interrupt_work, "Maximum events handled per interrupt"); +MODULE_PARM_DESC(mtu, "MTU (all boards)"); +MODULE_PARM_DESC(debug, "Debug level (0-6)"); +MODULE_PARM_DESC(rx_copybreak, "Copy breakpoint for copy-only-tiny-frames"); +MODULE_PARM_DESC(intr_latency, "Maximum interrupt latency, in microseconds"); +MODULE_PARM_DESC(small_frames, "Maximum size of receive frames that bypass interrupt latency (0,64,128,256,512)"); +MODULE_PARM_DESC(options, "Deprecated: Bits 0-3: media type, bit 17: full duplex"); +MODULE_PARM_DESC(full_duplex, 
"Deprecated: Forced full-duplex setting (0/1)"); +MODULE_PARM_DESC(enable_hw_cksum, "Enable/disable hardware cksum support (0/1)"); /* Theory of Operation @@ -363,13 +464,6 @@ IVc. Errata enum chip_capability_flags {CanHaveMII=1, }; -#define PCI_IOTYPE (PCI_USES_MASTER | PCI_USES_MEM | PCI_ADDR0) - -#if 0 -#define ADDR_64BITS 1 /* This chip uses 64 bit addresses. */ -#endif - -#define HAS_IP_COPYSUM 1 enum chipset { CH_6915 = 0, @@ -401,7 +495,7 @@ static struct chip_info { enum register_offsets { PCIDeviceConfig=0x50040, GenCtrl=0x50070, IntrTimerCtrl=0x50074, IntrClear=0x50080, IntrStatus=0x50084, IntrEnable=0x50088, - MIICtrl=0x52000, StationAddr=0x50120, EEPROMCtrl=0x51000, + MIICtrl=0x52000, TxStationAddr=0x50120, EEPROMCtrl=0x51000, GPIOCtrl=0x5008C, TxDescCtrl=0x50090, TxRingPtr=0x50098, HiPriTxRingPtr=0x50094, /* Low and High priority. */ TxRingHiAddr=0x5009C, /* 64 bit address extension. */ @@ -412,11 +506,16 @@ enum register_offsets { CompletionQConsumerIdx=0x500C4, RxDMACtrl=0x500D0, RxDescQCtrl=0x500D4, RxDescQHiAddr=0x500DC, RxDescQAddr=0x500E0, RxDescQIdx=0x500E8, RxDMAStatus=0x500F0, RxFilterMode=0x500F4, - TxMode=0x55000, PerfFilterTable=0x56000, HashTable=0x56100, + TxMode=0x55000, VlanType=0x55064, + PerfFilterTable=0x56000, HashTable=0x56100, TxGfpMem=0x58000, RxGfpMem=0x5a000, }; -/* Bits in the interrupt status/mask registers. */ +/* + * Bits in the interrupt status/mask registers. + * Warning: setting Intr[Ab]NormalSummary in the IntrEnable register + * enables all the interrupt sources that are or'ed into those status bits. + */ enum intr_status_bits { IntrLinkChange=0xf0000000, IntrStatsMax=0x08000000, IntrAbnormalSummary=0x02000000, IntrGeneralTimer=0x01000000, @@ -441,7 +540,16 @@ enum intr_status_bits { /* Bits in the RxFilterMode register. 
*/ enum rx_mode_bits { AcceptBroadcast=0x04, AcceptAllMulticast=0x02, AcceptAll=0x01, - AcceptMulticast=0x10, AcceptMyPhys=0xE040, + AcceptMulticast=0x10, PerfectFilter=0x40, HashFilter=0x30, + PerfectFilterVlan=0x80, MinVLANPrio=0xE000, VlanMode=0x0200, + WakeupOnGFP=0x0800, +}; + +/* Bits in the TxMode register */ +enum tx_mode_bits { + MiiSoftReset=0x8000, MIILoopback=0x4000, + TxFlowEnable=0x0800, RxFlowEnable=0x0400, + PadEnable=0x04, FullDuplex=0x02, HugeFrame=0x01, }; /* Bits in the TxDescCtrl register. */ @@ -450,7 +558,8 @@ enum tx_ctrl_bits { TxDescSpace128=0x30, TxDescSpace256=0x40, TxDescType0=0x00, TxDescType1=0x01, TxDescType2=0x02, TxDescType3=0x03, TxDescType4=0x04, - TxNoDMACompletion=0x08, TxDescQ64bit=0x80, + TxNoDMACompletion=0x08, + TxDescQAddr64bit=0x80, TxDescQAddr32bit=0, TxHiPriFIFOThreshShift=24, TxPadLenShift=16, TxDMABurstSizeShift=8, }; @@ -458,81 +567,144 @@ enum tx_ctrl_bits { /* Bits in the RxDescQCtrl register. */ enum rx_ctrl_bits { RxBufferLenShift=16, RxMinDescrThreshShift=0, - RxPrefetchMode=0x8000, Rx2048QEntries=0x4000, - RxVariableQ=0x2000, RxDesc64bit=0x1000, - RxDescQAddr64bit=0x0100, + RxPrefetchMode=0x8000, RxVariableQ=0x2000, + Rx2048QEntries=0x4000, Rx256QEntries=0, + RxDescAddr64bit=0x1000, RxDescAddr32bit=0, + RxDescQAddr64bit=0x0100, RxDescQAddr32bit=0, RxDescSpace4=0x000, RxDescSpace8=0x100, RxDescSpace16=0x200, RxDescSpace32=0x300, RxDescSpace64=0x400, RxDescSpace128=0x500, RxConsumerWrEn=0x80, }; +/* Bits in the RxDMACtrl register. 
*/ +enum rx_dmactrl_bits { + RxReportBadFrames=0x80000000, RxDMAShortFrames=0x40000000, + RxDMABadFrames=0x20000000, RxDMACrcErrorFrames=0x10000000, + RxDMAControlFrame=0x08000000, RxDMAPauseFrame=0x04000000, + RxChecksumIgnore=0, RxChecksumRejectTCPUDP=0x02000000, + RxChecksumRejectTCPOnly=0x01000000, + RxCompletionQ2Enable=0x800000, + RxDMAQ2Disable=0, RxDMAQ2FPOnly=0x100000, + RxDMAQ2SmallPkt=0x200000, RxDMAQ2HighPrio=0x300000, + RxDMAQ2NonIP=0x400000, + RxUseBackupQueue=0x080000, RxDMACRC=0x040000, + RxEarlyIntThreshShift=12, RxHighPrioThreshShift=8, + RxBurstSizeShift=0, +}; + /* Bits in the RxCompletionAddr register */ enum rx_compl_bits { - RxComplQAddr64bit=0x80, TxComplProducerWrEn=0x40, + RxComplQAddr64bit=0x80, RxComplQAddr32bit=0, + RxComplProducerWrEn=0x40, RxComplType0=0x00, RxComplType1=0x10, RxComplType2=0x20, RxComplType3=0x30, RxComplThreshShift=0, }; +/* Bits in the TxCompletionAddr register */ +enum tx_compl_bits { + TxComplQAddr64bit=0x80, TxComplQAddr32bit=0, + TxComplProducerWrEn=0x40, + TxComplIntrStatus=0x20, + CommonQueueMode=0x10, + TxComplThreshShift=0, +}; + +/* Bits in the GenCtrl register */ +enum gen_ctrl_bits { + RxEnable=0x05, TxEnable=0x0a, + RxGFPEnable=0x10, TxGFPEnable=0x20, +}; + +/* Bits in the IntrTimerCtrl register */ +enum intr_ctrl_bits { + Timer10X=0x800, EnableIntrMasking=0x60, SmallFrameBypass=0x100, + SmallFrame64=0, SmallFrame128=0x200, SmallFrame256=0x400, SmallFrame512=0x600, + IntrLatencyMask=0x1f, +}; + /* The Rx and Tx buffer descriptors. */ struct starfire_rx_desc { - u32 rxaddr; /* Optionally 64 bits. */ + dma_addr_t rxaddr; }; enum rx_desc_bits { RxDescValid=1, RxDescEndRing=2, }; -/* Completion queue entry. - You must update the page allocation, init_ring and the shift count in rx() - if using a larger format. */ -#ifdef HAS_FIRMWARE -#define csum_rx_status -#endif /* HAS_FIRMWARE */ -struct rx_done_desc { +/* Completion queue entry. */ +struct short_rx_done_desc { + u32 status; /* Low 16 bits is length. 
*/ +}; +struct basic_rx_done_desc { u32 status; /* Low 16 bits is length. */ -#ifdef csum_rx_status - u32 status2; /* Low 16 bits is csum */ -#endif /* csum_rx_status */ -#ifdef full_rx_status - u32 status2; + u16 vlanid; + u16 status2; +}; +struct csum_rx_done_desc { + u32 status; /* Low 16 bits is length. */ + u16 csum; /* Partial checksum */ + u16 status2; +}; +struct full_rx_done_desc { + u32 status; /* Low 16 bits is length. */ + u16 status3; + u16 status2; u16 vlanid; u16 csum; /* partial checksum */ u32 timestamp; -#endif /* full_rx_status */ }; +/* XXX: this is ugly and I'm not sure it's worth the trouble -Ion */ +#ifdef HAS_FIRMWARE +#ifdef VLAN_SUPPORT +typedef struct full_rx_done_desc rx_done_desc; +#define RxComplType RxComplType3 +#else /* not VLAN_SUPPORT */ +typedef struct csum_rx_done_desc rx_done_desc; +#define RxComplType RxComplType2 +#endif /* not VLAN_SUPPORT */ +#else /* not HAS_FIRMWARE */ +#ifdef VLAN_SUPPORT +typedef struct basic_rx_done_desc rx_done_desc; +#define RxComplType RxComplType1 +#else /* not VLAN_SUPPORT */ +typedef struct short_rx_done_desc rx_done_desc; +#define RxComplType RxComplType0 +#endif /* not VLAN_SUPPORT */ +#endif /* not HAS_FIRMWARE */ + enum rx_done_bits { RxOK=0x20000000, RxFIFOErr=0x10000000, RxBufQ2=0x08000000, }; -#ifdef ZEROCOPY -/* Type 0 Tx descriptor. */ -/* If more fragments are needed, don't forget to change the - descriptor spacing as well! */ -struct starfire_tx_desc { - u32 status; - u32 nbufs; - u32 first_addr; - u16 first_len; - u16 total_len; - struct { - u32 addr; - u32 len; - } frag[MAX_STARFIRE_FRAGS]; -}; -#else /* not ZEROCOPY */ /* Type 1 Tx descriptor. */ -struct starfire_tx_desc { +struct starfire_tx_desc_1 { + u32 status; /* Upper bits are status, lower 16 length. */ + u32 addr; +}; + +/* Type 2 Tx descriptor. */ +struct starfire_tx_desc_2 { u32 status; /* Upper bits are status, lower 16 length. 
*/ - u32 first_addr; + u32 reserved; + u64 addr; }; -#endif /* not ZEROCOPY */ + +#ifdef ADDR_64BITS +typedef struct starfire_tx_desc_2 starfire_tx_desc; +#define TX_DESC_TYPE TxDescType2 +#else /* not ADDR_64BITS */ +typedef struct starfire_tx_desc_1 starfire_tx_desc; +#define TX_DESC_TYPE TxDescType1 +#endif /* not ADDR_64BITS */ +#define TX_DESC_SPACING TxDescSpaceUnlim + enum tx_desc_bits { TxDescID=0xB0000000, TxCRCEn=0x01000000, TxDescIntr=0x08000000, TxRingWrap=0x04000000, TxCalTCP=0x02000000, }; -struct tx_done_report { +struct tx_done_desc { u32 status; /* timestamp, index. */ #if 0 u32 intrstatus; /* interrupt status */ @@ -545,41 +717,45 @@ struct rx_ring_info { }; struct tx_ring_info { struct sk_buff *skb; - dma_addr_t first_mapping; -#ifdef ZEROCOPY - dma_addr_t frag_mapping[MAX_STARFIRE_FRAGS]; -#endif /* ZEROCOPY */ + dma_addr_t mapping; + unsigned int used_slots; }; #define PHY_CNT 2 struct netdev_private { /* Descriptor rings first for alignment. */ struct starfire_rx_desc *rx_ring; - struct starfire_tx_desc *tx_ring; + starfire_tx_desc *tx_ring; dma_addr_t rx_ring_dma; dma_addr_t tx_ring_dma; /* The addresses of rx/tx-in-place skbuffs. */ struct rx_ring_info rx_info[RX_RING_SIZE]; struct tx_ring_info tx_info[TX_RING_SIZE]; /* Pointers to completion queues (full pages). */ - struct rx_done_desc *rx_done_q; + rx_done_desc *rx_done_q; dma_addr_t rx_done_q_dma; unsigned int rx_done; - struct tx_done_report *tx_done_q; + struct tx_done_desc *tx_done_q; dma_addr_t tx_done_q_dma; unsigned int tx_done; struct net_device_stats stats; struct pci_dev *pci_dev; +#ifdef VLAN_SUPPORT + struct vlan_group *vlgrp; +#endif + void *queue_mem; + dma_addr_t queue_mem_dma; + size_t queue_mem_size; + /* Frequently used values: keep some adjacent for cache effect. */ spinlock_t lock; unsigned int cur_rx, dirty_rx; /* Producer/consumer ring indices */ - unsigned int cur_tx, dirty_tx; + unsigned int cur_tx, dirty_tx, reap_tx; unsigned int rx_buf_sz; /* Based on MTU+slack. 
*/ - unsigned int tx_full:1, /* The Tx queue is full. */ /* These values keep track of the transceiver/media in use. */ - speed100:1; /* Set if speed == 100MBit. */ - unsigned int intr_mitigation; + int speed100; /* Set if speed == 100MBit. */ u32 tx_mode; + u32 intr_timer_ctrl; u8 tx_threshold; /* MII transceiver section. */ struct mii_if_info mii_if; /* MII lib hooks/info */ @@ -597,7 +773,8 @@ static void init_ring(struct net_device static int start_tx(struct sk_buff *skb, struct net_device *dev); static void intr_handler(int irq, void *dev_instance, struct pt_regs *regs); static void netdev_error(struct net_device *dev, int intr_status); -static int netdev_rx(struct net_device *dev); +static int __netdev_rx(struct net_device *dev, int *quota); +static void refill_rx_ring(struct net_device *dev); static void netdev_error(struct net_device *dev, int intr_status); static void set_rx_mode(struct net_device *dev); static struct net_device_stats *get_stats(struct net_device *dev); @@ -606,6 +783,44 @@ static int netdev_close(struct net_devic static void netdev_media_change(struct net_device *dev); +#ifdef VLAN_SUPPORT +static void netdev_vlan_rx_register(struct net_device *dev, struct vlan_group *grp) +{ + struct netdev_private *np = dev->priv; + + spin_lock(&np->lock); + if (debug > 2) + printk("%s: Setting vlgrp to %p\n", dev->name, grp); + np->vlgrp = grp; + set_rx_mode(dev); + spin_unlock(&np->lock); +} + +static void netdev_vlan_rx_add_vid(struct net_device *dev, unsigned short vid) +{ + struct netdev_private *np = dev->priv; + + spin_lock(&np->lock); + if (debug > 1) + printk("%s: Adding vlanid %d to vlan filter\n", dev->name, vid); + set_rx_mode(dev); + spin_unlock(&np->lock); +} + +static void netdev_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid) +{ + struct netdev_private *np = dev->priv; + + spin_lock(&np->lock); + if (debug > 1) + printk("%s: removing vlanid %d from vlan filter\n", dev->name, vid); + if (np->vlgrp) + 
np->vlgrp->vlan_devices[vid] = NULL; + set_rx_mode(dev); + spin_unlock(&np->lock); +} +#endif /* VLAN_SUPPORT */ + static int __devinit starfire_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) @@ -617,10 +832,6 @@ static int __devinit starfire_init_one(s long ioaddr; int drv_flags, io_size; int boguscnt; -#ifndef HAVE_PCI_SET_MWI - u16 cmd; - u8 cache; -#endif /* when built into the kernel, we only print version if device is found */ #ifndef MODULE @@ -637,13 +848,13 @@ static int __devinit starfire_init_one(s ioaddr = pci_resource_start(pdev, 0); io_size = pci_resource_len(pdev, 0); if (!ioaddr || ((pci_resource_flags(pdev, 0) & IORESOURCE_MEM) == 0)) { - printk (KERN_ERR DRV_NAME " %d: no PCI MEM resources, aborting\n", card_idx); + printk(KERN_ERR DRV_NAME " %d: no PCI MEM resources, aborting\n", card_idx); return -ENODEV; } dev = alloc_etherdev(sizeof(*np)); if (!dev) { - printk (KERN_ERR DRV_NAME " %d: cannot alloc etherdev, aborting\n", card_idx); + printk(KERN_ERR DRV_NAME " %d: cannot alloc etherdev, aborting\n", card_idx); return -ENOMEM; } SET_MODULE_OWNER(dev); @@ -651,7 +862,7 @@ static int __devinit starfire_init_one(s irq = pdev->irq; if (pci_request_regions (pdev, dev->name)) { - printk (KERN_ERR DRV_NAME " %d: cannot reserve PCI resources, aborting\n", card_idx); + printk(KERN_ERR DRV_NAME " %d: cannot reserve PCI resources, aborting\n", card_idx); goto err_out_free_netdev; } @@ -659,7 +870,7 @@ static int __devinit starfire_init_one(s #if !defined(CONFIG_SPARC64) || LINUX_VERSION_CODE > 0x20300 ioaddr = (long) ioremap(ioaddr, io_size); if (!ioaddr) { - printk (KERN_ERR DRV_NAME " %d: cannot remap 0x%x @ 0x%lx, aborting\n", + printk(KERN_ERR DRV_NAME " %d: cannot remap %#x @ %#lx, aborting\n", card_idx, io_size, ioaddr); goto err_out_free_res; } @@ -667,29 +878,26 @@ static int __devinit starfire_init_one(s pci_set_master(pdev); -#ifdef HAVE_PCI_SET_MWI - pci_set_mwi(pdev); -#else /* enable MWI -- it vastly improves Rx performance on 
sparc64 */ - pci_read_config_word(pdev, PCI_COMMAND, &cmd); - cmd |= PCI_COMMAND_INVALIDATE; - pci_write_config_word(pdev, PCI_COMMAND, cmd); - - /* set PCI cache size */ - pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &cache); - if ((cache << 2) != SMP_CACHE_BYTES) { - printk(KERN_INFO " PCI cache line size set incorrectly " - "(%i bytes) by BIOS/FW, correcting to %i\n", - (cache << 2), SMP_CACHE_BYTES); - pci_write_config_byte(pdev, PCI_CACHE_LINE_SIZE, - SMP_CACHE_BYTES >> 2); - } -#endif + pci_set_mwi(pdev); +#ifdef MAX_SKB_FRAGS + dev->features |= NETIF_F_SG; +#endif /* MAX_SKB_FRAGS */ #ifdef ZEROCOPY - /* Starfire can do SG and TCP/UDP checksumming */ - dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM; + /* Starfire can do TCP/UDP checksumming */ + if (enable_hw_cksum) + dev->features |= NETIF_F_IP_CSUM; #endif /* ZEROCOPY */ +#ifdef VLAN_SUPPORT + dev->features |= NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_FILTER; + dev->vlan_rx_register = netdev_vlan_rx_register; + dev->vlan_rx_add_vid = netdev_vlan_rx_add_vid; + dev->vlan_rx_kill_vid = netdev_vlan_rx_kill_vid; +#endif /* VLAN_SUPPORT */ +#ifdef ADDR_64BITS + dev->features |= NETIF_F_HIGHDMA; +#endif /* ADDR_64BITS */ /* Serial EEPROM reads are hidden by the hardware. */ for (i = 0; i < 6; i++) @@ -704,7 +912,7 @@ static int __devinit starfire_init_one(s #endif /* Issue soft reset */ - writel(0x8000, ioaddr + TxMode); + writel(MiiSoftReset, ioaddr + TxMode); udelay(1000); writel(0, ioaddr + TxMode); @@ -750,15 +958,40 @@ static int __devinit starfire_init_one(s np->mii_if.full_duplex = 1; if (np->mii_if.full_duplex) - np->mii_if.force_media = 0; - else np->mii_if.force_media = 1; + else + np->mii_if.force_media = 0; np->speed100 = 1; + /* timer resolution is 128 * 0.8us */ + np->intr_timer_ctrl = (((intr_latency * 10) / 1024) & IntrLatencyMask) | + Timer10X | EnableIntrMasking; + + if (small_frames > 0) { + np->intr_timer_ctrl |= SmallFrameBypass; + switch (small_frames) { + case 1 ... 
64: + np->intr_timer_ctrl |= SmallFrame64; + break; + case 65 ... 128: + np->intr_timer_ctrl |= SmallFrame128; + break; + case 129 ... 256: + np->intr_timer_ctrl |= SmallFrame256; + break; + default: + np->intr_timer_ctrl |= SmallFrame512; + if (small_frames > 512) + printk("Adjusting small_frames down to 512\n"); + break; + } + } + /* The chip-specific entries in the device structure. */ dev->open = &netdev_open; dev->hard_start_xmit = &start_tx; init_tx_timer(dev, tx_timeout, TX_TIMEOUT); + init_poll(dev); dev->stop = &netdev_close; dev->get_stats = &get_stats; dev->set_multicast_list = &set_rx_mode; @@ -767,11 +1000,10 @@ static int __devinit starfire_init_one(s if (mtu) dev->mtu = mtu; - i = register_netdev(dev); - if (i) + if (register_netdev(dev)) goto err_out_cleardev; - printk(KERN_INFO "%s: %s at 0x%lx, ", + printk(KERN_INFO "%s: %s at %#lx, ", dev->name, netdrv_tbl[chip_idx].name, ioaddr); for (i = 0; i < 5; i++) printk("%2.2x:", dev->dev_addr[i]); @@ -796,7 +1028,7 @@ static int __devinit starfire_init_one(s np->phys[phy_idx++] = phy; np->mii_if.advertising = mdio_read(dev, phy, MII_ADVERTISE); printk(KERN_INFO "%s: MII PHY found at address %d, status " - "0x%4.4x advertising %4.4x.\n", + "%#4.4x advertising %#4.4x.\n", dev->name, phy, mii_status, np->mii_if.advertising); /* there can be only one PHY on-board */ break; @@ -809,14 +1041,8 @@ static int __devinit starfire_init_one(s memset(&np->mii_if, 0, sizeof(np->mii_if)); } -#ifdef ZEROCOPY - printk(KERN_INFO "%s: scatter-gather and hardware TCP cksumming enabled.\n", - dev->name); -#else /* not ZEROCOPY */ - printk(KERN_INFO "%s: scatter-gather and hardware TCP cksumming disabled.\n", - dev->name); -#endif /* not ZEROCOPY */ - + printk(KERN_INFO "%s: scatter-gather and hardware TCP cksumming %s.\n", + dev->name, enable_hw_cksum ? 
"enabled" : "disabled"); return 0; err_out_cleardev: @@ -825,7 +1051,6 @@ err_out_cleardev: err_out_free_res: pci_release_regions (pdev); err_out_free_netdev: - unregister_netdev(dev); kfree(dev); return -ENODEV; } @@ -861,6 +1086,7 @@ static int netdev_open(struct net_device struct netdev_private *np = dev->priv; long ioaddr = dev->base_addr; int i, retval; + size_t tx_done_q_size, rx_done_q_size, tx_ring_size, rx_ring_size; /* Do we ever need to reset the chip??? */ @@ -878,62 +1104,58 @@ static int netdev_open(struct net_device if (debug > 1) printk(KERN_DEBUG "%s: netdev_open() irq %d.\n", dev->name, dev->irq); - /* Allocate the various queues, failing gracefully. */ - if (np->tx_done_q == 0) - np->tx_done_q = pci_alloc_consistent(np->pci_dev, PAGE_SIZE, &np->tx_done_q_dma); - if (np->rx_done_q == 0) - np->rx_done_q = pci_alloc_consistent(np->pci_dev, sizeof(struct rx_done_desc) * DONE_Q_SIZE, &np->rx_done_q_dma); - if (np->tx_ring == 0) - np->tx_ring = pci_alloc_consistent(np->pci_dev, PAGE_SIZE, &np->tx_ring_dma); - if (np->rx_ring == 0) - np->rx_ring = pci_alloc_consistent(np->pci_dev, PAGE_SIZE, &np->rx_ring_dma); - if (np->tx_done_q == 0 || np->rx_done_q == 0 - || np->rx_ring == 0 || np->tx_ring == 0) { - if (np->tx_done_q) - pci_free_consistent(np->pci_dev, PAGE_SIZE, - np->tx_done_q, np->tx_done_q_dma); - if (np->rx_done_q) - pci_free_consistent(np->pci_dev, sizeof(struct rx_done_desc) * DONE_Q_SIZE, - np->rx_done_q, np->rx_done_q_dma); - if (np->tx_ring) - pci_free_consistent(np->pci_dev, PAGE_SIZE, - np->tx_ring, np->tx_ring_dma); - if (np->rx_ring) - pci_free_consistent(np->pci_dev, PAGE_SIZE, - np->rx_ring, np->rx_ring_dma); + /* Allocate the various queues. 
*/ + tx_done_q_size = ((sizeof(struct tx_done_desc) * DONE_Q_SIZE + QUEUE_ALIGN - 1) / QUEUE_ALIGN) * QUEUE_ALIGN; + rx_done_q_size = ((sizeof(rx_done_desc) * DONE_Q_SIZE + QUEUE_ALIGN - 1) / QUEUE_ALIGN) * QUEUE_ALIGN; + tx_ring_size = ((sizeof(starfire_tx_desc) * TX_RING_SIZE + QUEUE_ALIGN - 1) / QUEUE_ALIGN) * QUEUE_ALIGN; + rx_ring_size = sizeof(struct starfire_rx_desc) * RX_RING_SIZE; + np->queue_mem_size = tx_done_q_size + rx_done_q_size + tx_ring_size + rx_ring_size; + np->queue_mem = pci_alloc_consistent(np->pci_dev, np->queue_mem_size, &np->queue_mem_dma); + if (np->queue_mem == 0) { COMPAT_MOD_DEC_USE_COUNT; return -ENOMEM; } + np->tx_done_q = np->queue_mem; + np->tx_done_q_dma = np->queue_mem_dma; + np->rx_done_q = (void *) np->tx_done_q + tx_done_q_size; + np->rx_done_q_dma = np->tx_done_q_dma + tx_done_q_size; + np->tx_ring = (void *) np->rx_done_q + rx_done_q_size; + np->tx_ring_dma = np->rx_done_q_dma + rx_done_q_size; + np->rx_ring = (void *) np->tx_ring + tx_ring_size; + np->rx_ring_dma = np->tx_ring_dma + tx_ring_size; + + /* Start with no carrier, it gets adjusted later */ netif_carrier_off(dev); init_ring(dev); /* Set the size of the Rx buffers. */ writel((np->rx_buf_sz << RxBufferLenShift) | (0 << RxMinDescrThreshShift) | RxPrefetchMode | RxVariableQ | + RX_Q_ENTRIES | + RX_DESC_Q_ADDR_SIZE | RX_DESC_ADDR_SIZE | RxDescSpace4, ioaddr + RxDescQCtrl); -#ifdef ZEROCOPY - /* Set Tx descriptor to type 0 and spacing to 64 bytes. */ - writel((2 << TxHiPriFIFOThreshShift) | - (0 << TxPadLenShift) | - (4 << TxDMABurstSizeShift) | - TxDescSpace64 | TxDescType0, - ioaddr + TxDescCtrl); -#else /* not ZEROCOPY */ - /* Set Tx descriptor to type 1 and padding to 0 bytes. */ + /* Set up the Rx DMA controller. 
*/ + writel(RxChecksumIgnore | + (0 << RxEarlyIntThreshShift) | + (6 << RxHighPrioThreshShift) | + ((DMA_BURST_SIZE / 32) << RxBurstSizeShift), + ioaddr + RxDMACtrl); + + /* Set Tx descriptor */ writel((2 << TxHiPriFIFOThreshShift) | (0 << TxPadLenShift) | - (4 << TxDMABurstSizeShift) | - TxDescSpaceUnlim | TxDescType1, + ((DMA_BURST_SIZE / 32) << TxDMABurstSizeShift) | + TX_DESC_Q_ADDR_SIZE | + TX_DESC_SPACING | TX_DESC_TYPE, ioaddr + TxDescCtrl); -#endif /* not ZEROCOPY */ -#if defined(ADDR_64BITS) && defined(__alpha__) - /* XXX We really need a 64-bit PCI dma interfaces too... -DaveM */ - writel(np->rx_ring_dma >> 32, ioaddr + RxDescQHiAddr); - writel(np->tx_ring_dma >> 32, ioaddr + TxRingHiAddr); +#if defined(ADDR_64BITS) + writel(np->queue_mem_dma >> 32, ioaddr + RxDescQHiAddr); + writel(np->queue_mem_dma >> 32, ioaddr + TxRingHiAddr); + writel(np->queue_mem_dma >> 32, ioaddr + CompletionHiAddr); #else writel(0, ioaddr + RxDescQHiAddr); writel(0, ioaddr + TxRingHiAddr); @@ -943,32 +1165,23 @@ static int netdev_open(struct net_device writel(np->tx_ring_dma, ioaddr + TxRingPtr); writel(np->tx_done_q_dma, ioaddr + TxCompletionAddr); -#ifdef full_rx_status - writel(np->rx_done_q_dma | - RxComplType3 | - (0 << RxComplThreshShift), - ioaddr + RxCompletionAddr); -#else /* not full_rx_status */ -#ifdef csum_rx_status - writel(np->rx_done_q_dma | - RxComplType2 | - (0 << RxComplThreshShift), - ioaddr + RxCompletionAddr); -#else /* not csum_rx_status */ writel(np->rx_done_q_dma | - RxComplType0 | + RxComplType | (0 << RxComplThreshShift), ioaddr + RxCompletionAddr); -#endif /* not csum_rx_status */ -#endif /* not full_rx_status */ if (debug > 1) printk(KERN_DEBUG "%s: Filling in the station address.\n", dev->name); - /* Fill both the unused Tx SA register and the Rx perfect filter. */ + /* Fill both the Tx SA register and the Rx perfect filter. 
*/ for (i = 0; i < 6; i++) - writeb(dev->dev_addr[i], ioaddr + StationAddr + 5 - i); - for (i = 0; i < 16; i++) { + writeb(dev->dev_addr[i], ioaddr + TxStationAddr + 5 - i); + /* The first entry is special because it bypasses the VLAN filter. + Don't use it. */ + writew(0, ioaddr + PerfFilterTable); + writew(0, ioaddr + PerfFilterTable + 4); + writew(0, ioaddr + PerfFilterTable + 8); + for (i = 1; i < 16; i++) { u16 *eaddrs = (u16 *)dev->dev_addr; long setup_frm = ioaddr + PerfFilterTable + i * 16; writew(cpu_to_be16(eaddrs[2]), setup_frm); setup_frm += 4; @@ -978,16 +1191,14 @@ static int netdev_open(struct net_device /* Initialize other registers. */ /* Configure the PCI bus bursts and FIFO thresholds. */ - np->tx_mode = 0x0C04; /* modified when link is up. */ - writel(0x8000 | np->tx_mode, ioaddr + TxMode); + np->tx_mode = TxFlowEnable|RxFlowEnable|PadEnable; /* modified when link is up. */ + writel(MiiSoftReset | np->tx_mode, ioaddr + TxMode); udelay(1000); writel(np->tx_mode, ioaddr + TxMode); np->tx_threshold = 4; writel(np->tx_threshold, ioaddr + TxThreshold); - interrupt_mitigation &= 0x1f; - np->intr_mitigation = interrupt_mitigation; - writel(np->intr_mitigation, ioaddr + IntrTimerCtrl); + writel(np->intr_timer_ctrl, ioaddr + IntrTimerCtrl); netif_start_if(dev); netif_start_queue(dev); @@ -1002,29 +1213,35 @@ static int netdev_open(struct net_device /* Enable GPIO interrupts on link change */ writel(0x0f00ff00, ioaddr + GPIOCtrl); - /* Set the interrupt mask and enable PCI interrupts. */ + /* Set the interrupt mask */ writel(IntrRxDone | IntrRxEmpty | IntrDMAErr | - IntrTxDone | IntrStatsMax | IntrLinkChange | - IntrNormalSummary | IntrAbnormalSummary | + IntrTxDMADone | IntrStatsMax | IntrLinkChange | IntrRxGFPDead | IntrNoTxCsum | IntrTxBadID, ioaddr + IntrEnable); + /* Enable PCI interrupts. 
*/ writel(0x00800000 | readl(ioaddr + PCIDeviceConfig), ioaddr + PCIDeviceConfig); +#ifdef VLAN_SUPPORT + /* Set VLAN type to 802.1q */ + writel(ETH_P_8021Q, ioaddr + VlanType); +#endif /* VLAN_SUPPORT */ + #ifdef HAS_FIRMWARE /* Load Rx/Tx firmware into the frame processors */ for (i = 0; i < FIRMWARE_RX_SIZE * 2; i++) writel(firmware_rx[i], ioaddr + RxGfpMem + i * 4); for (i = 0; i < FIRMWARE_TX_SIZE * 2; i++) writel(firmware_tx[i], ioaddr + TxGfpMem + i * 4); - /* Enable the Rx and Tx units, and the Rx/Tx frame processors. */ - writel(0x003F, ioaddr + GenCtrl); -#else /* not HAS_FIRMWARE */ - /* Enable the Rx and Tx units only. */ - writel(0x000F, ioaddr + GenCtrl); -#endif /* not HAS_FIRMWARE */ +#endif /* HAS_FIRMWARE */ + if (enable_hw_cksum) + /* Enable the Rx and Tx units, and the Rx/Tx frame processors. */ + writel(TxEnable|TxGFPEnable|RxEnable|RxGFPEnable, ioaddr + GenCtrl); + else + /* Enable the Rx and Tx units only. */ + writel(TxEnable|RxEnable, ioaddr + GenCtrl); - if (debug > 2) + if (debug > 1) printk(KERN_DEBUG "%s: Done netdev_open().\n", dev->name); @@ -1036,11 +1253,17 @@ static void check_duplex(struct net_devi { struct netdev_private *np = dev->priv; u16 reg0; + int silly_count = 1000; mdio_write(dev, np->phys[0], MII_ADVERTISE, np->mii_if.advertising); mdio_write(dev, np->phys[0], MII_BMCR, BMCR_RESET); udelay(500); - while (mdio_read(dev, np->phys[0], MII_BMCR) & BMCR_RESET); + while (--silly_count && mdio_read(dev, np->phys[0], MII_BMCR) & BMCR_RESET) + /* do nothing */; + if (!silly_count) { + printk("%s: MII reset failed!\n", dev->name); + return; + } reg0 = mdio_read(dev, np->phys[0], MII_BMCR); @@ -1065,25 +1288,22 @@ static void tx_timeout(struct net_device { struct netdev_private *np = dev->priv; long ioaddr = dev->base_addr; + int old_debug; - printk(KERN_WARNING "%s: Transmit timed out, status %8.8x," - " resetting...\n", dev->name, (int)readl(ioaddr + IntrStatus)); - -#ifndef __alpha__ - { - int i; - printk(KERN_DEBUG " Rx ring 
%p: ", np->rx_ring); - for (i = 0; i < RX_RING_SIZE; i++) - printk(" %8.8x", (unsigned int)le32_to_cpu(np->rx_ring[i].rxaddr)); - printk("\n"KERN_DEBUG" Tx ring %p: ", np->tx_ring); - for (i = 0; i < TX_RING_SIZE; i++) - printk(" %4.4x", le32_to_cpu(np->tx_ring[i].status)); - printk("\n"); - } -#endif + printk(KERN_WARNING "%s: Transmit timed out, status %#8.8x, " + "resetting...\n", dev->name, (int) readl(ioaddr + IntrStatus)); /* Perhaps we should reinitialize the hardware here. */ - /* Stop and restart the chip's Tx processes . */ + + /* + * Stop and restart the interface. + * Cheat and increase the debug level temporarily. + */ + old_debug = debug; + debug = 2; + netdev_close(dev); + netdev_open(dev); + debug = old_debug; /* Trigger an immediate transmit demand. */ @@ -1099,9 +1319,8 @@ static void init_ring(struct net_device struct netdev_private *np = dev->priv; int i; - np->tx_full = 0; - np->cur_rx = np->cur_tx = 0; - np->dirty_rx = np->rx_done = np->dirty_tx = np->tx_done = 0; + np->cur_rx = np->cur_tx = np->reap_tx = 0; + np->dirty_rx = np->dirty_tx = np->rx_done = np->tx_done = 0; np->rx_buf_sz = (dev->mtu <= 1500 ? PKT_BUF_SZ : dev->mtu + 32); @@ -1114,7 +1333,7 @@ static void init_ring(struct net_device np->rx_info[i].mapping = pci_map_single(np->pci_dev, skb->tail, np->rx_buf_sz, PCI_DMA_FROMDEVICE); skb->dev = dev; /* Mark as being used by this device. */ /* Grrr, we cannot offset to correctly align the IP header. */ - np->rx_ring[i].rxaddr = cpu_to_le32(np->rx_info[i].mapping | RxDescValid); + np->rx_ring[i].rxaddr = cpu_to_dma(np->rx_info[i].mapping | RxDescValid); } writew(i - 1, dev->base_addr + RxDescQIdx); np->dirty_rx = (unsigned int)(i - RX_RING_SIZE); @@ -1126,7 +1345,7 @@ static void init_ring(struct net_device np->rx_info[i].mapping = 0; } /* Mark the last entry as wrapping the ring. 
*/ - np->rx_ring[i-1].rxaddr |= cpu_to_le32(RxDescEndRing); + np->rx_ring[RX_RING_SIZE - 1].rxaddr |= cpu_to_dma(RxDescEndRing); /* Clear the completion rings. */ for (i = 0; i < DONE_Q_SIZE; i++) { @@ -1134,18 +1353,9 @@ static void init_ring(struct net_device np->tx_done_q[i].status = 0; } - for (i = 0; i < TX_RING_SIZE; i++) { - np->tx_info[i].skb = NULL; - np->tx_info[i].first_mapping = 0; -#ifdef ZEROCOPY - { - int j; - for (j = 0; j < MAX_STARFIRE_FRAGS; j++) - np->tx_info[i].frag_mapping[j] = 0; - } -#endif /* ZEROCOPY */ - np->tx_ring[i].status = 0; - } + for (i = 0; i < TX_RING_SIZE; i++) + memset(&np->tx_info[i], 0, sizeof(np->tx_info[i])); + return; } @@ -1154,19 +1364,21 @@ static int start_tx(struct sk_buff *skb, { struct netdev_private *np = dev->priv; unsigned int entry; -#ifdef ZEROCOPY + u32 status; int i; -#endif kick_tx_timer(dev, tx_timeout, TX_TIMEOUT); - /* Caution: the write order is important here, set the field - with the "ownership" bits last. */ - - /* Calculate the next Tx descriptor entry. */ - entry = np->cur_tx % TX_RING_SIZE; + /* + * be cautious here, wrapping the queue has weird semantics + * and we may not have enough slots even when it seems we do. 
+ */ + if ((np->cur_tx - np->dirty_tx) + skb_num_frags(skb) * 2 > TX_RING_SIZE) { + netif_stop_queue(dev); + return 1; + } -#if defined(ZEROCOPY) && defined(HAS_FIRMWARE) && defined(HAS_BROKEN_FIRMWARE) +#if defined(ZEROCOPY) && defined(HAS_BROKEN_FIRMWARE) { int has_bad_length = 0; @@ -1183,85 +1395,72 @@ static int start_tx(struct sk_buff *skb, if (has_bad_length) skb_checksum_help(skb); } -#endif /* ZEROCOPY && HAS_FIRMWARE && HAS_BROKEN_FIRMWARE */ - - np->tx_info[entry].skb = skb; - np->tx_info[entry].first_mapping = - pci_map_single(np->pci_dev, skb->data, skb_first_frag_len(skb), PCI_DMA_TODEVICE); +#endif /* ZEROCOPY && HAS_BROKEN_FIRMWARE */ - np->tx_ring[entry].first_addr = cpu_to_le32(np->tx_info[entry].first_mapping); -#ifdef ZEROCOPY - np->tx_ring[entry].first_len = cpu_to_le16(skb_first_frag_len(skb)); - np->tx_ring[entry].total_len = cpu_to_le16(skb->len); - /* Add "| TxDescIntr" to generate Tx-done interrupts. */ - np->tx_ring[entry].status = cpu_to_le32(TxDescID | TxCRCEn); - np->tx_ring[entry].nbufs = cpu_to_le32(skb_shinfo(skb)->nr_frags + 1); -#else /* not ZEROCOPY */ - /* Add "| TxDescIntr" to generate Tx-done interrupts. 
*/ - np->tx_ring[entry].status = cpu_to_le32(skb->len | TxDescID | TxCRCEn | 1 << 16); -#endif /* not ZEROCOPY */ - - if (entry >= TX_RING_SIZE-1) /* Wrap ring */ - np->tx_ring[entry].status |= cpu_to_le32(TxRingWrap | TxDescIntr); - -#ifdef ZEROCOPY - if (skb->ip_summed == CHECKSUM_HW) { - np->tx_ring[entry].status |= cpu_to_le32(TxCalTCP); - np->stats.tx_compressed++; - } -#endif /* ZEROCOPY */ - - if (debug > 5) { -#ifdef ZEROCOPY - printk(KERN_DEBUG "%s: Tx #%d slot %d status %8.8x nbufs %d len %4.4x/%4.4x.\n", - dev->name, np->cur_tx, entry, - le32_to_cpu(np->tx_ring[entry].status), - le32_to_cpu(np->tx_ring[entry].nbufs), - le32_to_cpu(np->tx_ring[entry].first_len), - le32_to_cpu(np->tx_ring[entry].total_len)); -#else /* not ZEROCOPY */ - printk(KERN_DEBUG "%s: Tx #%d slot %d status %8.8x.\n", - dev->name, np->cur_tx, entry, - le32_to_cpu(np->tx_ring[entry].status)); -#endif /* not ZEROCOPY */ - } - -#ifdef ZEROCOPY - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - skb_frag_t *this_frag = &skb_shinfo(skb)->frags[i]; + entry = np->cur_tx % TX_RING_SIZE; + for (i = 0; i < skb_num_frags(skb); i++) { + int wrap_ring = 0; + status = TxDescID; + + if (i == 0) { + np->tx_info[entry].skb = skb; + status |= TxCRCEn; + if (entry >= TX_RING_SIZE - skb_num_frags(skb)) { + status |= TxRingWrap; + wrap_ring = 1; + } + if (np->reap_tx) { + status |= TxDescIntr; + np->reap_tx = 0; + } + if (skb->ip_summed == CHECKSUM_HW) { + status |= TxCalTCP; + np->stats.tx_compressed++; + } + status |= skb_first_frag_len(skb) | (skb_num_frags(skb) << 16); - /* we already have the proper value in entry */ - np->tx_info[entry].frag_mapping[i] = - pci_map_single(np->pci_dev, page_address(this_frag->page) + this_frag->page_offset, this_frag->size, PCI_DMA_TODEVICE); - - np->tx_ring[entry].frag[i].addr = cpu_to_le32(np->tx_info[entry].frag_mapping[i]); - np->tx_ring[entry].frag[i].len = cpu_to_le32(this_frag->size); - if (debug > 5) { - printk(KERN_DEBUG "%s: Tx #%d frag %d len %4.4x.\n", 
- dev->name, np->cur_tx, i, - le32_to_cpu(np->tx_ring[entry].frag[i].len)); - } + np->tx_info[entry].mapping = + pci_map_single(np->pci_dev, skb->data, skb_first_frag_len(skb), PCI_DMA_TODEVICE); + } else { +#ifdef MAX_SKB_FRAGS + skb_frag_t *this_frag = &skb_shinfo(skb)->frags[i - 1]; + status |= this_frag->size; + np->tx_info[entry].mapping = + pci_map_single(np->pci_dev, page_address(this_frag->page) + this_frag->page_offset, this_frag->size, PCI_DMA_TODEVICE); +#endif /* MAX_SKB_FRAGS */ + } + + np->tx_ring[entry].addr = cpu_to_dma(np->tx_info[entry].mapping); + np->tx_ring[entry].status = cpu_to_le32(status); + if (debug > 3) + printk(KERN_DEBUG "%s: Tx #%d/#%d slot %d status %#8.8x.\n", + dev->name, np->cur_tx, np->dirty_tx, + entry, status); + if (wrap_ring) { + np->tx_info[entry].used_slots = TX_RING_SIZE - entry; + np->cur_tx += np->tx_info[entry].used_slots; + entry = 0; + } else { + np->tx_info[entry].used_slots = 1; + np->cur_tx += np->tx_info[entry].used_slots; + entry++; + } + /* scavenge the tx descriptors twice per TX_RING_SIZE */ + if (np->cur_tx % (TX_RING_SIZE / 2) == 0) + np->reap_tx = 1; } -#endif /* ZEROCOPY */ - - np->cur_tx++; - - if (entry >= TX_RING_SIZE-1) /* Wrap ring */ - entry = -1; - entry++; /* Non-x86: explicitly flush descriptor cache lines here. */ - /* Ensure everything is written back above before the transmit is + /* Ensure all descriptors are written back before the transmit is initiated. - Jes */ wmb(); /* Update the producer index. */ - writel(entry * (sizeof(struct starfire_tx_desc) / 8), dev->base_addr + TxProducerIdx); + writel(entry * (sizeof(starfire_tx_desc) / 8), dev->base_addr + TxProducerIdx); - if (np->cur_tx - np->dirty_tx >= TX_RING_SIZE - 1) { - np->tx_full = 1; + /* 4 is arbitrary, but should be ok */ + if ((np->cur_tx - np->dirty_tx) + 4 > TX_RING_SIZE) netif_stop_queue(dev); - } dev->trans_start = jiffies; @@ -1273,20 +1472,13 @@ static int start_tx(struct sk_buff *skb, after the Tx thread. 
*/ static void intr_handler(int irq, void *dev_instance, struct pt_regs *rgs) { - struct net_device *dev = (struct net_device *)dev_instance; + struct net_device *dev = dev_instance; struct netdev_private *np; long ioaddr; int boguscnt = max_interrupt_work; int consumer; int tx_status; -#ifndef final_version /* Can never occur. */ - if (dev == NULL) { - printk (KERN_ERR "Netdev interrupt handler(): IRQ %d for unknown device.\n", irq); - return; - } -#endif - ioaddr = dev->base_addr; np = dev->priv; @@ -1294,83 +1486,69 @@ static void intr_handler(int irq, void * u32 intr_status = readl(ioaddr + IntrClear); if (debug > 4) - printk(KERN_DEBUG "%s: Interrupt status %4.4x.\n", + printk(KERN_DEBUG "%s: Interrupt status %#8.8x.\n", dev->name, intr_status); - if (intr_status == 0) + if (intr_status == 0 || intr_status == (u32) -1) break; - if (intr_status & IntrRxDone) - netdev_rx(dev); + if (intr_status & (IntrRxDone | IntrRxEmpty)) + netdev_rx(dev, ioaddr); /* Scavenge the skbuff list based on the Tx-done queue. There are redundant checks here that may be cleaned up after the driver has proven to be reliable. 
*/ consumer = readl(ioaddr + TxConsumerIdx); - if (debug > 4) + if (debug > 3) printk(KERN_DEBUG "%s: Tx Consumer index is %d.\n", dev->name, consumer); -#if 0 - if (np->tx_done >= 250 || np->tx_done == 0) - printk(KERN_DEBUG "%s: Tx completion entry %d is %8.8x, %d is %8.8x.\n", - dev->name, np->tx_done, - le32_to_cpu(np->tx_done_q[np->tx_done].status), - (np->tx_done+1) & (DONE_Q_SIZE-1), - le32_to_cpu(np->tx_done_q[(np->tx_done+1)&(DONE_Q_SIZE-1)].status)); -#endif while ((tx_status = le32_to_cpu(np->tx_done_q[np->tx_done].status)) != 0) { - if (debug > 4) - printk(KERN_DEBUG "%s: Tx completion entry %d is %8.8x.\n", - dev->name, np->tx_done, tx_status); + if (debug > 3) + printk(KERN_DEBUG "%s: Tx completion #%d entry %d is %#8.8x.\n", + dev->name, np->dirty_tx, np->tx_done, tx_status); if ((tx_status & 0xe0000000) == 0xa0000000) { np->stats.tx_packets++; } else if ((tx_status & 0xe0000000) == 0x80000000) { - struct sk_buff *skb; -#ifdef ZEROCOPY - int i; -#endif /* ZEROCOPY */ - u16 entry = tx_status; /* Implicit truncate */ - entry /= sizeof(struct starfire_tx_desc); - - skb = np->tx_info[entry].skb; + u16 entry = (tx_status & 0x7fff) / sizeof(starfire_tx_desc); + struct sk_buff *skb = np->tx_info[entry].skb; np->tx_info[entry].skb = NULL; pci_unmap_single(np->pci_dev, - np->tx_info[entry].first_mapping, + np->tx_info[entry].mapping, skb_first_frag_len(skb), PCI_DMA_TODEVICE); - np->tx_info[entry].first_mapping = 0; - -#ifdef ZEROCOPY - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - pci_unmap_single(np->pci_dev, - np->tx_info[entry].frag_mapping[i], - skb_shinfo(skb)->frags[i].size, - PCI_DMA_TODEVICE); - np->tx_info[entry].frag_mapping[i] = 0; + np->tx_info[entry].mapping = 0; + np->dirty_tx += np->tx_info[entry].used_slots; + entry = (entry + np->tx_info[entry].used_slots) % TX_RING_SIZE; +#ifdef MAX_SKB_FRAGS + { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + pci_unmap_single(np->pci_dev, + np->tx_info[entry].mapping, + 
skb_shinfo(skb)->frags[i].size, + PCI_DMA_TODEVICE); + np->dirty_tx++; + entry++; + } } -#endif /* ZEROCOPY */ - - /* Scavenge the descriptor. */ +#endif /* MAX_SKB_FRAGS */ dev_kfree_skb_irq(skb); - - np->dirty_tx++; } np->tx_done_q[np->tx_done].status = 0; - np->tx_done = (np->tx_done+1) & (DONE_Q_SIZE-1); + np->tx_done = (np->tx_done + 1) % DONE_Q_SIZE; } writew(np->tx_done, ioaddr + CompletionQConsumerIdx + 2); - if (np->tx_full && np->cur_tx - np->dirty_tx < TX_RING_SIZE - 4) { + if (netif_queue_stopped(dev) && + (np->cur_tx - np->dirty_tx + 4 < TX_RING_SIZE)) { /* The ring is no longer full, wake the queue. */ - np->tx_full = 0; netif_wake_queue(dev); } /* Stats overflow */ - if (intr_status & IntrStatsMax) { + if (intr_status & IntrStatsMax) get_stats(dev); - } /* Media change interrupt. */ if (intr_status & IntrLinkChange) @@ -1381,72 +1559,58 @@ static void intr_handler(int irq, void * netdev_error(dev, intr_status); if (--boguscnt < 0) { - printk(KERN_WARNING "%s: Too much work at interrupt, " - "status=0x%4.4x.\n", - dev->name, intr_status); + if (debug > 1) + printk(KERN_WARNING "%s: Too much work at interrupt, " + "status=%#8.8x.\n", + dev->name, intr_status); break; } } while (1); if (debug > 4) - printk(KERN_DEBUG "%s: exiting interrupt, status=%#4.4x.\n", - dev->name, (int)readl(ioaddr + IntrStatus)); - -#ifndef final_version - /* Code that should never be run! Remove after testing.. */ - { - static int stopit = 10; - if (!netif_running(dev) && --stopit < 0) { - printk(KERN_ERR "%s: Emergency stop, looping startup interrupt.\n", - dev->name); - free_irq(irq, dev); - } - } -#endif + printk(KERN_DEBUG "%s: exiting interrupt, status=%#8.8x.\n", + dev->name, (int) readl(ioaddr + IntrStatus)); } -/* This routine is logically part of the interrupt handler, but separated - for clarity and better register allocation. 
*/ -static int netdev_rx(struct net_device *dev) +/* This routine is logically part of the interrupt/poll handler, but separated + for clarity, code sharing between NAPI/non-NAPI, and better register allocation. */ +static int __netdev_rx(struct net_device *dev, int *quota) { struct netdev_private *np = dev->priv; - int boguscnt = np->dirty_rx + RX_RING_SIZE - np->cur_rx; u32 desc_status; - - if (np->rx_done_q == 0) { - printk(KERN_ERR "%s: rx_done_q is NULL! rx_done is %d. %p.\n", - dev->name, np->rx_done, np->tx_done_q); - return 0; - } + int retcode = 0; /* If EOP is set on the next entry, it's a new packet. Send it up. */ while ((desc_status = le32_to_cpu(np->rx_done_q[np->rx_done].status)) != 0) { struct sk_buff *skb; u16 pkt_len; int entry; + rx_done_desc *desc = &np->rx_done_q[np->rx_done]; if (debug > 4) - printk(KERN_DEBUG " netdev_rx() status of %d was %8.8x.\n", np->rx_done, desc_status); - if (--boguscnt < 0) - break; - if ( ! (desc_status & RxOK)) { + printk(KERN_DEBUG " netdev_rx() status of %d was %#8.8x.\n", np->rx_done, desc_status); + if (!(desc_status & RxOK)) { /* There was a error. */ if (debug > 2) - printk(KERN_DEBUG " netdev_rx() Rx error was %8.8x.\n", desc_status); + printk(KERN_DEBUG " netdev_rx() Rx error was %#8.8x.\n", desc_status); np->stats.rx_errors++; if (desc_status & RxFIFOErr) np->stats.rx_fifo_errors++; goto next_rx; } + if (*quota <= 0) { /* out of rx quota */ + retcode = 1; + goto out; + } + (*quota)--; + pkt_len = desc_status; /* Implicitly Truncate */ entry = (desc_status >> 16) & 0x7ff; -#ifndef final_version if (debug > 4) - printk(KERN_DEBUG " netdev_rx() normal Rx pkt length %d, bogus_cnt %d.\n", pkt_len, boguscnt); -#endif + printk(KERN_DEBUG " netdev_rx() normal Rx pkt length %d, quota %d.\n", pkt_len, *quota); /* Check if the packet is long enough to accept without copying to a minimally-sized skbuff. 
*/ if (pkt_len < rx_copybreak @@ -1456,12 +1620,8 @@ static int netdev_rx(struct net_device * pci_dma_sync_single(np->pci_dev, np->rx_info[entry].mapping, pkt_len, PCI_DMA_FROMDEVICE); -#if HAS_IP_COPYSUM /* Call copy + cksum if available. */ eth_copy_and_sum(skb, np->rx_info[entry].skb->tail, pkt_len, 0); skb_put(skb, pkt_len); -#else - memcpy(skb_put(skb, pkt_len), np->rx_info[entry].skb->tail, pkt_len); -#endif } else { pci_unmap_single(np->pci_dev, np->rx_info[entry].mapping, np->rx_buf_sz, PCI_DMA_FROMDEVICE); skb = np->rx_info[entry].skb; @@ -1473,51 +1633,109 @@ static int netdev_rx(struct net_device * /* You will want this info for the initial debug. */ if (debug > 5) printk(KERN_DEBUG " Rx data %2.2x:%2.2x:%2.2x:%2.2x:%2.2x:" - "%2.2x %2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x %2.2x%2.2x " - "%d.%d.%d.%d.\n", + "%2.2x %2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x %2.2x%2.2x.\n", skb->data[0], skb->data[1], skb->data[2], skb->data[3], skb->data[4], skb->data[5], skb->data[6], skb->data[7], skb->data[8], skb->data[9], skb->data[10], - skb->data[11], skb->data[12], skb->data[13], - skb->data[14], skb->data[15], skb->data[16], - skb->data[17]); + skb->data[11], skb->data[12], skb->data[13]); #endif + skb->protocol = eth_type_trans(skb, dev); -#if defined(full_rx_status) || defined(csum_rx_status) - if (le32_to_cpu(np->rx_done_q[np->rx_done].status2) & 0x01000000) { +#if defined(HAS_FIRMWARE) || defined(VLAN_SUPPORT) + if (debug > 4) + printk(KERN_DEBUG " netdev_rx() status2 of %d was %#4.4x.\n", np->rx_done, le16_to_cpu(desc->status2)); +#endif +#ifdef HAS_FIRMWARE + if (le16_to_cpu(desc->status2) & 0x0100) { skb->ip_summed = CHECKSUM_UNNECESSARY; np->stats.rx_compressed++; } /* * This feature doesn't seem to be working, at least * with the two firmware versions I have. If the GFP sees - * a fragment, it either ignores it completely, or reports + * an IP fragment, it either ignores it completely, or reports * "bad checksum" on it. 
* * Maybe I missed something -- corrections are welcome. * Until then, the printk stays. :-) -Ion */ - else if (le32_to_cpu(np->rx_done_q[np->rx_done].status2) & 0x00400000) { + else if (le16_to_cpu(desc->status2) & 0x0040) { skb->ip_summed = CHECKSUM_HW; - skb->csum = le32_to_cpu(np->rx_done_q[np->rx_done].status2) & 0xffff; - printk(KERN_DEBUG "%s: checksum_hw, status2 = %x\n", dev->name, np->rx_done_q[np->rx_done].status2); + skb->csum = le16_to_cpu(desc->csum); + printk(KERN_DEBUG "%s: checksum_hw, status2 = %#x\n", dev->name, le16_to_cpu(desc->status2)); } -#endif - netif_rx(skb); +#endif /* HAS_FIRMWARE */ +#ifdef VLAN_SUPPORT + if (np->vlgrp && le16_to_cpu(desc->status2) & 0x0200) { + if (debug > 4) + printk(KERN_DEBUG " netdev_rx() vlanid = %d\n", le16_to_cpu(desc->vlanid)); + /* vlan_hwaccel_receive_skb() expects a packet with the VLAN tag stripped out */ + vlan_hwaccel_receive_skb(skb, np->vlgrp, le16_to_cpu(desc->vlanid) & VLAN_VID_MASK); + } else +#endif /* VLAN_SUPPORT */ + netif_receive_skb(skb); dev->last_rx = jiffies; np->stats.rx_packets++; -next_rx: + next_rx: np->cur_rx++; - np->rx_done_q[np->rx_done].status = 0; - np->rx_done = (np->rx_done + 1) & (DONE_Q_SIZE-1); + desc->status = 0; + np->rx_done = (np->rx_done + 1) % DONE_Q_SIZE; } writew(np->rx_done, dev->base_addr + CompletionQConsumerIdx); + out: + refill_rx_ring(dev); + if (debug > 5) + printk(KERN_DEBUG " exiting netdev_rx(): %d, status of %d was %#8.8x.\n", + retcode, np->rx_done, desc_status); + return retcode; +} + + +#ifdef HAVE_NETDEV_POLL +static int netdev_poll(struct net_device *dev, int *budget) +{ + u32 intr_status; + long ioaddr = dev->base_addr; + int retcode = 0, quota = dev->quota; + + do { + writel(IntrRxDone | IntrRxEmpty, ioaddr + IntrClear); + + retcode = __netdev_rx(dev, &quota); + *budget -= (dev->quota - quota); + dev->quota = quota; + if (retcode) + goto out; + + intr_status = readl(ioaddr + IntrStatus); + } while (intr_status & (IntrRxDone | IntrRxEmpty)); + +
netif_rx_complete(dev); + intr_status = readl(ioaddr + IntrEnable); + intr_status |= IntrRxDone | IntrRxEmpty; + writel(intr_status, ioaddr + IntrEnable); + + out: + if (debug > 5) + printk(KERN_DEBUG " exiting netdev_poll(): %d.\n", retcode); + + /* Restart Rx engine if stopped. */ + return retcode; +} +#endif /* HAVE_NETDEV_POLL */ + + +static void refill_rx_ring(struct net_device *dev) +{ + struct netdev_private *np = dev->priv; + struct sk_buff *skb; + int entry = -1; + /* Refill the Rx ring buffers. */ for (; np->cur_rx - np->dirty_rx > 0; np->dirty_rx++) { - struct sk_buff *skb; - int entry = np->dirty_rx % RX_RING_SIZE; + entry = np->dirty_rx % RX_RING_SIZE; if (np->rx_info[entry].skb == NULL) { skb = dev_alloc_skb(np->rx_buf_sz); np->rx_info[entry].skb = skb; @@ -1527,20 +1745,13 @@ next_rx: pci_map_single(np->pci_dev, skb->tail, np->rx_buf_sz, PCI_DMA_FROMDEVICE); skb->dev = dev; /* Mark as being used by this device. */ np->rx_ring[entry].rxaddr = - cpu_to_le32(np->rx_info[entry].mapping | RxDescValid); + cpu_to_dma(np->rx_info[entry].mapping | RxDescValid); } if (entry == RX_RING_SIZE - 1) - np->rx_ring[entry].rxaddr |= cpu_to_le32(RxDescEndRing); - /* We could defer this until later... */ - writew(entry, dev->base_addr + RxDescQIdx); + np->rx_ring[entry].rxaddr |= cpu_to_dma(RxDescEndRing); } - - if (debug > 5) - printk(KERN_DEBUG " exiting netdev_rx() status of %d was %8.8x.\n", - np->rx_done, desc_status); - - /* Restart Rx engine if stopped. */ - return 0; + if (entry >= 0) + writew(entry, dev->base_addr + RxDescQIdx); } @@ -1550,6 +1761,7 @@ static void netdev_media_change(struct n long ioaddr = dev->base_addr; u16 reg0, reg1, reg4, reg5; u32 new_tx_mode; + u32 new_intr_timer_ctrl; /* reset status first */ mdio_read(dev, np->phys[0], MII_BMCR); @@ -1594,15 +1806,23 @@ static void netdev_media_change(struct n np->speed100 ? "100" : "10", np->mii_if.full_duplex ? 
"full" : "half"); - new_tx_mode = np->tx_mode & ~0x2; /* duplex setting */ + new_tx_mode = np->tx_mode & ~FullDuplex; /* duplex setting */ if (np->mii_if.full_duplex) - new_tx_mode |= 2; + new_tx_mode |= FullDuplex; if (np->tx_mode != new_tx_mode) { np->tx_mode = new_tx_mode; - writel(np->tx_mode | 0x8000, ioaddr + TxMode); + writel(np->tx_mode | MiiSoftReset, ioaddr + TxMode); udelay(1000); writel(np->tx_mode, ioaddr + TxMode); } + + new_intr_timer_ctrl = np->intr_timer_ctrl & ~Timer10X; + if (np->speed100) + new_intr_timer_ctrl |= Timer10X; + if (np->intr_timer_ctrl != new_intr_timer_ctrl) { + np->intr_timer_ctrl = new_intr_timer_ctrl; + writel(new_intr_timer_ctrl, ioaddr + IntrTimerCtrl); + } } else { netif_carrier_off(dev); printk(KERN_DEBUG "%s: Link is down\n", dev->name); @@ -1616,9 +1836,12 @@ static void netdev_error(struct net_devi /* Came close to underrunning the Tx FIFO, increase threshold. */ if (intr_status & IntrTxDataLow) { - writel(++np->tx_threshold, dev->base_addr + TxThreshold); - printk(KERN_NOTICE "%s: Increasing Tx FIFO threshold to %d bytes\n", - dev->name, np->tx_threshold * 16); + if (np->tx_threshold <= PKT_BUF_SZ / 16) { + writel(++np->tx_threshold, dev->base_addr + TxThreshold); + printk(KERN_NOTICE "%s: Increasing Tx FIFO threshold to %d bytes\n", + dev->name, np->tx_threshold * 16); + } else + printk(KERN_WARNING "%s: PCI Tx underflow -- adapter is probably malfunctioning\n", dev->name); } if (intr_status & IntrRxGFPDead) { np->stats.rx_fifo_errors++; @@ -1629,7 +1852,7 @@ static void netdev_error(struct net_devi np->stats.tx_errors++; } if ((intr_status & ~(IntrNormalMask | IntrAbnormalSummary | IntrLinkChange | IntrStatsMax | IntrTxDataLow | IntrRxGFPDead | IntrNoTxCsum | IntrPCIPad)) && debug) - printk(KERN_ERR "%s: Something Wicked happened! %4.4x.\n", + printk(KERN_ERR "%s: Something Wicked happened! 
%#8.8x.\n", dev->name, intr_status); } @@ -1664,39 +1887,67 @@ static struct net_device_stats *get_stat /* Chips may use the upper or lower CRC bits, and may reverse and/or invert them. Select the endian-ness that results in minimal calculations. */ - static void set_rx_mode(struct net_device *dev) { long ioaddr = dev->base_addr; - u32 rx_mode; + u32 rx_mode = MinVLANPrio; struct dev_mc_list *mclist; int i; +#ifdef VLAN_SUPPORT + struct netdev_private *np = dev->priv; + + rx_mode |= VlanMode; + if (np->vlgrp) { + int vlan_count = 0; + long filter_addr = ioaddr + HashTable + 8; + for (i = 0; i < VLAN_VID_MASK; i++) { + if (np->vlgrp->vlan_devices[i]) { + if (vlan_count >= 32) + break; + writew(cpu_to_be16(i), filter_addr); + filter_addr += 16; + vlan_count++; + } + } + if (i == VLAN_VID_MASK) { + rx_mode |= PerfectFilterVlan; + while (vlan_count < 32) { + writew(0, filter_addr); + filter_addr += 16; + vlan_count++; + } + } + } +#endif /* VLAN_SUPPORT */ if (dev->flags & IFF_PROMISC) { /* Set promiscuous. */ - rx_mode = AcceptBroadcast|AcceptAllMulticast|AcceptAll|AcceptMyPhys; + rx_mode |= AcceptAll; } else if ((dev->mc_count > multicast_filter_limit) || (dev->flags & IFF_ALLMULTI)) { /* Too many to match, or accept all multicasts. */ - rx_mode = AcceptBroadcast|AcceptAllMulticast|AcceptMyPhys; - } else if (dev->mc_count <= 15) { - /* Use the 16 element perfect filter, skip first entry. */ - long filter_addr = ioaddr + PerfFilterTable + 1 * 16; - for (i = 1, mclist = dev->mc_list; mclist && i <= dev->mc_count; + rx_mode |= AcceptBroadcast|AcceptAllMulticast|PerfectFilter; + } else if (dev->mc_count <= 14) { + /* Use the 16 element perfect filter, skip first two entries. 
*/ + long filter_addr = ioaddr + PerfFilterTable + 2 * 16; + u16 *eaddrs; + for (i = 2, mclist = dev->mc_list; mclist && i < dev->mc_count + 2; i++, mclist = mclist->next) { - u16 *eaddrs = (u16 *)mclist->dmi_addr; + eaddrs = (u16 *)mclist->dmi_addr; writew(cpu_to_be16(eaddrs[2]), filter_addr); filter_addr += 4; writew(cpu_to_be16(eaddrs[1]), filter_addr); filter_addr += 4; writew(cpu_to_be16(eaddrs[0]), filter_addr); filter_addr += 8; } + eaddrs = (u16 *)dev->dev_addr; while (i++ < 16) { - writew(0xffff, filter_addr); filter_addr += 4; - writew(0xffff, filter_addr); filter_addr += 4; - writew(0xffff, filter_addr); filter_addr += 8; + writew(cpu_to_be16(eaddrs[0]), filter_addr); filter_addr += 4; + writew(cpu_to_be16(eaddrs[1]), filter_addr); filter_addr += 4; + writew(cpu_to_be16(eaddrs[2]), filter_addr); filter_addr += 8; } - rx_mode = AcceptBroadcast | AcceptMyPhys; + rx_mode |= AcceptBroadcast|PerfectFilter; } else { /* Must use a multicast hash table. */ long filter_addr; + u16 *eaddrs; u16 mc_filter[32] __attribute__ ((aligned(sizeof(long)))); /* Multicast hash filter */ memset(mc_filter, 0, sizeof(mc_filter)); @@ -1707,17 +1958,19 @@ static void set_rx_mode(struct net_devic *fptr |= cpu_to_le32(1 << (bit_nr & 31)); } - /* Clear the perfect filter list, skip first entry. */ - filter_addr = ioaddr + PerfFilterTable + 1 * 16; - for (i = 1; i < 16; i++) { - writew(0xffff, filter_addr); filter_addr += 4; - writew(0xffff, filter_addr); filter_addr += 4; - writew(0xffff, filter_addr); filter_addr += 8; + /* Clear the perfect filter list, skip first two entries. 
*/ + filter_addr = ioaddr + PerfFilterTable + 2 * 16; + eaddrs = (u16 *)dev->dev_addr; + for (i = 2; i < 16; i++) { + writew(cpu_to_be16(eaddrs[0]), filter_addr); filter_addr += 4; + writew(cpu_to_be16(eaddrs[1]), filter_addr); filter_addr += 4; + writew(cpu_to_be16(eaddrs[2]), filter_addr); filter_addr += 8; } - for (filter_addr = ioaddr + HashTable, i=0; i < 32; filter_addr+= 16, i++) + for (filter_addr = ioaddr + HashTable, i = 0; i < 32; filter_addr+= 16, i++) writew(mc_filter[i], filter_addr); - rx_mode = AcceptBroadcast | AcceptMulticast | AcceptMyPhys; + rx_mode |= AcceptBroadcast|PerfectFilter|HashFilter; } + wmb(); writel(rx_mode, ioaddr + RxFilterMode); } @@ -1763,6 +2016,7 @@ static int netdev_ethtool_ioctl(struct n spin_lock_irq(&np->lock); r = mii_ethtool_sset(&np->mii_if, &ecmd); spin_unlock_irq(&np->lock); + check_duplex(dev); return r; } /* restart autonegotiation */ @@ -1816,7 +2070,7 @@ static int netdev_ioctl(struct net_devic spin_lock_irq(&np->lock); rc = generic_mii_ioctl(&np->mii_if, data, cmd, NULL); spin_unlock_irq(&np->lock); - + if ((cmd == SIOCSMIIREG) && (data->phy_id == np->phys[0])) check_duplex(dev); } @@ -1834,41 +2088,42 @@ static int netdev_close(struct net_devic netif_stop_if(dev); if (debug > 1) { - printk(KERN_DEBUG "%s: Shutting down ethercard, Intr status %4.4x.\n", - dev->name, (int)readl(ioaddr + IntrStatus)); - printk(KERN_DEBUG "%s: Queue pointers were Tx %d / %d, Rx %d / %d.\n", - dev->name, np->cur_tx, np->dirty_tx, np->cur_rx, np->dirty_rx); + printk(KERN_DEBUG "%s: Shutting down ethercard, Intr status %#8.8x.\n", + dev->name, (int) readl(ioaddr + IntrStatus)); + printk(KERN_DEBUG "%s: Queue pointers were Tx %d / %d, Rx %d / %d.\n", + dev->name, np->cur_tx, np->dirty_tx, + np->cur_rx, np->dirty_rx); } /* Disable interrupts by clearing the interrupt mask. */ writel(0, ioaddr + IntrEnable); /* Stop the chip's Tx and Rx processes. 
*/ + writel(0, ioaddr + GenCtrl); + readl(ioaddr + GenCtrl); -#ifdef __i386__ - if (debug > 2) { - printk("\n"KERN_DEBUG" Tx ring at %9.9Lx:\n", - (u64) np->tx_ring_dma); + if (debug > 5) { + printk(KERN_DEBUG" Tx ring at %#llx:\n", + (long long) np->tx_ring_dma); for (i = 0; i < 8 /* TX_RING_SIZE is huge! */; i++) - printk(KERN_DEBUG " #%d desc. %8.8x %8.8x -> %8.8x.\n", + printk(KERN_DEBUG " #%d desc. %#8.8x %#llx -> %#8.8x.\n", i, le32_to_cpu(np->tx_ring[i].status), - le32_to_cpu(np->tx_ring[i].first_addr), + (long long) dma_to_cpu(np->tx_ring[i].addr), le32_to_cpu(np->tx_done_q[i].status)); - printk(KERN_DEBUG " Rx ring at %9.9Lx -> %p:\n", - (u64) np->rx_ring_dma, np->rx_done_q); + printk(KERN_DEBUG " Rx ring at %#llx -> %p:\n", + (long long) np->rx_ring_dma, np->rx_done_q); if (np->rx_done_q) for (i = 0; i < 8 /* RX_RING_SIZE */; i++) { - printk(KERN_DEBUG " #%d desc. %8.8x -> %8.8x\n", - i, le32_to_cpu(np->rx_ring[i].rxaddr), le32_to_cpu(np->rx_done_q[i].status)); + printk(KERN_DEBUG " #%d desc. %#llx -> %#8.8x\n", + i, (long long) dma_to_cpu(np->rx_ring[i].rxaddr), le32_to_cpu(np->rx_done_q[i].status)); } } -#endif /* __i386__ debugging only */ free_irq(dev->irq, dev); /* Free all the skbuffs in the Rx queue. */ for (i = 0; i < RX_RING_SIZE; i++) { - np->rx_ring[i].rxaddr = cpu_to_le32(0xBADF00D0); /* An invalid address. */ + np->rx_ring[i].rxaddr = cpu_to_dma(0xBADF00D0); /* An invalid address. 
*/ if (np->rx_info[i].skb != NULL) { pci_unmap_single(np->pci_dev, np->rx_info[i].mapping, np->rx_buf_sz, PCI_DMA_FROMDEVICE); dev_kfree_skb(np->rx_info[i].skb); @@ -1878,28 +2133,14 @@ static int netdev_close(struct net_devic } for (i = 0; i < TX_RING_SIZE; i++) { struct sk_buff *skb = np->tx_info[i].skb; -#ifdef ZEROCOPY - int j; -#endif /* ZEROCOPY */ if (skb == NULL) continue; pci_unmap_single(np->pci_dev, - np->tx_info[i].first_mapping, + np->tx_info[i].mapping, skb_first_frag_len(skb), PCI_DMA_TODEVICE); - np->tx_info[i].first_mapping = 0; + np->tx_info[i].mapping = 0; dev_kfree_skb(skb); np->tx_info[i].skb = NULL; -#ifdef ZEROCOPY - for (j = 0; j < MAX_STARFIRE_FRAGS; j++) - if (np->tx_info[i].frag_mapping[j]) { - pci_unmap_single(np->pci_dev, - np->tx_info[i].frag_mapping[j], - skb_shinfo(skb)->frags[j].size, - PCI_DMA_TODEVICE); - np->tx_info[i].frag_mapping[j] = 0; - } else - break; -#endif /* ZEROCOPY */ } COMPAT_MOD_DEC_USE_COUNT; @@ -1917,19 +2158,7 @@ static void __devexit starfire_remove_on BUG(); np = dev->priv; - if (np->tx_done_q) - pci_free_consistent(pdev, PAGE_SIZE, - np->tx_done_q, np->tx_done_q_dma); - if (np->rx_done_q) - pci_free_consistent(pdev, - sizeof(struct rx_done_desc) * DONE_Q_SIZE, - np->rx_done_q, np->rx_done_q_dma); - if (np->tx_ring) - pci_free_consistent(pdev, PAGE_SIZE, - np->tx_ring, np->tx_ring_dma); - if (np->rx_ring) - pci_free_consistent(pdev, PAGE_SIZE, - np->rx_ring, np->rx_ring_dma); + pci_free_consistent(pdev, np->queue_mem_size, np->queue_mem, np->queue_mem_dma); unregister_netdev(dev); iounmap((char *)dev->base_addr); @@ -1954,6 +2183,17 @@ static int __init starfire_init (void) #ifdef MODULE printk(version); #endif +#ifndef ADDR_64BITS + /* we can do this test only at run-time... 
sigh */ + if (sizeof(dma_addr_t) == sizeof(u64)) { + printk("This driver has not been ported to this 64-bit architecture yet\n"); + return -ENODEV; + } +#endif /* not ADDR_64BITS */ +#ifndef HAS_FIRMWARE + /* unconditionally disable hw cksums if firmware is not present */ + enable_hw_cksum = 0; +#endif /* not HAS_FIRMWARE */ return pci_module_init (&starfire_driver); } @@ -1970,8 +2210,6 @@ module_exit(starfire_cleanup); /* * Local variables: - * compile-command: "gcc -DMODULE -Wall -Wstrict-prototypes -O2 -c starfire.c" - * simple-compile-command: "gcc -DMODULE -O2 -c starfire.c" * c-basic-offset: 8 * tab-width: 8 * End: diff -urpN -X /home/fletch/.diff.exclude 00-virgin/drivers/serial/8250.c 90-mjb/drivers/serial/8250.c --- 00-virgin/drivers/serial/8250.c Mon Jan 13 21:09:12 2003 +++ 90-mjb/drivers/serial/8250.c Sat Feb 1 22:09:06 2003 @@ -2031,9 +2031,116 @@ void serial8250_get_irq_map(unsigned int } } -static int __init serial8250_init(void) +#ifdef CONFIG_X86_REMOTE_DEBUG +/* + * Takes: + * ttyS - integer specifying which serial port to use for debugging + * baud - baud rate of specified serial port + * Returns: + * port for use by the gdb serial driver + */ +int gdb_serial_setup(int ttyS, int baud, int *port, int *irq) +{ + struct uart_8250_port *up; + unsigned cval; + int bits = 8; + int parity = 'n'; + int cflag = CREAD | HUPCL | CLOCAL; + int quot = 0; + + /* + * Now construct a cflag setting. 
+ */ + switch(baud) { + case 1200: + cflag |= B1200; + break; + case 2400: + cflag |= B2400; + break; + case 4800: + cflag |= B4800; + break; + case 19200: + cflag |= B19200; + break; + case 38400: + cflag |= B38400; + break; + case 57600: + cflag |= B57600; + break; + case 115200: + cflag |= B115200; + break; + case 9600: + default: + cflag |= B9600; + break; + } + switch(bits) { + case 7: + cflag |= CS7; + break; + default: + case 8: + cflag |= CS8; + break; + } + switch(parity) { + case 'o': case 'O': + cflag |= PARODD; + break; + case 'e': case 'E': + cflag |= PARENB; + break; + } + + /* + * Divisor, bytesize and parity + */ + + up = &serial8250_ports[ttyS]; +// ser->flags &= ~ASYNC_BOOT_AUTOCONF; + quot = ( 1843200 / 16 ) / baud; + cval = cflag & (CSIZE | CSTOPB); + cval >>= 4; + if (cflag & PARENB) + cval |= UART_LCR_PARITY; + if (!(cflag & PARODD)) + cval |= UART_LCR_EPAR; + + /* + * Disable UART interrupts, set DTR and RTS high + * and set speed. + */ + cval = 0x3; + serial_outp(up, UART_LCR, cval | UART_LCR_DLAB); /* set DLAB */ + serial_outp(up, UART_DLL, quot & 0xff); /* LS of divisor */ + serial_outp(up, UART_DLM, quot >> 8); /* MS of divisor */ + serial_outp(up, UART_LCR, cval); /* reset DLAB */ + serial_outp(up, UART_IER, UART_IER_RDI); /* turn on interrupts*/ + serial_outp(up, UART_MCR, UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS); + + /* + * If we read 0xff from the LSR, there is no UART here. + */ + if (serial_inp(up, UART_LSR) == 0xff) + return 1; + *port = up->port.iobase; + *irq = up->port.irq; +// serial8250_shutdown(&up->port); + return 0; +} +#endif + +int serial8250_init(void) { int ret, i; + static int didit = 0; + + if (didit++) + return 0; printk(KERN_INFO "Serial: 8250/16550 driver $Revision: 1.90 $ " "IRQ sharing %sabled\n", share_irqs ? 
"en" : "dis"); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/drivers/serial/core.c 90-mjb/drivers/serial/core.c --- 00-virgin/drivers/serial/core.c Mon Jan 13 21:09:12 2003 +++ 90-mjb/drivers/serial/core.c Sat Feb 1 22:09:06 2003 @@ -36,6 +36,10 @@ #include #include /* for serial_state and serial_icounter_struct */ +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + #include #include @@ -1040,6 +1044,17 @@ uart_ioctl(struct tty_struct *tty, struc (unsigned int *)arg); break; +#ifdef CONFIG_X86_REMOTE_DEBUG + case TIOCGDB: + ret = -ENOTTY; + if (capable(CAP_SYS_ADMIN)) { + gdb_ttyS = minor(tty->device) & 0x03F; + gdb_baud = tty_get_baud_rate(tty); + ret = gdb_hook(); + } + break; +#endif + case TIOCMBIS: case TIOCMBIC: case TIOCMSET: @@ -1115,6 +1130,30 @@ uart_ioctl(struct tty_struct *tty, struc } return ret; } + + /* + * ------------------------------------------------------------ + * Serial GDB driver (most in gdbserial.c) + * ------------------------------------------------------------ + */ + +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_GDB_CONSOLE +static struct console gdbcons = { + name: "gdb", + write: gdb_console_write, + flags: CON_PRINTBUFFER | CON_ENABLED, + index: -1, +}; +#endif + +#ifdef CONFIG_GDB_CONSOLE +void __init gdb_console_init(void) +{ + register_console(&gdbcons); +} +#endif +#endif /* CONFIG_X86_REMOTE_DEBUG */ static void uart_set_termios(struct tty_struct *tty, struct termios *old_termios) { diff -urpN -X /home/fletch/.diff.exclude 00-virgin/fs/dcache.c 90-mjb/fs/dcache.c --- 00-virgin/fs/dcache.c Sun Dec 1 10:00:00 2002 +++ 90-mjb/fs/dcache.c Sat Feb 1 21:58:05 2003 @@ -24,6 +24,7 @@ #include #include #include +#include #include #define DCACHE_PARANOIA 1 @@ -55,6 +56,15 @@ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; +static void d_callback(void *arg) +{ + struct dentry * dentry = (struct dentry *)arg; + + if (dname_external(dentry)) + kfree(dentry->d_name.name); + kmem_cache_free(dentry_cache, dentry); +} + /* * no 
dcache_lock, please. The caller must decrement dentry_stat.nr_dentry * inside dcache_lock. @@ -63,9 +73,7 @@ static void d_free(struct dentry *dentry { if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); - if (dname_external(dentry)) - kfree(dentry->d_name.name); - kmem_cache_free(dentry_cache, dentry); + call_rcu(&dentry->d_rcu, d_callback, dentry); } /* @@ -126,9 +134,13 @@ repeat: if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) return; - /* dput on a free dentry? */ - if (!list_empty(&dentry->d_lru)) - BUG(); + spin_lock(&dentry->d_lock); + if (atomic_read(&dentry->d_count)) { + spin_unlock(&dentry->d_lock); + spin_unlock(&dcache_lock); + return; + } + /* * AV: ->d_delete() is _NOT_ allowed to block now. */ @@ -139,8 +151,12 @@ repeat: /* Unreachable? Get rid of it */ if (d_unhashed(dentry)) goto kill_it; - list_add(&dentry->d_lru, &dentry_unused); - dentry_stat.nr_unused++; + if (list_empty(&dentry->d_lru)) { + dentry->d_vfs_flags &= ~DCACHE_REFERENCED; + list_add(&dentry->d_lru, &dentry_unused); + dentry_stat.nr_unused++; + } + spin_unlock(&dentry->d_lock); dentry->d_vfs_flags |= DCACHE_REFERENCED; spin_unlock(&dcache_lock); return; @@ -150,7 +166,12 @@ unhash_it: kill_it: { struct dentry *parent; - list_del(&dentry->d_child); + if (!list_empty(&dentry->d_lru)) { + list_del(&dentry->d_lru); + dentry_stat.nr_unused--; + } + list_del(&dentry->d_child); + spin_unlock(&dentry->d_lock); dentry_stat.nr_dentry--; /* For d_free, below */ /* drops the lock, at that point nobody can reach this dentry */ dentry_iput(dentry); @@ -222,6 +243,7 @@ int d_invalidate(struct dentry * dentry) static inline struct dentry * __dget_locked(struct dentry *dentry) { atomic_inc(&dentry->d_count); + dentry->d_vfs_flags |= DCACHE_REFERENCED; if (atomic_read(&dentry->d_count) == 1) { dentry_stat.nr_unused--; list_del_init(&dentry->d_lru); @@ -289,8 +311,8 @@ restart: struct dentry *dentry = list_entry(tmp, struct dentry, d_alias); if 
(!atomic_read(&dentry->d_count)) { __dget_locked(dentry); + __d_drop(dentry); spin_unlock(&dcache_lock); - d_drop(dentry); dput(dentry); goto restart; } @@ -310,6 +332,7 @@ static inline void prune_one_dentry(stru __d_drop(dentry); list_del(&dentry->d_child); + spin_unlock(&dentry->d_lock); dentry_stat.nr_dentry--; /* For d_free, below */ dentry_iput(dentry); parent = dentry->d_parent; @@ -343,18 +366,20 @@ static void prune_dcache(int count) if (tmp == &dentry_unused) break; list_del_init(tmp); + dentry_stat.nr_unused--; dentry = list_entry(tmp, struct dentry, d_lru); + spin_lock(&dentry->d_lock); /* If the dentry was recently referenced, don't free it. */ if (dentry->d_vfs_flags & DCACHE_REFERENCED) { dentry->d_vfs_flags &= ~DCACHE_REFERENCED; - list_add(&dentry->d_lru, &dentry_unused); + if (!atomic_read(&dentry->d_count)) { + list_add(&dentry->d_lru, &dentry_unused); + dentry_stat.nr_unused++; + } + spin_unlock(&dentry->d_lock); continue; } - dentry_stat.nr_unused--; - - /* Unused dentry with a count? 
*/ - BUG_ON(atomic_read(&dentry->d_count)); prune_one_dentry(dentry); } spin_unlock(&dcache_lock); @@ -414,10 +439,13 @@ repeat: dentry = list_entry(tmp, struct dentry, d_lru); if (dentry->d_sb != sb) continue; - if (atomic_read(&dentry->d_count)) - continue; dentry_stat.nr_unused--; list_del_init(tmp); + spin_lock(&dentry->d_lock); + if (atomic_read(&dentry->d_count)) { + spin_unlock(&dentry->d_lock); + continue; + } prune_one_dentry(dentry); goto repeat; } @@ -497,8 +525,8 @@ resume: struct list_head *tmp = next; struct dentry *dentry = list_entry(tmp, struct dentry, d_child); next = tmp->next; + list_del_init(&dentry->d_lru); if (!atomic_read(&dentry->d_count)) { - list_del(&dentry->d_lru); list_add(&dentry->d_lru, dentry_unused.prev); found++; } @@ -561,8 +589,8 @@ void shrink_dcache_anon(struct list_head spin_lock(&dcache_lock); list_for_each(lp, head) { struct dentry *this = list_entry(lp, struct dentry, d_hash); + list_del(&this->d_lru); if (!atomic_read(&this->d_count)) { - list_del(&this->d_lru); list_add_tail(&this->d_lru, &dentry_unused); found++; } @@ -648,7 +676,8 @@ struct dentry * d_alloc(struct dentry * str[name->len] = 0; atomic_set(&dentry->d_count, 1); - dentry->d_vfs_flags = 0; + dentry->d_vfs_flags = DCACHE_UNHASHED; + dentry->d_lock = SPIN_LOCK_UNLOCKED; dentry->d_flags = 0; dentry->d_inode = NULL; dentry->d_parent = NULL; @@ -785,12 +814,15 @@ struct dentry * d_alloc_anon(struct inod res = tmp; tmp = NULL; if (res) { + spin_lock(&res->d_lock); res->d_sb = inode->i_sb; res->d_parent = res; res->d_inode = inode; res->d_flags |= DCACHE_DISCONNECTED; + res->d_vfs_flags &= ~DCACHE_UNHASHED; list_add(&res->d_alias, &inode->i_dentry); list_add(&res->d_hash, &inode->i_sb->s_anon); + spin_unlock(&res->d_lock); } inode = NULL; /* don't drop reference */ } @@ -859,30 +891,16 @@ struct dentry *d_splice_alias(struct ino struct dentry * d_lookup(struct dentry * parent, struct qstr * name) { - struct dentry * dentry; - spin_lock(&dcache_lock); - dentry = 
__d_lookup(parent,name); - if (dentry) - __dget_locked(dentry); - spin_unlock(&dcache_lock); - return dentry; -} - -struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) -{ - unsigned int len = name->len; unsigned int hash = name->hash; const unsigned char *str = name->name; struct list_head *head = d_hash(parent,hash); struct list_head *tmp; + struct dentry *found = NULL; - tmp = head->next; - for (;;) { + rcu_read_lock(); + __list_for_each_rcu(tmp, head) { struct dentry * dentry = list_entry(tmp, struct dentry, d_hash); - if (tmp == head) - break; - tmp = tmp->next; if (dentry->d_name.hash != hash) continue; if (dentry->d_parent != parent) @@ -896,9 +914,14 @@ struct dentry * __d_lookup(struct dentry if (memcmp(dentry->d_name.name, str, len)) continue; } - return dentry; - } - return NULL; + spin_lock(&dentry->d_lock); + if (!d_unhashed(dentry)) + found = dget(dentry); + spin_unlock(&dentry->d_lock); + break; + } + rcu_read_unlock(); + return found; } /** @@ -937,7 +960,7 @@ int d_validate(struct dentry *dentry, st lhp = base = d_hash(dparent, dentry->d_name.hash); while ((lhp = lhp->next) != base) { if (dentry == list_entry(lhp, struct dentry, d_hash)) { - __dget_locked(dentry); + dget(dentry); spin_unlock(&dcache_lock); return 1; } @@ -974,17 +997,18 @@ void d_delete(struct dentry * dentry) * Are we the only user? */ spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); if (atomic_read(&dentry->d_count) == 1) { + spin_unlock(&dentry->d_lock); dentry_iput(dentry); return; } - spin_unlock(&dcache_lock); - /* - * If not, just drop the dentry and let dput - * pick up the tab.. 
- */ - d_drop(dentry); + if (!d_unhashed(dentry)) + __d_drop(dentry); + + spin_unlock(&dentry->d_lock); + spin_unlock(&dcache_lock); } /** @@ -997,9 +1021,10 @@ void d_delete(struct dentry * dentry) void d_rehash(struct dentry * entry) { struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); - if (!d_unhashed(entry)) BUG(); spin_lock(&dcache_lock); - list_add(&entry->d_hash, list); + if (!list_empty(&entry->d_hash) && !d_unhashed(entry)) BUG(); + entry->d_vfs_flags &= ~DCACHE_UNHASHED; + list_add_rcu(&entry->d_hash, list); spin_unlock(&dcache_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/fs/namei.c 90-mjb/fs/namei.c --- 00-virgin/fs/namei.c Mon Jan 13 21:09:27 2003 +++ 90-mjb/fs/namei.c Sat Feb 1 21:58:03 2003 @@ -286,27 +286,6 @@ static struct dentry * cached_lookup(str return dentry; } -/*for fastwalking*/ -static inline void unlock_nd(struct nameidata *nd) -{ - struct vfsmount *mnt = nd->old_mnt; - struct dentry *dentry = nd->old_dentry; - mntget(nd->mnt); - dget_locked(nd->dentry); - nd->old_mnt = NULL; - nd->old_dentry = NULL; - spin_unlock(&dcache_lock); - dput(dentry); - mntput(mnt); -} - -static inline void lock_nd(struct nameidata *nd) -{ - spin_lock(&dcache_lock); - nd->old_mnt = nd->mnt; - nd->old_dentry = nd->dentry; -} - /* * Short-cut version of permission(), for calling by * path_walk(), when dcache lock is held. 
Combines parts @@ -451,11 +430,18 @@ static int follow_mount(struct vfsmount { int res = 0; while (d_mountpoint(*dentry)) { - struct vfsmount *mounted = lookup_mnt(*mnt, *dentry); - if (!mounted) + struct vfsmount *mounted; + spin_lock(&dcache_lock); + mounted = lookup_mnt(*mnt, *dentry); + if (!mounted) { + spin_unlock(&dcache_lock); break; - *mnt = mounted; - *dentry = mounted->mnt_root; + } + *mnt = mntget(mounted); + spin_unlock(&dcache_lock); + dput(*dentry); + mntput(mounted->mnt_parent); + *dentry = dget(mounted->mnt_root); res = 1; } return res; @@ -488,17 +474,32 @@ static inline void follow_dotdot(struct { while(1) { struct vfsmount *parent; + struct dentry *old = *dentry; + + read_lock(&current->fs->lock); if (*dentry == current->fs->root && - *mnt == current->fs->rootmnt) + *mnt == current->fs->rootmnt) { + read_unlock(&current->fs->lock); break; + } + read_unlock(&current->fs->lock); + spin_lock(&dcache_lock); if (*dentry != (*mnt)->mnt_root) { - *dentry = (*dentry)->d_parent; + *dentry = dget((*dentry)->d_parent); + spin_unlock(&dcache_lock); + dput(old); break; } - parent=(*mnt)->mnt_parent; - if (parent == *mnt) + parent = (*mnt)->mnt_parent; + if (parent == *mnt) { + spin_unlock(&dcache_lock); break; - *dentry=(*mnt)->mnt_mountpoint; + } + mntget(parent); + *dentry = dget((*mnt)->mnt_mountpoint); + spin_unlock(&dcache_lock); + dput(old); + mntput(*mnt); *mnt = parent; } follow_mount(mnt, dentry); @@ -515,14 +516,13 @@ struct path { * It _is_ time-critical. 
*/ static int do_lookup(struct nameidata *nd, struct qstr *name, - struct path *path, struct path *cached_path, - int flags) + struct path *path, int flags) { struct vfsmount *mnt = nd->mnt; - struct dentry *dentry = __d_lookup(nd->dentry, name); + struct dentry *dentry = d_lookup(nd->dentry, name); if (!dentry) - goto dcache_miss; + goto need_lookup; if (dentry->d_op && dentry->d_op->d_revalidate) goto need_revalidate; done: @@ -530,36 +530,21 @@ done: path->dentry = dentry; return 0; -dcache_miss: - unlock_nd(nd); - need_lookup: dentry = real_lookup(nd->dentry, name, LOOKUP_CONTINUE); if (IS_ERR(dentry)) goto fail; - mntget(mnt); -relock: - dput(cached_path->dentry); - mntput(cached_path->mnt); - cached_path->mnt = mnt; - cached_path->dentry = dentry; - lock_nd(nd); goto done; need_revalidate: - mntget(mnt); - dget_locked(dentry); - unlock_nd(nd); if (dentry->d_op->d_revalidate(dentry, flags)) - goto relock; + goto done; if (d_invalidate(dentry)) - goto relock; + goto done; dput(dentry); - mntput(mnt); goto need_lookup; fail: - lock_nd(nd); return PTR_ERR(dentry); } @@ -573,7 +558,7 @@ fail: */ int link_path_walk(const char * name, struct nameidata *nd) { - struct path next, pinned = {NULL, NULL}; + struct path next; struct inode *inode; int err; unsigned int lookup_flags = nd->flags; @@ -594,10 +579,8 @@ int link_path_walk(const char * name, st unsigned int c; err = exec_permission_lite(inode); - if (err == -EAGAIN) { - unlock_nd(nd); + if (err == -EAGAIN) { err = permission(inode, MAY_EXEC); - lock_nd(nd); } if (err) break; @@ -648,7 +631,7 @@ int link_path_walk(const char * name, st break; } /* This does the actual lookups.. */ - err = do_lookup(nd, &this, &next, &pinned, LOOKUP_CONTINUE); + err = do_lookup(nd, &this, &next, LOOKUP_CONTINUE); if (err) break; /* Check mountpoints.. 
*/ @@ -657,21 +640,16 @@ int link_path_walk(const char * name, st err = -ENOENT; inode = next.dentry->d_inode; if (!inode) - break; + goto out_dput; err = -ENOTDIR; if (!inode->i_op) - break; + goto out_dput; if (inode->i_op->follow_link) { - mntget(next.mnt); - dget_locked(next.dentry); - unlock_nd(nd); err = do_follow_link(next.dentry, nd); dput(next.dentry); - mntput(next.mnt); if (err) goto return_err; - lock_nd(nd); err = -ENOENT; inode = nd->dentry->d_inode; if (!inode) @@ -680,6 +658,7 @@ int link_path_walk(const char * name, st if (!inode->i_op) break; } else { + dput(nd->dentry); nd->mnt = next.mnt; nd->dentry = next.dentry; } @@ -711,24 +690,20 @@ last_component: if (err < 0) break; } - err = do_lookup(nd, &this, &next, &pinned, 0); + err = do_lookup(nd, &this, &next, 0); if (err) break; follow_mount(&next.mnt, &next.dentry); inode = next.dentry->d_inode; if ((lookup_flags & LOOKUP_FOLLOW) && inode && inode->i_op && inode->i_op->follow_link) { - mntget(next.mnt); - dget_locked(next.dentry); - unlock_nd(nd); err = do_follow_link(next.dentry, nd); dput(next.dentry); - mntput(next.mnt); if (err) goto return_err; inode = nd->dentry->d_inode; - lock_nd(nd); } else { + dput(nd->dentry); nd->mnt = next.mnt; nd->dentry = next.dentry; } @@ -751,23 +726,19 @@ lookup_parent: else if (this.len == 2 && this.name[1] == '.') nd->last_type = LAST_DOTDOT; return_base: - unlock_nd(nd); - dput(pinned.dentry); - mntput(pinned.mnt); return 0; +out_dput: + dput(next.dentry); + break; } - unlock_nd(nd); path_release(nd); return_err: - dput(pinned.dentry); - mntput(pinned.mnt); return err; } int path_walk(const char * name, struct nameidata *nd) { current->total_link_count = 0; - lock_nd(nd); return link_path_walk(name, nd); } @@ -855,28 +826,24 @@ int path_lookup(const char *name, unsign { nd->last_type = LAST_ROOT; /* if there are only slashes... 
*/ nd->flags = flags; + + read_lock(&current->fs->lock); if (*name=='/') { - read_lock(&current->fs->lock); if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) { nd->mnt = mntget(current->fs->altrootmnt); nd->dentry = dget(current->fs->altroot); read_unlock(&current->fs->lock); if (__emul_lookup_dentry(name,nd)) return 0; - } else { - read_unlock(&current->fs->lock); } - spin_lock(&dcache_lock); - nd->mnt = current->fs->rootmnt; - nd->dentry = current->fs->root; + nd->mnt = mntget(current->fs->rootmnt); + nd->dentry = dget(current->fs->root); } else{ - spin_lock(&dcache_lock); - nd->mnt = current->fs->pwdmnt; - nd->dentry = current->fs->pwd; + nd->mnt = mntget(current->fs->pwdmnt); + nd->dentry = dget(current->fs->pwd); } - nd->old_mnt = NULL; - nd->old_dentry = NULL; + read_unlock(&current->fs->lock); current->total_link_count = 0; return link_path_walk(name, nd); } @@ -2117,7 +2084,6 @@ __vfs_follow_link(struct nameidata *nd, /* weird __emul_prefix() stuff did it */ goto out; } - lock_nd(nd); res = link_path_walk(link, nd); out: if (current->link_count || res || nd->last_type!=LAST_NORM) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/fs/namespace.c 90-mjb/fs/namespace.c --- 00-virgin/fs/namespace.c Fri Dec 13 23:18:10 2002 +++ 90-mjb/fs/namespace.c Sat Feb 1 21:58:03 2003 @@ -892,12 +892,10 @@ void set_fs_root(struct fs_struct *fs, s struct dentry *old_root; struct vfsmount *old_rootmnt; write_lock(&fs->lock); - spin_lock(&dcache_lock); old_root = fs->root; old_rootmnt = fs->rootmnt; fs->rootmnt = mntget(mnt); fs->root = dget(dentry); - spin_unlock(&dcache_lock); write_unlock(&fs->lock); if (old_root) { dput(old_root); @@ -916,12 +914,10 @@ void set_fs_pwd(struct fs_struct *fs, st struct vfsmount *old_pwdmnt; write_lock(&fs->lock); - spin_lock(&dcache_lock); old_pwd = fs->pwd; old_pwdmnt = fs->pwdmnt; fs->pwdmnt = mntget(mnt); fs->pwd = dget(dentry); - spin_unlock(&dcache_lock); write_unlock(&fs->lock); if (old_pwd) { diff -urpN -X /home/fletch/.diff.exclude 
00-virgin/fs/proc/proc_misc.c 90-mjb/fs/proc/proc_misc.c --- 00-virgin/fs/proc/proc_misc.c Fri Jan 17 09:18:30 2003 +++ 90-mjb/fs/proc/proc_misc.c Sat Feb 1 22:09:09 2003 @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -130,6 +131,40 @@ static int uptime_read_proc(char *page, return proc_calc_metrics(page, start, off, count, eof, len); } +struct vmalloc_info { + unsigned long used; + unsigned long largest_chunk; +}; + +static struct vmalloc_info get_vmalloc_info(void) +{ + unsigned long prev_end = VMALLOC_START; + struct vm_struct* vma; + struct vmalloc_info vmi; + vmi.used = 0; + + read_lock(&vmlist_lock); + + if(!vmlist) + vmi.largest_chunk = (VMALLOC_END-VMALLOC_START); + else + vmi.largest_chunk = 0; + + for (vma = vmlist; vma; vma = vma->next) { + unsigned long free_area_size = + (unsigned long)vma->addr - prev_end; + vmi.used += vma->size; + if (vmi.largest_chunk < free_area_size ) + vmi.largest_chunk = free_area_size; + prev_end = vma->size + (unsigned long)vma->addr; + } + if(VMALLOC_END-prev_end > vmi.largest_chunk) + vmi.largest_chunk = VMALLOC_END-prev_end; + + read_unlock(&vmlist_lock); + return vmi; +} + extern atomic_t vm_committed_space; static int meminfo_read_proc(char *page, char **start, off_t off, @@ -141,6 +176,8 @@ static int meminfo_read_proc(char *page, unsigned long inactive; unsigned long active; unsigned long free; + unsigned long vmtot; + struct vmalloc_info vmi; get_page_state(&ps); get_zone_counts(&active, &inactive, &free); @@ -153,6 +190,11 @@ static int meminfo_read_proc(char *page, si_swapinfo(&i); committed = atomic_read(&vm_committed_space); + vmtot = (VMALLOC_END-VMALLOC_START)>>10; + vmi = get_vmalloc_info(); + vmi.used >>= 10; + vmi.largest_chunk >>= 10; + /* * Tagged format, for easy grepping and expansion. 
*/ @@ -176,7 +218,10 @@ static int meminfo_read_proc(char *page, "Slab: %8lu kB\n" "Committed_AS: %8u kB\n" "PageTables: %8lu kB\n" - "ReverseMaps: %8lu\n", + "ReverseMaps: %8lu\n" + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n", K(i.totalram), K(i.freeram), K(i.bufferram), @@ -196,7 +241,10 @@ static int meminfo_read_proc(char *page, K(ps.nr_slab), K(committed), K(ps.nr_page_table_pages), - ps.nr_reverse_maps + ps.nr_reverse_maps, + vmtot, + vmi.used, + vmi.largest_chunk ); len += hugetlb_report_meminfo(page + len); @@ -254,6 +302,9 @@ static struct file_operations proc_vmsta .release = seq_release, }; +extern int schedstats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data); + #ifdef CONFIG_PROC_HARDWARE static int hardware_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -298,6 +349,71 @@ static struct file_operations proc_modul }; #endif +#ifdef CONFIG_NUMA +#define K(x) ((x) << (PAGE_SHIFT - 10)) +static int show_meminfo_numa (struct seq_file *m, void *v) +{ + int *d = v; + int nid = *d; + struct sysinfo i; + si_meminfo_node(&i, nid); + seq_printf(m, "\n" + "Node %d MemTotal: %8lu kB\n" + "Node %d MemFree: %8lu kB\n" + "Node %d MemUsed: %8lu kB\n" + "Node %d HighTotal: %8lu kB\n" + "Node %d HighFree: %8lu kB\n" + "Node %d LowTotal: %8lu kB\n" + "Node %d LowFree: %8lu kB\n", + nid, K(i.totalram), + nid, K(i.freeram), + nid, K(i.totalram-i.freeram), + nid, K(i.totalhigh), + nid, K(i.freehigh), + nid, K(i.totalram-i.totalhigh), + nid, K(i.freeram-i.freehigh)); + + return 0; +} +#undef K + +extern struct seq_operations meminfo_numa_op; +static int meminfo_numa_open(struct inode *inode, struct file *file) +{ + return seq_open(file,&meminfo_numa_op); +} + +static struct file_operations proc_meminfo_numa_operations = { + open: meminfo_numa_open, + read: seq_read, + llseek: seq_lseek, + release: seq_release, +}; + +static void *meminfo_numa_start(struct seq_file *m, 
loff_t *pos) +{ + return *pos < numnodes ? pos : NULL; +} + +static void *meminfo_numa_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return meminfo_numa_start(m, pos); +} + +static void meminfo_numa_stop(struct seq_file *m, void *v) +{ +} + +struct seq_operations meminfo_numa_op = { + .start = meminfo_numa_start, + .next = meminfo_numa_next, + .stop = meminfo_numa_stop, + .show = show_meminfo_numa, +}; + +#endif + extern struct seq_operations slabinfo_op; extern ssize_t slabinfo_write(struct file *, const char *, size_t, loff_t *); static int slabinfo_open(struct inode *inode, struct file *file) @@ -573,6 +689,7 @@ void __init proc_misc_init(void) {"locks", locks_read_proc}, {"iomem", memory_read_proc}, {"execdomains", execdomains_read_proc}, + {"schedstat", schedstats_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) @@ -594,6 +711,9 @@ void __init proc_misc_init(void) create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations); #ifdef CONFIG_MODULES create_seq_entry("modules", 0, &proc_modules_operations); +#endif +#ifdef CONFIG_NUMA + create_seq_entry("meminfo.numa",0,&proc_meminfo_numa_operations); #endif proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { diff -urpN -X /home/fletch/.diff.exclude 00-virgin/fs/proc/task_mmu.c 90-mjb/fs/proc/task_mmu.c --- 00-virgin/fs/proc/task_mmu.c Fri Jan 17 09:18:30 2003 +++ 90-mjb/fs/proc/task_mmu.c Sun Feb 2 13:19:32 2003 @@ -2,6 +2,7 @@ #include #include #include +#include char *task_mem(struct mm_struct *mm, char *buffer) { @@ -111,7 +112,44 @@ int task_statm(struct mm_struct *mm, int #define MAPS_LINE_FORMAT (sizeof(void*) == 4 ? MAPS_LINE_FORMAT4 : MAPS_LINE_FORMAT8) #define MAPS_LINE_MAX (sizeof(void*) == 4 ? 
MAPS_LINE_MAX4 : MAPS_LINE_MAX8) -static int proc_pid_maps_get_line (char *buf, struct vm_area_struct *map) +extern pte_t __follow_page(struct mm_struct *mm, unsigned long address); +static int print_vma_nodepages(char* buf, struct mm_struct *mm, struct vm_area_struct *map) +{ + int retval = 0; + unsigned long vaddr = map->vm_start; + unsigned long vm_end = map->vm_end; + pte_t pte; + unsigned long pfn; + int pages_per_node[MAX_NR_NODES]; + int i; + + if (numnodes<=1) + goto out; + + for (i=0;ipage_table_lock); + pte = __follow_page(mm, vaddr); + spin_unlock(&mm->page_table_lock); + pfn = pte_pfn(pte); + if (pfn) /* don't count the zero page */ + pages_per_node[pfn_to_nid(pfn)]++; + } + retval += sprintf(&buf[retval]," #"); + for (i=0; ivm_file->f_dentry, map->vm_file->f_vfsmnt, buf, PAGE_SIZE); - buf[PAGE_SIZE-1] = '\n'; + //buf[PAGE_SIZE-1] = '\n'; line -= MAPS_LINE_MAX; if(line < buf) line = buf; @@ -149,15 +187,15 @@ static int proc_pid_maps_get_line (char MAPS_LINE_FORMAT, map->vm_start, map->vm_end, str, map->vm_pgoff << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino); - if(map->vm_file) { int i; for(i = len; i < MAPS_LINE_MAX; i++) line[i] = ' '; len = buf + PAGE_SIZE - line; memmove(buf, line, len); - } else - line[len++] = '\n'; + } + //else + // line[len++] = '\n'; return len; } @@ -207,7 +245,8 @@ ssize_t proc_pid_read_maps(struct task_s off -= PAGE_SIZE; goto next; } - len = proc_pid_maps_get_line(tmp, map); + len = proc_pid_maps_get_line(tmp, mm, map); + len += print_vma_nodepages(&tmp[len], mm, map); len -= off; if (len > 0) { if (retval+len > count) { diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/bug.h 90-mjb/include/asm-i386/bug.h --- 00-virgin/include/asm-i386/bug.h Tue Jan 14 10:06:18 2003 +++ 90-mjb/include/asm-i386/bug.h Sat Feb 1 22:09:06 2003 @@ -10,6 +10,11 @@ * undefined" opcode for parsing in the trap handler. 
*/ +#ifdef CONFIG_X86_REMOTE_DEBUG +#define BUG() do { \ + asm ("int $0x3"); \ +} while (0) +#else #if 1 /* Set to zero for a slightly smaller kernel */ #define BUG() \ __asm__ __volatile__( "ud2\n" \ @@ -18,6 +23,7 @@ : : "i" (__LINE__), "i" (__FILE__)) #else #define BUG() __asm__ __volatile__("ud2\n") +#endif #endif #define PAGE_BUG(page) do { \ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/early_printk.h 90-mjb/include/asm-i386/early_printk.h --- 00-virgin/include/asm-i386/early_printk.h Wed Dec 31 16:00:00 1969 +++ 90-mjb/include/asm-i386/early_printk.h Sat Feb 1 22:00:06 2003 @@ -0,0 +1,8 @@ +#ifndef __EARLY_PRINTK_H_I386_ +#define __EARLY_PRINTK_H_I386_ + +#define VGABASE 0xB8000 +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/fixmap.h 90-mjb/include/asm-i386/fixmap.h --- 00-virgin/include/asm-i386/fixmap.h Mon Dec 23 23:01:56 2002 +++ 90-mjb/include/asm-i386/fixmap.h Sun Feb 2 13:19:14 2003 @@ -60,7 +60,7 @@ enum fixed_addresses { #ifdef CONFIG_X86_F00F_BUG FIX_F00F_IDT, /* Virtual mapping for IDT */ #endif -#ifdef CONFIG_X86_CYCLONE +#ifdef CONFIG_X86_SUMMIT FIX_CYCLONE_TIMER, /*cyclone timer register*/ #endif #ifdef CONFIG_HIGHMEM diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/ioctls.h 90-mjb/include/asm-i386/ioctls.h --- 00-virgin/include/asm-i386/ioctls.h Sun Nov 17 20:29:22 2002 +++ 90-mjb/include/asm-i386/ioctls.h Sat Feb 1 22:09:06 2003 @@ -68,6 +68,7 @@ #define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ #define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ #define FIOQSIZE 0x5460 +#define TIOCGDB 0x547F /* enable GDB stub mode on this tty */ /* Used for packet mode */ #define TIOCPKT_DATA 0 diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/mach-bigsmp/mach_apic.h 90-mjb/include/asm-i386/mach-bigsmp/mach_apic.h --- 00-virgin/include/asm-i386/mach-bigsmp/mach_apic.h Fri Jan 17 
09:18:31 2003 +++ 90-mjb/include/asm-i386/mach-bigsmp/mach_apic.h Sat Feb 1 22:04:51 2003 @@ -87,7 +87,8 @@ static inline int cpu_to_logical_apicid( return (int)cpu_2_logical_apicid[cpu]; } -static inline int mpc_apic_id(struct mpc_config_processor *m, int quad) +static inline int mpc_apic_id(struct mpc_config_processor *m, + struct mpc_config_translation *translation_record) { printk("Processor #%d %ld:%ld APIC version %d\n", m->mpc_apicid, diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/mach-default/mach_apic.h 90-mjb/include/asm-i386/mach-default/mach_apic.h --- 00-virgin/include/asm-i386/mach-default/mach_apic.h Fri Jan 17 09:18:31 2003 +++ 90-mjb/include/asm-i386/mach-default/mach_apic.h Sat Feb 1 22:04:51 2003 @@ -79,7 +79,8 @@ static inline unsigned long apicid_to_cp return (1ul << phys_apicid); } -static inline int mpc_apic_id(struct mpc_config_processor *m, int quad) +static inline int mpc_apic_id(struct mpc_config_processor *m, + struct mpc_config_translation *translation_record) { printk("Processor #%d %ld:%ld APIC version %d\n", m->mpc_apicid, diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/mach-numaq/mach_apic.h 90-mjb/include/asm-i386/mach-numaq/mach_apic.h --- 00-virgin/include/asm-i386/mach-numaq/mach_apic.h Fri Jan 17 09:18:31 2003 +++ 90-mjb/include/asm-i386/mach-numaq/mach_apic.h Sat Feb 1 22:04:51 2003 @@ -73,8 +73,10 @@ static inline unsigned long apicid_to_cp return ( (logical_apicid&0xf) << (4*apicid_to_node(logical_apicid)) ); } -static inline int mpc_apic_id(struct mpc_config_processor *m, int quad) +static inline int mpc_apic_id(struct mpc_config_processor *m, + struct mpc_config_translation *translation_record) { + int quad = translation_record->trans_quad; int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid); printk("Processor #%d %ld:%ld APIC version %d (quad %d, apic %d)\n", diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/mach-summit/mach_apic.h 
90-mjb/include/asm-i386/mach-summit/mach_apic.h --- 00-virgin/include/asm-i386/mach-summit/mach_apic.h Fri Jan 17 09:18:31 2003 +++ 90-mjb/include/asm-i386/mach-summit/mach_apic.h Sun Feb 2 13:19:13 2003 @@ -3,7 +3,7 @@ extern int x86_summit; -#define esr_disable (1) +#define esr_disable (x86_summit ? 1 : 0) #define no_balance_irq (0) #define XAPIC_DEST_CPUS_MASK 0x0Fu @@ -15,14 +15,14 @@ extern int x86_summit; #define APIC_DFR_VALUE (x86_summit ? APIC_DFR_CLUSTER : APIC_DFR_FLAT) #define TARGET_CPUS (x86_summit ? XAPIC_DEST_CPUS_MASK : cpu_online_map) -#define INT_DELIVERY_MODE dest_Fixed +#define INT_DELIVERY_MODE (x86_summit ? dest_Fixed : dest_LowestPrio) #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ #define APIC_BROADCAST_ID (x86_summit ? 0xFF : 0x0F) -#define check_apicid_used(bitmap, apicid) (0) +#define check_apicid_used(bitmap, apicid) (x86_summit ? 0 : (bitmap & (1 << apicid))) /* we don't use the phys_cpu_present_map to indicate apicid presence */ -#define check_apicid_present(bit) (1) +#define check_apicid_present(bit) (x86_summit ? 
1 : (phys_cpu_present_map & (1 << bit))) extern u8 bios_cpu_apicid[]; @@ -90,7 +90,8 @@ static inline unsigned long apicid_to_cp return (1ul << apicid); } -static inline int mpc_apic_id(struct mpc_config_processor *m, int quad) +static inline int mpc_apic_id(struct mpc_config_processor *m, + struct mpc_config_translation *translation_record) { printk("Processor #%d %ld:%ld APIC version %d\n", m->mpc_apicid, @@ -106,7 +107,10 @@ static inline void setup_portio_remap(vo static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) { - return (1); + if (x86_summit) + return (1); + else + return test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map); } #endif /* __ASM_MACH_APIC_H */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/mach-summit/mach_mpparse.h 90-mjb/include/asm-i386/mach-summit/mach_mpparse.h --- 00-virgin/include/asm-i386/mach-summit/mach_mpparse.h Fri Jan 17 09:18:31 2003 +++ 90-mjb/include/asm-i386/mach-summit/mach_mpparse.h Sun Feb 2 13:19:14 2003 @@ -1,6 +1,8 @@ #ifndef __ASM_MACH_MPPARSE_H #define __ASM_MACH_MPPARSE_H +extern int use_cyclone; + static inline void mpc_oem_bus_info(struct mpc_config_bus *m, char *name, struct mpc_config_translation *translation) { @@ -17,14 +19,18 @@ static inline void mps_oem_check(struct { if (!strncmp(oem, "IBM ENSW", 8) && (!strncmp(productid, "VIGIL SMP", 9) - || !strncmp(productid, "RUTHLESS SMP", 12))) + || !strncmp(productid, "RUTHLESS SMP", 12))){ x86_summit = 1; + use_cyclone = 1; /*enable cyclone-timer*/ + } } /* Hook from generic ACPI tables.c */ static inline void acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "SERVIGIL", 8)) + if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "SERVIGIL", 8)){ x86_summit = 1; + use_cyclone = 1; /*enable cyclone-timer*/ + } } #endif /* __ASM_MACH_MPPARSE_H */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/mmzone.h 
90-mjb/include/asm-i386/mmzone.h --- 00-virgin/include/asm-i386/mmzone.h Sun Nov 17 20:29:46 2002 +++ 90-mjb/include/asm-i386/mmzone.h Sun Feb 2 13:19:32 2003 @@ -8,12 +8,17 @@ #include -#ifdef CONFIG_DISCONTIGMEM +#ifndef CONFIG_DISCONTIGMEM + +#define pfn_to_nid(pfn) (0) + +#else #ifdef CONFIG_X86_NUMAQ #include +#elif CONFIG_X86_SUMMIT +#include #else -#define pfn_to_nid(pfn) (0) #endif /* CONFIG_X86_NUMAQ */ extern struct pglist_data *node_data[]; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/msr.h 90-mjb/include/asm-i386/msr.h --- 00-virgin/include/asm-i386/msr.h Mon Dec 23 23:01:56 2002 +++ 90-mjb/include/asm-i386/msr.h Sun Feb 2 13:19:27 2003 @@ -93,6 +93,90 @@ #define MSR_IA32_MC0_ADDR 0x402 #define MSR_IA32_MC0_MISC 0x403 +/* Pentium IV performance counter MSRs */ +#define MSR_P4_BPU_PERFCTR0 0x300 +#define MSR_P4_BPU_PERFCTR1 0x301 +#define MSR_P4_BPU_PERFCTR2 0x302 +#define MSR_P4_BPU_PERFCTR3 0x303 +#define MSR_P4_MS_PERFCTR0 0x304 +#define MSR_P4_MS_PERFCTR1 0x305 +#define MSR_P4_MS_PERFCTR2 0x306 +#define MSR_P4_MS_PERFCTR3 0x307 +#define MSR_P4_FLAME_PERFCTR0 0x308 +#define MSR_P4_FLAME_PERFCTR1 0x309 +#define MSR_P4_FLAME_PERFCTR2 0x30a +#define MSR_P4_FLAME_PERFCTR3 0x30b +#define MSR_P4_IQ_PERFCTR0 0x30c +#define MSR_P4_IQ_PERFCTR1 0x30d +#define MSR_P4_IQ_PERFCTR2 0x30e +#define MSR_P4_IQ_PERFCTR3 0x30f +#define MSR_P4_IQ_PERFCTR4 0x310 +#define MSR_P4_IQ_PERFCTR5 0x311 +#define MSR_P4_BPU_CCCR0 0x360 +#define MSR_P4_BPU_CCCR1 0x361 +#define MSR_P4_BPU_CCCR2 0x362 +#define MSR_P4_BPU_CCCR3 0x363 +#define MSR_P4_MS_CCCR0 0x364 +#define MSR_P4_MS_CCCR1 0x365 +#define MSR_P4_MS_CCCR2 0x366 +#define MSR_P4_MS_CCCR3 0x367 +#define MSR_P4_FLAME_CCCR0 0x368 +#define MSR_P4_FLAME_CCCR1 0x369 +#define MSR_P4_FLAME_CCCR2 0x36a +#define MSR_P4_FLAME_CCCR3 0x36b +#define MSR_P4_IQ_CCCR0 0x36c +#define MSR_P4_IQ_CCCR1 0x36d +#define MSR_P4_IQ_CCCR2 0x36e +#define MSR_P4_IQ_CCCR3 0x36f +#define MSR_P4_IQ_CCCR4 0x370 +#define 
MSR_P4_IQ_CCCR5 0x371 +#define MSR_P4_ALF_ESCR0 0x3ca +#define MSR_P4_ALF_ESCR1 0x3cb +#define MSR_P4_BPU_ESCR0 0x3b2 +#define MSR_P4_BPU_ESCR1 0x3b3 +#define MSR_P4_BSU_ESCR0 0x3a0 +#define MSR_P4_BSU_ESCR1 0x3a1 +#define MSR_P4_CRU_ESCR0 0x3b8 +#define MSR_P4_CRU_ESCR1 0x3b9 +#define MSR_P4_CRU_ESCR2 0x3cc +#define MSR_P4_CRU_ESCR3 0x3cd +#define MSR_P4_CRU_ESCR4 0x3e0 +#define MSR_P4_CRU_ESCR5 0x3e1 +#define MSR_P4_DAC_ESCR0 0x3a8 +#define MSR_P4_DAC_ESCR1 0x3a9 +#define MSR_P4_FIRM_ESCR0 0x3a4 +#define MSR_P4_FIRM_ESCR1 0x3a5 +#define MSR_P4_FLAME_ESCR0 0x3a6 +#define MSR_P4_FLAME_ESCR1 0x3a7 +#define MSR_P4_FSB_ESCR0 0x3a2 +#define MSR_P4_FSB_ESCR1 0x3a3 +#define MSR_P4_IQ_ESCR0 0x3ba +#define MSR_P4_IQ_ESCR1 0x3bb +#define MSR_P4_IS_ESCR0 0x3b4 +#define MSR_P4_IS_ESCR1 0x3b5 +#define MSR_P4_ITLB_ESCR0 0x3b6 +#define MSR_P4_ITLB_ESCR1 0x3b7 +#define MSR_P4_IX_ESCR0 0x3c8 +#define MSR_P4_IX_ESCR1 0x3c9 +#define MSR_P4_MOB_ESCR0 0x3aa +#define MSR_P4_MOB_ESCR1 0x3ab +#define MSR_P4_MS_ESCR0 0x3c0 +#define MSR_P4_MS_ESCR1 0x3c1 +#define MSR_P4_PMH_ESCR0 0x3ac +#define MSR_P4_PMH_ESCR1 0x3ad +#define MSR_P4_RAT_ESCR0 0x3bc +#define MSR_P4_RAT_ESCR1 0x3bd +#define MSR_P4_SAAT_ESCR0 0x3ae +#define MSR_P4_SAAT_ESCR1 0x3af +#define MSR_P4_SSU_ESCR0 0x3be +#define MSR_P4_SSU_ESCR1 0x3bf /* guess: not defined in manual */ +#define MSR_P4_TBPU_ESCR0 0x3c2 +#define MSR_P4_TBPU_ESCR1 0x3c3 +#define MSR_P4_TC_ESCR0 0x3c4 +#define MSR_P4_TC_ESCR1 0x3c5 +#define MSR_P4_U2L_ESCR0 0x3b0 +#define MSR_P4_U2L_ESCR1 0x3b1 + /* AMD Defined MSRs */ #define MSR_K6_EFER 0xC0000080 #define MSR_K6_STAR 0xC0000081 diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/numaq.h 90-mjb/include/asm-i386/numaq.h --- 00-virgin/include/asm-i386/numaq.h Thu Jan 9 19:16:11 2003 +++ 90-mjb/include/asm-i386/numaq.h Sun Feb 2 13:19:31 2003 @@ -36,10 +36,11 @@ #define MAX_ELEMENTS 256 #define PAGES_PER_ELEMENT (16777216/256) +extern int physnode_map[]; +#define pfn_to_nid(pfn) ({ 
physnode_map[(pfn) / PAGES_PER_ELEMENT]; }) #define pfn_to_pgdat(pfn) NODE_DATA(pfn_to_nid(pfn)) #define PHYSADDR_TO_NID(pa) pfn_to_nid(pa >> PAGE_SHIFT) #define MAX_NUMNODES 8 -extern int pfn_to_nid(unsigned long); extern void get_memcfg_numaq(void); #define get_memcfg_numa() get_memcfg_numaq() @@ -168,6 +169,10 @@ struct sys_cfg_data { struct eachquadmem eq[MAX_NUMNODES]; /* indexed by quad id */ }; +static inline unsigned long get_zholes_size(int nid) +{ + return 0; +} #endif /* CONFIG_X86_NUMAQ */ #endif /* NUMAQ_H */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/page.h 90-mjb/include/asm-i386/page.h --- 00-virgin/include/asm-i386/page.h Tue Jan 14 10:06:18 2003 +++ 90-mjb/include/asm-i386/page.h Sat Feb 1 22:11:38 2003 @@ -3,7 +3,11 @@ /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 -#define PAGE_SIZE (1UL << PAGE_SHIFT) +#ifndef __ASSEMBLY__ +#define PAGE_SIZE (1UL << PAGE_SHIFT) +#else +#define PAGE_SIZE (1 << PAGE_SHIFT) +#endif #define PAGE_MASK (~(PAGE_SIZE-1)) #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) @@ -89,7 +93,16 @@ typedef struct { unsigned long pgprot; } * and CONFIG_HIGHMEM64G options in the kernel configuration. */ -#define __PAGE_OFFSET (0xC0000000) +#include +#ifdef CONFIG_05GB +#define __PAGE_OFFSET (0xE0000000) +#elif defined(CONFIG_1GB) +#define __PAGE_OFFSET (0xC0000000) +#elif defined(CONFIG_2GB) +#define __PAGE_OFFSET (0x80000000) +#elif defined(CONFIG_3GB) +#define __PAGE_OFFSET (0x40000000) +#endif /* * This much address space is reserved for vmalloc() and iomap() diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/param.h 90-mjb/include/asm-i386/param.h --- 00-virgin/include/asm-i386/param.h Sun Nov 17 20:29:26 2002 +++ 90-mjb/include/asm-i386/param.h Sat Feb 1 22:04:43 2003 @@ -2,10 +2,18 @@ #define _ASMi386_PARAM_H #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ -# define USER_HZ 100 /* .. 
some user interfaces are in "ticks" */ -# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ +#include + +#ifdef CONFIG_1000HZ +# define HZ 1000 /* Internal kernel timer frequency */ +#else +# define HZ 100 #endif + +#define USER_HZ 100 /* .. some user interfaces are in "ticks" */ +#define CLOCKS_PER_SEC (USER_HZ) /* like times() */ + +#endif /* __KERNEL__ */ #ifndef HZ #define HZ 100 diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/pgalloc.h 90-mjb/include/asm-i386/pgalloc.h --- 00-virgin/include/asm-i386/pgalloc.h Sun Nov 17 20:29:21 2002 +++ 90-mjb/include/asm-i386/pgalloc.h Sun Feb 2 13:19:30 2003 @@ -20,11 +20,11 @@ static inline void pmd_populate(struct m * Allocate and free page tables. */ -extern pgd_t *pgd_alloc(struct mm_struct *); -extern void pgd_free(pgd_t *pgd); +pgd_t *pgd_alloc(struct mm_struct *); +void pgd_free(pgd_t *pgd); -extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); -extern struct page *pte_alloc_one(struct mm_struct *, unsigned long); +pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); +struct page *pte_alloc_one(struct mm_struct *, unsigned long); static inline void pte_free_kernel(pte_t *pte) { diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/pgtable-3level.h 90-mjb/include/asm-i386/pgtable-3level.h --- 00-virgin/include/asm-i386/pgtable-3level.h Sun Nov 17 20:29:52 2002 +++ 90-mjb/include/asm-i386/pgtable-3level.h Sun Feb 2 13:19:30 2003 @@ -106,6 +106,4 @@ static inline pmd_t pfn_pmd(unsigned lon return __pmd(((unsigned long long)page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); } -extern struct kmem_cache_s *pae_pgd_cachep; - #endif /* _I386_PGTABLE_3LEVEL_H */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/pgtable.h 90-mjb/include/asm-i386/pgtable.h --- 00-virgin/include/asm-i386/pgtable.h Sun Dec 1 10:00:23 2002 +++ 90-mjb/include/asm-i386/pgtable.h Sun Feb 2 13:19:30 2003 @@ -41,21 +41,12 @@ extern unsigned long empty_zero_page[102 #ifndef 
__ASSEMBLY__ #if CONFIG_X86_PAE # include - -/* - * Need to initialise the X86 PAE caches - */ -extern void pgtable_cache_init(void); - #else # include +#endif -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) +void pgtable_cache_init(void); -#endif #endif #define __beep() asm("movb $0x3,%al; outb %al,$0x61") diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/processor.h 90-mjb/include/asm-i386/processor.h --- 00-virgin/include/asm-i386/processor.h Thu Jan 2 22:05:15 2003 +++ 90-mjb/include/asm-i386/processor.h Sat Feb 1 22:09:06 2003 @@ -279,7 +279,11 @@ extern unsigned int mca_pentium_flag; /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ +#ifdef CONFIG_05GB +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 16)) +#else #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) +#endif /* * Size of io_bitmap in longwords: 32 is ports 0-0x3ff. @@ -393,6 +397,9 @@ struct thread_struct { unsigned int saved_fs, saved_gs; /* IO permissions */ unsigned long *ts_io_bitmap; +#ifdef CONFIG_X86_REMOTE_DEBUG + struct pt_regs *kgdbregs; +#endif }; #define INIT_THREAD { \ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/srat.h 90-mjb/include/asm-i386/srat.h --- 00-virgin/include/asm-i386/srat.h Wed Dec 31 16:00:00 1969 +++ 90-mjb/include/asm-i386/srat.h Sat Feb 1 22:21:15 2003 @@ -0,0 +1,68 @@ +/* + * Code taken from 64 bit discontigmem support. + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to + */ + +#ifndef SRAT_DISCTG_H +#define SRAT_DISCTG_H + +extern int pa_to_nid(u64); +extern int pfn_to_nid(unsigned long); +extern void get_memcfg_from_srat(void); +extern unsigned long get_zholes_size(int); + +#define PHYSADDR_TO_NID(pa) pa_to_nid(pa) +#define pfn_to_pgdat(pfn) NODE_DATA(pfn_to_nid(pfn)) +#define get_memcfg_numa() get_memcfg_from_srat() + +#define MAX_NUMNODES 8 +#define MAX_CLUMPS_PER_NODE 4 +#define MAXCLUMPS (MAX_CLUMPS_PER_NODE * MAX_NUMNODES) + +/* + * cpu -> pxm_domain structure + */ +struct node_cpuid_s{ + u8 phys_id; /* phys apic ID (no EID for IA32) */ + u8 pxm; // proximity domain of cpu + u8 nid; +}; + +extern struct node_cpuid_s node_cpuid[]; + +#define _cpu_to_node(cpu) (node_cpuid[cpu].nid) + +/* + * memory -> pxm_domain structure + */ +struct node_memory_chunk_s { + u64 start_paddr; + u64 end_paddr; + u64 size; + u8 pxm; // proximity domain of node + u8 nid; // which cnode contains this chunk? 
+ u8 bank; // which mem bank on this node +}; +extern struct node_memory_chunk_s node_memory_chunk[]; + +#endif /* SRAT_DISCTG_H */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/thread_info.h 90-mjb/include/asm-i386/thread_info.h --- 00-virgin/include/asm-i386/thread_info.h Thu Jan 9 19:16:11 2003 +++ 90-mjb/include/asm-i386/thread_info.h Sat Feb 1 22:18:27 2003 @@ -9,6 +9,7 @@ #ifdef __KERNEL__ +#include #ifndef __ASSEMBLY__ #include #endif @@ -29,9 +30,11 @@ struct thread_info { __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: + 0 for interrupts: illegal 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ + struct thread_info *irq_stack; /* pointer to cpu irq stack */ struct restart_block restart_block; __u8 supervisor_stack[0]; @@ -46,7 +49,8 @@ struct thread_info { #define TI_CPU 0x0000000C #define TI_PRE_COUNT 0x00000010 #define TI_ADDR_LIMIT 0x00000014 -#define TI_RESTART_BLOCK 0x0000018 +#define TI_IRQ_STACK 0x00000018 +#define TI_RESTART_BLOCK 0x0000001C /* follows the 4-byte irq_stack pointer at 0x18 */ #endif @@ -57,48 +61,66 @@ struct thread_info { * * preempt_count needs to be 1 initially, until the scheduler is functional. 
*/ +#ifdef CONFIG_4K_STACK +#define THREAD_ORDER 0 +#define STACK_WARN 0x200 +#define STACK_PANIC 0x100 +#else +#define THREAD_ORDER 1 +#define STACK_WARN ((THREAD_SIZE)>>1) +#define STACK_PANIC 0x100 +#endif +#define INIT_THREAD_SIZE THREAD_SIZE + #ifndef __ASSEMBLY__ -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .irq_stack = &init_irq_union.thread_info, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + } \ } #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) +/* thread information allocation */ +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define alloc_thread_info() ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER)) +#define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER) +#define get_thread_info(ti) get_task_struct((ti)->task) +#define put_thread_info(ti) put_task_struct((ti)->task) + /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~8191UL)); + __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1))); return ti; } -/* thread information allocation */ -#define THREAD_SIZE (2*PAGE_SIZE) -#define alloc_thread_info() ((struct thread_info *) __get_free_pages(GFP_KERNEL,1)) -#define free_thread_info(ti) free_pages((unsigned long) (ti), 1) -#define get_thread_info(ti) get_task_struct((ti)->task) -#define put_thread_info(ti) put_task_struct((ti)->task) - #else /* !__ASSEMBLY__ */ +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) + /* how to get the thread 
information struct from ASM */ #define GET_THREAD_INFO(reg) \ - movl $-8192, reg; \ + movl $-THREAD_SIZE, reg; \ andl %esp, reg -#endif +/* use this one if reg already contains %esp */ +#define GET_THREAD_INFO_WITH_ESP(reg) \ + andl $-THREAD_SIZE, reg +#endif + /* * thread information flags * - these are process state flags that various assembly files may need to access diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-x86_64/early_printk.h 90-mjb/include/asm-x86_64/early_printk.h --- 00-virgin/include/asm-x86_64/early_printk.h Wed Dec 31 16:00:00 1969 +++ 90-mjb/include/asm-x86_64/early_printk.h Sat Feb 1 22:00:06 2003 @@ -0,0 +1,8 @@ +#ifndef __EARLY_PRINTK_H_X86_64_ +#define __EARLY_PRINTK_H_X86_64_ + +#define VGABASE 0xffffffff800b8000UL +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif /* __EARLY_PRINTK_H_X86_64_ */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/acpi.h 90-mjb/include/linux/acpi.h --- 00-virgin/include/linux/acpi.h Mon Jan 13 21:09:14 2003 +++ 90-mjb/include/linux/acpi.h Sat Feb 1 22:21:15 2003 @@ -82,7 +82,7 @@ typedef struct { struct acpi_table_rsdt { struct acpi_table_header header; - u32 entry[1]; + u32 entry[8]; } __attribute__ ((packed)); /* Extended System Description Table (XSDT) */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/dcache.h 90-mjb/include/linux/dcache.h --- 00-virgin/include/linux/dcache.h Tue Jan 14 10:06:18 2003 +++ 90-mjb/include/linux/dcache.h Sat Feb 1 21:58:05 2003 @@ -7,6 +7,7 @@ #include #include #include +#include #include struct vfsmount; @@ -72,11 +73,13 @@ struct dcookie_struct; struct dentry { atomic_t d_count; + unsigned long d_vfs_flags; /* moved here to be on same cacheline */ + spinlock_t d_lock; /* per dentry lock */ unsigned int d_flags; struct inode * d_inode; /* Where the name belongs to - NULL is negative */ struct dentry * d_parent; /* parent directory */ struct list_head d_hash; /* lookup hash list */ - struct list_head d_lru; /* d_count = 0 LRU list */ + 
struct list_head d_lru; /* LRU list */ struct list_head d_child; /* child of parent list */ struct list_head d_subdirs; /* our children */ struct list_head d_alias; /* inode alias list */ @@ -85,8 +88,8 @@ struct dentry { unsigned long d_time; /* used by d_revalidate */ struct dentry_operations *d_op; struct super_block * d_sb; /* The root of the dentry tree */ - unsigned long d_vfs_flags; void * d_fsdata; /* fs-specific data */ + struct rcu_head d_rcu; struct dcookie_struct * d_cookie; /* cookie, if any */ unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */ } ____cacheline_aligned; @@ -139,6 +142,7 @@ d_iput: no no yes */ #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ +#define DCACHE_UNHASHED 0x0010 extern spinlock_t dcache_lock; extern rwlock_t dparent_lock; @@ -162,7 +166,8 @@ extern rwlock_t dparent_lock; static __inline__ void __d_drop(struct dentry * dentry) { - list_del_init(&dentry->d_hash); + dentry->d_vfs_flags |= DCACHE_UNHASHED; + list_del_rcu(&dentry->d_hash); } static __inline__ void d_drop(struct dentry * dentry) @@ -254,9 +259,8 @@ extern char * d_path(struct dentry *, st static __inline__ struct dentry * dget(struct dentry *dentry) { if (dentry) { - if (!atomic_read(&dentry->d_count)) - BUG(); atomic_inc(&dentry->d_count); + dentry->d_vfs_flags |= DCACHE_REFERENCED; } return dentry; } @@ -272,7 +276,7 @@ extern struct dentry * dget_locked(struc static __inline__ int d_unhashed(struct dentry *dentry) { - return list_empty(&dentry->d_hash); + return (dentry->d_vfs_flags & DCACHE_UNHASHED); } extern void dput(struct dentry *); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/early_printk.h 90-mjb/include/linux/early_printk.h --- 00-virgin/include/linux/early_printk.h Wed Dec 31 16:00:00 1969 +++ 90-mjb/include/linux/early_printk.h Sat Feb 1 22:00:06 2003 @@ -0,0 +1,47 @@ +#ifndef __EARLY_PRINTK_H_ +#define __EARLY_PRINTK_H_ + +#ifdef CONFIG_EARLY_PRINTK +#include +#include +#include +#include +#include 
+#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +/* Simple serial port output */ + +#define DEFAULT_BAUD 57600 +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + + +void early_printk(const char *fmt, ...); +int __init setup_early_printk(char *opt); + +#else + +#define early_printk(...) do {} while(0) +#define setup_early_printk(X) do {} while(0) + +#endif + +#endif diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/frlock.h 90-mjb/include/linux/frlock.h --- 00-virgin/include/linux/frlock.h Wed Dec 31 16:00:00 1969 +++ 90-mjb/include/linux/frlock.h Sun Feb 2 13:19:24 2003 @@ -0,0 +1,100 @@ +#ifndef __LINUX_FRLOCK_H +#define __LINUX_FRLOCK_H + +/* + * Fast read-write spinlocks. + * + * Fast reader/writer locks without starving writers. This type of + * lock is for data where the reader wants a consistent set of information + * and is willing to retry if the information changes. Readers never + * block but they may have to retry if a writer is in + * progress. Writers do not wait for readers. + * + * Generalization on sequence variables used for gettimeofday on x86-64 + * by Andrea Arcangeli + * + * This is not as cache friendly as brlock. Also, this will not work + * for data that contains pointers, because any writer could + * invalidate a pointer that a reader was following. + * + * + * Expected reader usage: + * do { + * seq = fr_read_begin(); + * ... 
+ * } while (seq != fr_read_end()); + * + * On non-SMP the spin locks disappear but the writer still needs + * to increment the sequence variables because an interrupt routine could + * change the state of the data. + */ + +#include +#include + +typedef struct { + spinlock_t lock; + unsigned pre_sequence; + unsigned post_sequence; +} frlock_t; + +#define FR_LOCK_UNLOCKED { SPIN_LOCK_UNLOCKED, 0, 0 } +#define frlock_init(x) do { *(x) = FR_LOCK_UNLOCKED; } while (0) + +static inline void fr_write_lock(frlock_t *rw) +{ + spin_lock(&rw->lock); + rw->pre_sequence++; + wmb(); +} + +static inline void fr_write_unlock(frlock_t *rw) +{ + wmb(); + rw->post_sequence++; + spin_unlock(&rw->lock); +} + +static inline int fr_write_trylock(frlock_t *rw) +{ + int ret = spin_trylock(&rw->lock); + + if (ret) { + ++rw->pre_sequence; + wmb(); + } + return ret; +} + +static inline unsigned fr_read_begin(frlock_t *rw) +{ + unsigned ret = rw->post_sequence; + rmb(); + return ret; + +} + +static inline unsigned fr_read_end(frlock_t *rw) +{ + rmb(); + return rw->pre_sequence; +} + +/* + * Possible sw/hw IRQ protected versions of the interfaces. 
+ */ +#define fr_write_lock_irqsave(lock, flags) \ + do { local_irq_save(flags); fr_write_lock(lock); } while (0) +#define fr_write_lock_irq(lock) \ + do { local_irq_disable(); fr_write_lock(lock); } while (0) +#define fr_write_lock_bh(lock) \ + do { local_bh_disable(); fr_write_lock(lock); } while (0) + +#define fr_write_unlock_irqrestore(lock, flags) \ + do { fr_write_unlock(lock); local_irq_restore(flags); } while(0) +#define fr_write_unlock_irq(lock) \ + do { fr_write_unlock(lock); local_irq_enable(); } while(0) +#define fr_write_unlock_bh(lock) \ + do { fr_write_unlock(lock); local_bh_enable(); } while(0) + +#endif /* __LINUX_FRLOCK_H */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/gdb.h 90-mjb/include/linux/gdb.h --- 00-virgin/include/linux/gdb.h Wed Dec 31 16:00:00 1969 +++ 90-mjb/include/linux/gdb.h Sat Feb 1 22:09:06 2003 @@ -0,0 +1,67 @@ +#ifndef _GDB_H_ +#define _GDB_H_ + +/* + * Copyright (C) 2001 Amit S. Kale + */ + +/* gdb locks */ +#define KGDB_MAX_NO_CPUS NR_CPUS + +extern int gdb_enter; /* 1 = enter debugger on boot */ +extern int gdb_ttyS; +extern int gdb_baud; +extern int gdb_initialized; + +extern int gdb_hook(void); +extern void breakpoint(void); + +typedef int gdb_debug_hook(int trapno, + int signo, + int err_code, + struct pt_regs *regs); +extern gdb_debug_hook *linux_debug_hook; + +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +extern spinlock_t kgdb_nmispinlock; +#else +extern unsigned kgdb_spinlock; +extern unsigned kgdb_nmispinlock; +#endif + +extern volatile int kgdb_memerr_expected; + +struct console; +void gdb_console_write(struct console *co, const char *s, + unsigned count); +void gdb_console_init(void); + +extern volatile int procindebug[KGDB_MAX_NO_CPUS]; + +#define KGDB_ASSERT(message, condition) do { \ + if (!(condition)) { \ + printk("kgdb assertion failed: %s\n", message); \ + asm ("int $0x3"); \ + } \ +} while (0) + +#ifdef CONFIG_KERNEL_ASSERTS +#define KERNEL_ASSERT(message, condition) 
KGDB_ASSERT(message, condition) +#else +#define KERNEL_ASSERT(message, condition) +#endif + +#define KA_VALID_ERRNO(errno) ((errno) > 0 && (errno) <= EMEDIUMTYPE) + +#define KA_VALID_PTR_ERR(ptr) KA_VALID_ERRNO(-PTR_ERR(ptr)) + +#define KA_VALID_KPTR(ptr) (!(ptr) || \ + ((void *)(ptr) >= (void *)PAGE_OFFSET && \ + (void *)(ptr) < ERR_PTR(-EMEDIUMTYPE))) + +#define KA_VALID_PTRORERR(errptr) (KA_VALID_KPTR(errptr) || KA_VALID_PTR_ERR(errptr)) + +#define KA_HELD_GKL() (current->lock_depth >= 0) + +#endif /* _GDB_H_ */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/namei.h 90-mjb/include/linux/namei.h --- 00-virgin/include/linux/namei.h Sun Nov 17 20:29:30 2002 +++ 90-mjb/include/linux/namei.h Sat Feb 1 21:58:03 2003 @@ -11,8 +11,6 @@ struct nameidata { struct qstr last; unsigned int flags; int last_type; - struct dentry *old_dentry; - struct vfsmount *old_mnt; }; /* diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/oprofile.h 90-mjb/include/linux/oprofile.h --- 00-virgin/include/linux/oprofile.h Mon Dec 23 23:01:57 2002 +++ 90-mjb/include/linux/oprofile.h Sun Feb 2 13:19:27 2003 @@ -21,12 +21,22 @@ struct super_block; struct dentry; struct file_operations; +/* This is duplicated from user-space so + * must be kept in sync :( + */ enum oprofile_cpu { OPROFILE_CPU_PPRO, OPROFILE_CPU_PII, OPROFILE_CPU_PIII, OPROFILE_CPU_ATHLON, - OPROFILE_CPU_TIMER + OPROFILE_CPU_TIMER, + OPROFILE_UNUSED1, /* 2.4's RTC mode */ + OPROFILE_CPU_P4, + OPROFILE_CPU_IA64, + OPROFILE_CPU_IA64_1, + OPROFILE_CPU_IA64_2, + OPROFILE_CPU_HAMMER, + OPROFILE_CPU_P4_HT2 }; /* Operations structure to be filled in */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/sched.h 90-mjb/include/linux/sched.h --- 00-virgin/include/linux/sched.h Fri Jan 17 09:18:32 2003 +++ 90-mjb/include/linux/sched.h Sat Feb 1 22:09:06 2003 @@ -166,7 +166,9 @@ extern unsigned long cache_decay_ticks; #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long 
FASTCALL(schedule_timeout(signed long timeout)); -asmlinkage void schedule(void); +asmlinkage void do_schedule(void); +asmlinkage void kern_schedule(void); +asmlinkage void kern_do_schedule(struct pt_regs); struct namespace; @@ -648,6 +650,12 @@ static inline int thread_group_empty(tas (thread_group_leader(p) && !thread_group_empty(p)) extern void unhash_process(struct task_struct *p); + +#ifdef CONFIG_KGDB_THREAD +#define schedule() kern_schedule() +#else +#define schedule() do_schedule() +#endif /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */ static inline void task_lock(struct task_struct *p) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/sysctl.h 90-mjb/include/linux/sysctl.h --- 00-virgin/include/linux/sysctl.h Mon Dec 23 23:01:57 2002 +++ 90-mjb/include/linux/sysctl.h Sat Feb 1 22:09:10 2003 @@ -66,7 +66,8 @@ enum CTL_DEV=7, /* Devices */ CTL_BUS=8, /* Busses */ CTL_ABI=9, /* Binary emulation */ - CTL_CPU=10 /* CPU stuff (speed scaling, etc) */ + CTL_CPU=10, /* CPU stuff (speed scaling, etc) */ + CTL_SCHED=11, /* scheduler tunables */ }; /* CTL_BUS names: */ @@ -157,6 +158,20 @@ enum VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */ }; +/* Tunable scheduler parameters in /proc/sys/sched/ */ +enum { + SCHED_MIN_TIMESLICE=1, /* minimum process timeslice */ + SCHED_MAX_TIMESLICE=2, /* maximum process timeslice */ + SCHED_CHILD_PENALTY=3, /* penalty on fork to child */ + SCHED_PARENT_PENALTY=4, /* penalty on fork to parent */ + SCHED_EXIT_WEIGHT=5, /* penalty to parent of CPU hog child */ + SCHED_PRIO_BONUS_RATIO=6, /* percent of max prio given as bonus */ + SCHED_INTERACTIVE_DELTA=7, /* delta used to scale interactivity */ + SCHED_MAX_SLEEP_AVG=8, /* maximum sleep avg attainable */ + SCHED_STARVATION_LIMIT=9, /* no re-active if expired is starved */ + SCHED_IDLE_NODE_REBALANCE_RATIO=10, /* how often to global balance */ + SCHED_BUSY_NODE_REBALANCE_RATIO=11, /* how often to global 
balance */ +}; /* CTL_NET names: */ enum diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/time.h 90-mjb/include/linux/time.h --- 00-virgin/include/linux/time.h Fri Dec 13 23:18:14 2002 +++ 90-mjb/include/linux/time.h Sun Feb 2 13:19:24 2003 @@ -25,6 +25,7 @@ struct timezone { #ifdef __KERNEL__ #include +#include /* * Change timeval to jiffies, trying to avoid the @@ -120,7 +121,7 @@ mktime (unsigned int year, unsigned int } extern struct timespec xtime; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; static inline unsigned long get_seconds(void) { diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/timex.h 90-mjb/include/linux/timex.h --- 00-virgin/include/linux/timex.h Sun Nov 17 20:29:21 2002 +++ 90-mjb/include/linux/timex.h Sat Feb 1 22:04:43 2003 @@ -76,7 +76,7 @@ #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 #else -# error You lose. +# error Please use a HZ value which is between 12 and 1536 #endif /* diff -urpN -X /home/fletch/.diff.exclude 00-virgin/init/main.c 90-mjb/init/main.c --- 00-virgin/init/main.c Fri Jan 17 09:18:32 2003 +++ 90-mjb/init/main.c Sat Feb 1 22:09:06 2003 @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,10 @@ #include #endif +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + /* * Versions of gcc older than that listed below may actually compile * and link okay, but the end product can have subtle run time bugs. @@ -374,6 +379,7 @@ asmlinkage void __init start_kernel(void */ lock_kernel(); printk(linux_banner); + setup_early_printk(&command_line); setup_arch(&command_line); setup_per_cpu_areas(); @@ -444,6 +450,12 @@ asmlinkage void __init start_kernel(void * make syscalls (and thus be locked). 
*/ init_idle(current, smp_processor_id()); + +#ifdef CONFIG_X86_REMOTE_DEBUG + if (gdb_enter) { + gdb_hook(); /* right at boot time */ + } +#endif /* Do the rest non-__init'ed, we're now alive */ rest_init(); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/kernel/Makefile 90-mjb/kernel/Makefile --- 00-virgin/kernel/Makefile Thu Jan 9 19:16:15 2003 +++ 90-mjb/kernel/Makefile Sat Feb 1 22:00:06 2003 @@ -22,6 +22,7 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o obj-$(CONFIG_COMPAT) += compat.o +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff -urpN -X /home/fletch/.diff.exclude 00-virgin/kernel/early_printk.c 90-mjb/kernel/early_printk.c --- 00-virgin/kernel/early_printk.c Wed Dec 31 16:00:00 1969 +++ 90-mjb/kernel/early_printk.c Sat Feb 1 22:00:06 2003 @@ -0,0 +1,209 @@ +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +static int current_ypos = 1, current_xpos = 0; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= MAX_YPOS) { + /* scroll 1 line up */ + for(k = 1, j = 0; k < MAX_YPOS; k++, j++) { + for(i = 0; i < MAX_XPOS; i++) { + writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), + VGABASE + 2*(MAX_XPOS*j + i)); + } + } + for(i = 0; i < MAX_XPOS; i++) { + writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); + } + current_ypos = MAX_YPOS-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(MAX_XPOS*current_ypos + current_xpos++)); + if (current_xpos >= MAX_XPOS) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + 
.flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ + +int early_serial_base; /* ttyS0 */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + rep_nop(); + outb(ch, early_serial_base + TXR); + return timeout ? 0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + early_serial_putc(*s); + if (*s == '\n') + early_serial_putc('\r'); + s++; + } +} + +static __init void early_serial_init(char *opt) +{ + unsigned char c; + unsigned divisor, baud = DEFAULT_BAUD; + static int bases[] = SERIAL_BASES; + char *s, *e; + + early_serial_base = bases[0]; + + if (*opt == ',') + ++opt; + + s = strsep(&opt, ","); + if (s != NULL) { + unsigned port; + if (!strncmp(s,"0x",2)) + early_serial_base = simple_strtoul(s, &e, 16); + else { + if (!strncmp(s,"ttyS",4)) + s+=4; + port = simple_strtoul(s, &e, 10); + if (port > (SERIAL_BASES_LEN-1) || s == e) + port = 0; + early_serial_base = bases[port]; + } + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + s = strsep(&opt, ","); + if (s != NULL) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Direct interface for emergencies */ +struct console *early_console = &early_vga_console; +static
int early_console_initialized = 0; + +void early_printk(const char *fmt, ...) +{ + char buf[512]; + int n; + va_list ap; + va_start(ap,fmt); + n = vsnprintf(buf,512,fmt,ap); + early_console->write(early_console,buf,n); + va_end(ap); +} + +static int keep_early; + +int __init setup_early_printk(char *opt) +{ + char *space, *s; + char buf[256]; + + s = strstr(opt, "earlyprintk="); + if (s == NULL) + return -1; + opt = s+12; + + if (early_console_initialized) + return -1; + + strncpy(buf,opt,256); + buf[255] = 0; + space = strchr(buf, ' '); + if (space) + *space = 0; + + if (strstr(buf,"keep")) + keep_early = 1; + + if (!strncmp(buf, "serial", 6)) { + early_serial_init(buf + 6); + early_console = &early_serial_console; + } else if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf); + early_console = &early_serial_console; + } else if (!strncmp(buf, "vga", 3)) { + early_console = &early_vga_console; + } else { + early_console = NULL; + return -1; + } + early_console_initialized = 1; + register_console(early_console); + early_printk( "early printk console registered\n" ); + return 0; +} + +void __init disable_early_printk(void) +{ + if (!early_console_initialized || !early_console) + return; + if (!keep_early) { + printk("disabling early console...\n"); + unregister_console(early_console); + early_console_initialized = 0; + } else { + printk("keeping early console.\n"); + } +} + +/* syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. 
*/ +__setup("earlyprintk=", setup_early_printk); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/kernel/ksyms.c 90-mjb/kernel/ksyms.c --- 00-virgin/kernel/ksyms.c Tue Jan 14 10:06:19 2003 +++ 90-mjb/kernel/ksyms.c Sat Feb 1 22:09:06 2003 @@ -469,7 +469,10 @@ EXPORT_SYMBOL(sleep_on); EXPORT_SYMBOL(sleep_on_timeout); EXPORT_SYMBOL(interruptible_sleep_on); EXPORT_SYMBOL(interruptible_sleep_on_timeout); -EXPORT_SYMBOL(schedule); +EXPORT_SYMBOL(do_schedule); +#ifdef CONFIG_KGDB_THREAD +EXPORT_SYMBOL(kern_schedule); +#endif #ifdef CONFIG_PREEMPT EXPORT_SYMBOL(preempt_schedule); #endif diff -urpN -X /home/fletch/.diff.exclude 00-virgin/kernel/sched.c 90-mjb/kernel/sched.c --- 00-virgin/kernel/sched.c Fri Jan 17 09:18:32 2003 +++ 90-mjb/kernel/sched.c Sat Feb 1 22:09:10 2003 @@ -33,6 +33,12 @@ #include #include +#ifdef CONFIG_NUMA +#define __cpu_to_node_mask(cpu) __node_to_cpu_mask(__cpu_to_node(cpu)) +#else +#define __cpu_to_node_mask(cpu) (cpu_online_map) +#endif + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -57,16 +63,30 @@ * Minimum timeslice is 10 msecs, default timeslice is 150 msecs, * maximum timeslice is 300 msecs. Timeslices get refilled after * they expire. 
+ * + * They are configurable via /proc/sys/sched */ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (300 * HZ / 1000) -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (2*HZ) -#define STARVATION_LIMIT (2*HZ) + +int min_timeslice = (10 * HZ) / 1000; +int max_timeslice = (300 * HZ) / 1000; +int child_penalty = 95; +int parent_penalty = 100; +int exit_weight = 3; +int prio_bonus_ratio = 25; +int interactive_delta = 2; +int max_sleep_avg = 2 * HZ; +int starvation_limit = 2 * HZ; + +#define MIN_TIMESLICE (min_timeslice) +#define MAX_TIMESLICE (max_timeslice) +#define CHILD_PENALTY (child_penalty) +#define PARENT_PENALTY (parent_penalty) +#define EXIT_WEIGHT (exit_weight) +#define PRIO_BONUS_RATIO (prio_bonus_ratio) +#define INTERACTIVE_DELTA (interactive_delta) +#define MAX_SLEEP_AVG (max_sleep_avg) +#define STARVATION_LIMIT (starvation_limit) + #define NODE_THRESHOLD 125 /* @@ -153,10 +173,9 @@ struct runqueue { nr_uninterruptible; task_t *curr, *idle; prio_array_t *active, *expired, arrays[2]; - int prev_nr_running[NR_CPUS]; + int prev_cpu_load[NR_CPUS]; #ifdef CONFIG_NUMA atomic_t *node_nr_running; - unsigned int nr_balanced; int prev_node_load[MAX_NUMNODES]; #endif task_t *migration_thread; @@ -224,6 +243,83 @@ __init void node_nr_running_init(void) #endif /* CONFIG_NUMA */ + +struct schedstat { + /* sys_sched_yield stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule stats */ + unsigned long sched_noswitch; + unsigned long sched_switch; + unsigned long sched_cnt; + + /* load_balance stats */ + unsigned long lb_imbalance; + unsigned long lb_idle; + unsigned long lb_resched; + unsigned long lb_cnt; + unsigned long lb_nobusy; +} ____cacheline_aligned; + +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that 
tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 1 + +struct schedstat schedstats[NR_CPUS]; + +/* + * This could conceivably exceed a page's worth of output on machines with + * large number of cpus, where large == about 4096/100 or 40ish. Start + * worrying when we pass 32, probably. Then this has to stop being a + * "simple" entry in proc/proc_misc.c and needs to be an actual seq_file. + */ +int schedstats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct schedstat sums; + int i, len; + + memset(&sums, 0, sizeof(sums)); + len = sprintf(page, "version %d\n", SCHEDSTAT_VERSION); + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) continue; + sums.yld_exp_empty += schedstats[i].yld_exp_empty; + sums.yld_act_empty += schedstats[i].yld_act_empty; + sums.yld_both_empty += schedstats[i].yld_both_empty; + sums.yld_cnt += schedstats[i].yld_cnt; + sums.sched_noswitch += schedstats[i].sched_noswitch; + sums.sched_switch += schedstats[i].sched_switch; + sums.sched_cnt += schedstats[i].sched_cnt; + sums.lb_idle += schedstats[i].lb_idle; + sums.lb_resched += schedstats[i].lb_resched; + sums.lb_cnt += schedstats[i].lb_cnt; + sums.lb_imbalance += schedstats[i].lb_imbalance; + sums.lb_nobusy += schedstats[i].lb_nobusy; + len += sprintf(page + len, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + i, schedstats[i].yld_both_empty, + schedstats[i].yld_act_empty, schedstats[i].yld_exp_empty, + schedstats[i].yld_cnt, schedstats[i].sched_noswitch, + schedstats[i].sched_switch, schedstats[i].sched_cnt, + schedstats[i].lb_idle, schedstats[i].lb_resched, + schedstats[i].lb_cnt, schedstats[i].lb_imbalance, + schedstats[i].lb_nobusy); + } + len += sprintf(page + len, + "totals %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + sums.yld_both_empty, sums.yld_act_empty, sums.yld_exp_empty, + sums.yld_cnt, sums.sched_noswitch, sums.sched_switch, + sums.sched_cnt, sums.lb_idle, sums.lb_resched, sums.lb_cnt, +
sums.lb_imbalance, sums.lb_nobusy); + + return len; +} + /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without @@ -583,7 +679,6 @@ static inline task_t * context_switch(ta return prev; } - /* * nr_running, nr_uninterruptible and nr_context_switches: * @@ -765,31 +860,11 @@ static int find_busiest_node(int this_no return node; } -static inline unsigned long cpus_to_balance(int this_cpu, runqueue_t *this_rq) -{ - int this_node = __cpu_to_node(this_cpu); - /* - * Avoid rebalancing between nodes too often. - * We rebalance globally once every NODE_BALANCE_RATE load balances. - */ - if (++(this_rq->nr_balanced) == NODE_BALANCE_RATE) { - int node = find_busiest_node(this_node); - this_rq->nr_balanced = 0; - if (node >= 0) - return (__node_to_cpu_mask(node) | (1UL << this_cpu)); - } - return __node_to_cpu_mask(this_node); -} - -#else /* !CONFIG_NUMA */ - -static inline unsigned long cpus_to_balance(int this_cpu, runqueue_t *this_rq) -{ - return cpu_online_map; -} - #endif /* CONFIG_NUMA */ +int idle_node_rebalance_ratio = 10; +int busy_node_rebalance_ratio = 100; + #if CONFIG_SMP /* @@ -807,10 +882,10 @@ static inline unsigned int double_lock_b spin_lock(&busiest->lock); spin_lock(&this_rq->lock); /* Need to recalculate nr_running */ - if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu])) nr_running = this_rq->nr_running; else - nr_running = this_rq->prev_nr_running[this_cpu]; + nr_running = this_rq->prev_cpu_load[this_cpu]; } else spin_lock(&busiest->lock); } @@ -847,10 +922,10 @@ static inline runqueue_t *find_busiest_q * that case we are less picky about moving a task across CPUs and * take what can be taken. 
*/ - if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu])) nr_running = this_rq->nr_running; else - nr_running = this_rq->prev_nr_running[this_cpu]; + nr_running = this_rq->prev_cpu_load[this_cpu]; busiest = NULL; max_load = 1; @@ -859,11 +934,11 @@ static inline runqueue_t *find_busiest_q continue; rq_src = cpu_rq(i); - if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i])) + if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i])) load = rq_src->nr_running; else - load = this_rq->prev_nr_running[i]; - this_rq->prev_nr_running[i] = rq_src->nr_running; + load = this_rq->prev_cpu_load[i]; + this_rq->prev_cpu_load[i] = rq_src->nr_running; if ((load > max_load) && (rq_src != this_rq)) { busiest = rq_src; @@ -922,7 +997,7 @@ static inline void pull_task(runqueue_t * We call this with the current runqueue locked, * irqs disabled. */ -static void load_balance(runqueue_t *this_rq, int idle) +static void load_balance(runqueue_t *this_rq, int idle, unsigned long cpumask) { int imbalance, idx, this_cpu = smp_processor_id(); runqueue_t *busiest; @@ -930,11 +1005,16 @@ static void load_balance(runqueue_t *thi struct list_head *head, *curr; task_t *tmp; - busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, - cpus_to_balance(this_cpu, this_rq)); - if (!busiest) + schedstats[this_cpu].lb_cnt++; + if (idle) + schedstats[this_cpu].lb_idle++; + busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask); + if (!busiest) { + schedstats[this_cpu].lb_nobusy++; goto out; + } + schedstats[this_cpu].lb_imbalance += imbalance; /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to @@ -1006,21 +1086,76 @@ out: * frequency and balancing agressivity depends on whether the CPU is * idle or not. * - * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. 
(or on + * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on * systems with HZ=100, every 10 msecs.) + * + * On NUMA, do a node-rebalance every 400 msecs. */ -#define BUSY_REBALANCE_TICK (HZ/4 ?: 1) #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) +#define BUSY_REBALANCE_TICK (HZ/5 ?: 1) + +#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * idle_node_rebalance_ratio) +#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * busy_node_rebalance_ratio) -static inline void idle_tick(runqueue_t *rq) +#if CONFIG_NUMA +static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) { - if (jiffies % IDLE_REBALANCE_TICK) - return; - spin_lock(&rq->lock); - load_balance(rq, 1); - spin_unlock(&rq->lock); + int node = find_busiest_node(__cpu_to_node(this_cpu)); + unsigned long cpumask, this_cpumask = 1UL << this_cpu; + + if (node >= 0) { + cpumask = __node_to_cpu_mask(node) | this_cpumask; + spin_lock(&this_rq->lock); + load_balance(this_rq, idle, cpumask); + spin_unlock(&this_rq->lock); + } } +#endif +static void rebalance_tick(runqueue_t *this_rq, int idle) +{ +#if CONFIG_NUMA + int this_cpu = smp_processor_id(); +#endif + unsigned long j = jiffies; + + /* + * First do inter-node rebalancing, then intra-node rebalancing, + * if both events happen in the same tick. The inter-node + * rebalancing does not necessarily have to create a perfect + * balance within the node, since we load-balance the most loaded + * node with the current CPU. (ie. other CPUs in the local node + * are not balanced.) 
+ */ + if (idle) { +#if CONFIG_NUMA + if (!(j % IDLE_NODE_REBALANCE_TICK)) + balance_node(this_rq, idle, this_cpu); +#endif + if (!(j % IDLE_REBALANCE_TICK)) { + spin_lock(&this_rq->lock); + load_balance(this_rq, idle, __cpu_to_node_mask(this_cpu)); + spin_unlock(&this_rq->lock); + } + return; + } +#if CONFIG_NUMA + if (!(j % BUSY_NODE_REBALANCE_TICK)) + balance_node(this_rq, idle, this_cpu); +#endif + if (!(j % BUSY_REBALANCE_TICK)) { + spin_lock(&this_rq->lock); + load_balance(this_rq, idle, __cpu_to_node_mask(this_cpu)); + spin_unlock(&this_rq->lock); + } +} +#else +/* + * on UP we do not need to balance between CPUs: + */ +static inline void rebalance_tick(runqueue_t *this_rq, int idle) +{ +} #endif DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } }; @@ -1063,9 +1198,7 @@ void scheduler_tick(int user_ticks, int kstat_cpu(cpu).cpustat.iowait += sys_ticks; else kstat_cpu(cpu).cpustat.idle += sys_ticks; -#if CONFIG_SMP - idle_tick(rq); -#endif + rebalance_tick(rq, 1); return; } if (TASK_NICE(p) > 0) @@ -1121,11 +1254,8 @@ void scheduler_tick(int user_ticks, int enqueue_task(p, rq->active); } out: -#if CONFIG_SMP - if (!(jiffies % BUSY_REBALANCE_TICK)) - load_balance(rq, 0); -#endif spin_unlock(&rq->lock); + rebalance_tick(rq, 0); } void scheduling_functions_start_here(void) { } @@ -1133,19 +1263,20 @@ void scheduling_functions_start_here(voi /* * schedule() is the main scheduler function. */ -asmlinkage void schedule(void) +asmlinkage void do_schedule(void) { task_t *prev, *next; runqueue_t *rq; prio_array_t *array; struct list_head *queue; - int idx; + int idx, mycpu = smp_processor_id(); /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be.
*/ + schedstats[mycpu].sched_cnt++; if (likely(current->state != TASK_ZOMBIE)) { if (unlikely(in_atomic())) { printk(KERN_ERR "bad: scheduling while atomic!\n"); @@ -1184,7 +1315,8 @@ need_resched: pick_next_task: if (unlikely(!rq->nr_running)) { #if CONFIG_SMP - load_balance(rq, 1); + schedstats[mycpu].lb_resched++; + load_balance(rq, 1, __cpu_to_node_mask(smp_processor_id())); if (rq->nr_running) goto pick_next_task; #endif @@ -1198,11 +1330,13 @@ pick_next_task: /* * Switch the active and expired arrays. */ + schedstats[mycpu].sched_switch++; rq->active = rq->expired; rq->expired = array; array = rq->active; rq->expired_timestamp = 0; } + schedstats[mycpu].sched_noswitch++; idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; @@ -1367,6 +1501,22 @@ void complete_all(struct completion *x) spin_unlock_irqrestore(&x->wait.lock, flags); } +asmlinkage void user_schedule(void) +{ +#ifdef CONFIG_KGDB_THREAD + current->thread.kgdbregs = NULL; +#endif + do_schedule(); +} + +#ifdef CONFIG_KGDB_THREAD +asmlinkage void kern_do_schedule(struct pt_regs regs) +{ + current->thread.kgdbregs = &regs; + do_schedule(); +} +#endif + void wait_for_completion(struct completion *x) { might_sleep(); @@ -1859,6 +2009,7 @@ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); prio_array_t *array = current->array; + int mycpu = smp_processor_id(); /* * We implement yielding by moving the task into the expired @@ -1867,7 +2018,15 @@ asmlinkage long sys_sched_yield(void) * (special rule: RT tasks will just roundrobin in the active * array.)
*/ + schedstats[mycpu].yld_cnt++; if (likely(!rt_task(current))) { + if (current->array->nr_active == 1) { + schedstats[mycpu].yld_act_empty++; + if (!rq->expired->nr_active) + schedstats[mycpu].yld_both_empty++; + } else if (!rq->expired->nr_active) { + schedstats[mycpu].yld_exp_empty++; + } dequeue_task(current, array); enqueue_task(current, rq->expired); } else { diff -urpN -X /home/fletch/.diff.exclude 00-virgin/kernel/sysctl.c 90-mjb/kernel/sysctl.c --- 00-virgin/kernel/sysctl.c Mon Dec 16 21:50:51 2002 +++ 90-mjb/kernel/sysctl.c Sat Feb 1 22:09:10 2003 @@ -55,6 +55,17 @@ extern char core_pattern[]; extern int cad_pid; extern int pid_max; extern int sysctl_lower_zone_protection; +extern int min_timeslice; +extern int max_timeslice; +extern int child_penalty; +extern int parent_penalty; +extern int exit_weight; +extern int prio_bonus_ratio; +extern int interactive_delta; +extern int max_sleep_avg; +extern int starvation_limit; +extern int idle_node_rebalance_ratio; +extern int busy_node_rebalance_ratio; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -112,6 +123,7 @@ static struct ctl_table_header root_tabl static ctl_table kern_table[]; static ctl_table vm_table[]; +static ctl_table sched_table[]; #ifdef CONFIG_NET extern ctl_table net_table[]; #endif @@ -156,6 +168,7 @@ static ctl_table root_table[] = { {CTL_FS, "fs", NULL, 0, 0555, fs_table}, {CTL_DEBUG, "debug", NULL, 0, 0555, debug_table}, {CTL_DEV, "dev", NULL, 0, 0555, dev_table}, + {CTL_SCHED, "sched", NULL, 0, 0555, sched_table}, {0} }; @@ -358,7 +371,46 @@ static ctl_table debug_table[] = { static ctl_table dev_table[] = { {0} -}; +}; + +static ctl_table sched_table[] = { + {SCHED_MAX_TIMESLICE, "max_timeslice", &max_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_MIN_TIMESLICE, "min_timeslice", &min_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, 
NULL, &one, NULL}, + {SCHED_CHILD_PENALTY, "child_penalty", &child_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PARENT_PENALTY, "parent_penalty", &parent_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_EXIT_WEIGHT, "exit_weight", &exit_weight, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PRIO_BONUS_RATIO, "prio_bonus_ratio", &prio_bonus_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_INTERACTIVE_DELTA, "interactive_delta", &interactive_delta, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_MAX_SLEEP_AVG, "max_sleep_avg", &max_sleep_avg, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_STARVATION_LIMIT, "starvation_limit", &starvation_limit, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_IDLE_NODE_REBALANCE_RATIO, "idle_node_rebalance_ratio", + &idle_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_BUSY_NODE_REBALANCE_RATIO, "busy_node_rebalance_ratio", + &busy_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {0} +}; extern void init_irq_proc (void); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/kernel/time.c 90-mjb/kernel/time.c --- 00-virgin/kernel/time.c Sun Nov 17 20:29:28 2002 +++ 90-mjb/kernel/time.c Sun Feb 2 13:19:24 2003 @@ -27,7 +27,6 @@ #include #include #include - #include /* @@ -38,7 +37,7 @@ struct timezone sys_tz; /* The xtime_lock is not only serializing the xtime read/writes but it's also serializing all accesses to the global NTP variables now. 
*/ -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long last_time_offset; #if !defined(__alpha__) && !defined(__ia64__) @@ -80,7 +79,7 @@ asmlinkage long sys_stime(int * tptr) return -EPERM; if (get_user(value, tptr)) return -EFAULT; - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); xtime.tv_sec = value; xtime.tv_nsec = 0; last_time_offset = 0; @@ -88,7 +87,7 @@ asmlinkage long sys_stime(int * tptr) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); return 0; } @@ -96,13 +95,13 @@ asmlinkage long sys_stime(int * tptr) asmlinkage long sys_gettimeofday(struct timeval *tv, struct timezone *tz) { - if (tv) { + if (likely(tv != NULL)) { struct timeval ktv; do_gettimeofday(&ktv); if (copy_to_user(tv, &ktv, sizeof(ktv))) return -EFAULT; } - if (tz) { + if (unlikely(tz != NULL)) { if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) return -EFAULT; } @@ -127,10 +126,10 @@ asmlinkage long sys_gettimeofday(struct */ inline static void warp_clock(void) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); xtime.tv_sec += sys_tz.tz_minuteswest * 60; last_time_offset = 0; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } /* @@ -235,7 +234,7 @@ int do_adjtimex(struct timex *txc) txc->tick > 1100000/USER_HZ) return -EINVAL; - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); result = time_state; /* mostly `TIME_OK' */ /* Save for later - semantics of adjtime is to return old value */ @@ -386,7 +385,7 @@ leave: if ((time_status & (STA_UNSYNC|ST txc->errcnt = pps_errcnt; txc->stbcnt = pps_stbcnt; last_time_offset = 0; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); do_gettimeofday(&txc->time); return(result); } @@ -409,9 +408,13 @@ asmlinkage long sys_adjtimex(struct time struct timespec current_kernel_time(void) { struct timespec now; - unsigned long flags; - 
read_lock_irqsave(&xtime_lock,flags); - now = xtime; - read_unlock_irqrestore(&xtime_lock,flags); + unsigned long seq; + + do { + seq = fr_read_begin(&xtime_lock); + + now = xtime; + } while (seq != fr_read_end(&xtime_lock)); + return now; } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/kernel/timer.c 90-mjb/kernel/timer.c --- 00-virgin/kernel/timer.c Mon Dec 16 21:50:51 2002 +++ 90-mjb/kernel/timer.c Sun Feb 2 13:19:24 2003 @@ -758,7 +758,7 @@ unsigned long wall_jiffies; * This read-write spinlock protects us from races in SMP while * playing with xtime and avenrun. */ -rwlock_t xtime_lock __cacheline_aligned_in_smp = RW_LOCK_UNLOCKED; +frlock_t xtime_lock __cacheline_aligned_in_smp = FR_LOCK_UNLOCKED; unsigned long last_time_offset; /* @@ -798,8 +798,7 @@ static inline void update_times(void) } /* - * The 64-bit jiffies value is not atomic - you MUST NOT read it - * without holding read_lock_irq(&xtime_lock). + * The 64-bit jiffies value is not atomic * jiffies is defined in the linker script... 
*/ @@ -1087,18 +1086,21 @@ asmlinkage long sys_sysinfo(struct sysin struct sysinfo val; unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; + unsigned long seq; memset((char *)&val, 0, sizeof(struct sysinfo)); - read_lock_irq(&xtime_lock); - val.uptime = jiffies / HZ; + do { + seq = fr_read_begin(&xtime_lock); - val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + val.uptime = jiffies / HZ; - val.procs = nr_threads; - read_unlock_irq(&xtime_lock); + val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); + val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); + val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + + val.procs = nr_threads; + } while (seq != fr_read_end(&xtime_lock)); si_meminfo(&val); si_swapinfo(&val); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/mm/memory.c 90-mjb/mm/memory.c --- 00-virgin/mm/memory.c Mon Jan 13 21:09:28 2003 +++ 90-mjb/mm/memory.c Sun Feb 2 13:19:32 2003 @@ -101,8 +101,7 @@ static inline void free_one_pmd(struct m static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) { - int j; - pmd_t * pmd; + pmd_t * pmd, * md, * emd; if (pgd_none(*dir)) return; @@ -113,8 +112,21 @@ static inline void free_one_pgd(struct m } pmd = pmd_offset(dir, 0); pgd_clear(dir); - for (j = 0; j < PTRS_PER_PMD ; j++) - free_one_pmd(tlb, pmd+j); + /* + * Beware if changing the loop below. It once used int j, + * for (j = 0; j < PTRS_PER_PMD; j++) + * free_one_pmd(pmd+j); + * but some older i386 compilers (e.g. egcs-2.91.66, gcc-2.95.3) + * terminated the loop with a _signed_ address comparison + * using "jle", when configured for HIGHMEM64GB (X86_PAE). + * If also configured for 3GB of kernel virtual address space, + * if page at physical 0x3ffff000 virtual 0x7ffff000 is used as + * a pmd, when that mm exits the loop goes on to free "entries" + * found at 0x80000000 onwards. 
The loop below compiles instead + * to be terminated by unsigned address comparison using "jb". + */ + for (md = pmd, emd = pmd + PTRS_PER_PMD; md < emd; md++) + free_one_pmd(tlb,md); pmd_free_tlb(tlb, pmd); } @@ -600,13 +612,12 @@ void zap_page_range(struct vm_area_struc * Do a quick page-table lookup for a single page. * mm->page_table_lock must be held. */ -struct page * -follow_page(struct mm_struct *mm, unsigned long address, int write) +pte_t +__follow_page(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; pmd_t *pmd; pte_t *ptep, pte; - unsigned long pfn; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || pgd_bad(*pgd)) @@ -617,11 +628,25 @@ follow_page(struct mm_struct *mm, unsign goto out; ptep = pte_offset_map(pmd, address); - if (!ptep) + if (!ptep) { + pte.pte_low = 0; //__bad_page(); + pte.pte_high = 0; goto out; - + } pte = *ptep; pte_unmap(ptep); + +out: + return pte; +} + +struct page * +follow_page(struct mm_struct *mm, unsigned long address, int write) +{ + pte_t pte; + unsigned long pfn; + + pte = __follow_page(mm, address); if (pte_present(pte)) { if (!write || (pte_write(pte) && pte_dirty(pte))) { pfn = pte_pfn(pte); @@ -630,7 +655,6 @@ follow_page(struct mm_struct *mm, unsign } } -out: return NULL; } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/net/ipv4/tcp_output.c 90-mjb/net/ipv4/tcp_output.c --- 00-virgin/net/ipv4/tcp_output.c Sun Nov 17 20:29:50 2002 +++ 90-mjb/net/ipv4/tcp_output.c Sun Feb 2 13:19:29 2003 @@ -786,13 +786,13 @@ static void tcp_retrans_try_collapse(str /* Ok. We will be able to collapse the packet. 
*/ __skb_unlink(next_skb, next_skb->list); + memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); + if (next_skb->ip_summed == CHECKSUM_HW) skb->ip_summed = CHECKSUM_HW; - if (skb->ip_summed != CHECKSUM_HW) { - memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); + if (skb->ip_summed != CHECKSUM_HW) skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); - } /* Update sequence range on original skb. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;