diff -urpN -X /home/fletch/.diff.exclude 00-virgin/Documentation/filesystems/proc.txt 90-mjb/Documentation/filesystems/proc.txt --- 00-virgin/Documentation/filesystems/proc.txt Mon Jan 13 21:09:08 2003 +++ 90-mjb/Documentation/filesystems/proc.txt Wed Feb 5 22:23:08 2003 @@ -37,6 +37,7 @@ Table of Contents 2.8 /proc/sys/net/ipv4 - IPV4 settings 2.9 Appletalk 2.10 IPX + 2.11 /proc/sys/sched - scheduler tunables ------------------------------------------------------------------------------ Preface @@ -1662,6 +1663,104 @@ IPX. The /proc/net/ipx_route table holds a list of IPX routes. For each route it gives the destination network, the router node (or Directly) and the network address of the router (or Connected) for internal networks. + +2.11 /proc/sys/sched - scheduler tunables +----------------------------------------- + +Useful knobs for tuning the scheduler live in /proc/sys/sched. + +child_penalty +------------- + +Percentage of the parent's sleep_avg that children inherit. sleep_avg is +a running average of the time a process spends sleeping. Tasks with high +sleep_avg values are considered interactive and given a higher dynamic +priority and a larger timeslice. You typically want this some value just +under 100. + +exit_weight +----------- + +When a CPU hog task exits, its parent's sleep_avg is reduced by a factor of +exit_weight against the exiting task's sleep_avg. + +interactive_delta +----------------- + +If a task is "interactive" it is reinserted into the active array after it +has expired its timeslice, instead of being inserted into the expired array. +How "interactive" a task must be in order to be deemed interactive is a +function of its nice value. This interactive limit is scaled linearly by nice +value and is offset by the interactive_delta. + +max_sleep_avg +------------- + +max_sleep_avg is the largest value (in ms) stored for a task's running sleep +average. 
The larger this value, the longer a task needs to sleep to be +considered interactive (maximum interactive bonus is a function of +max_sleep_avg). + +max_timeslice +------------- + +Maximum timeslice, in milliseconds. This is the value given to tasks of the +highest dynamic priority. + +min_timeslice +------------- + +Minimum timeslice, in milliseconds. This is the value given to tasks of the +lowest dynamic priority. Every task gets at least this slice of the processor +per array switch. + +parent_penalty +-------------- + +Percentage of the parent's sleep_avg that it retains across a fork(). +sleep_avg is a running average of the time a process spends sleeping. Tasks +with high sleep_avg values are considered interactive and given a higher +dynamic priority and a larger timeslice. Normally, this value is 100 and thus +tasks retain their sleep_avg on fork. If you want to punish interactive +tasks for forking, set this below 100. + +prio_bonus_ratio +---------------- + +Middle percentage of the priority range that tasks can receive as a dynamic +priority. The default value of 25% ensures that nice values at the +extremes are still enforced. For example, nice +19 interactive tasks will +never be able to preempt a nice 0 CPU hog. Setting this higher will increase +the size of the priority range the tasks can receive as a bonus. Setting +this lower will decrease this range, making the interactivity bonus less +apparent and user nice values more applicable. + +starvation_limit +---------------- + +Sufficiently interactive tasks are reinserted into the active array when they +run out of timeslice. Normally, tasks are inserted into the expired array. +Reinserting interactive tasks into the active array allows them to remain +runnable, which is important to interactive performance. This could starve +expired tasks, however, since the interactive task could prevent the array +switch. To prevent starving the tasks on the expired array for too long, 
the +starvation_limit is the longest (in ms) we will let the expired array starve +at the expense of reinserting interactive tasks back into active. Higher +values here give more preference to running interactive tasks, at the expense +of expired tasks. Lower values provide more fair scheduling behavior, at the +expense of interactivity. The units are in milliseconds. + +idle_node_rebalance_ratio +------------------------- + +On NUMA machines, we normally rebalance within nodes, but we also rebalance +globally every N idle rebalance ticks, where N = idle_node_rebalance_ratio. + +busy_node_rebalance_ratio +------------------------- + +On NUMA machines, we normally rebalance within nodes, but we also rebalance +globally every N busy rebalance ticks, where N = busy_node_rebalance_ratio. ------------------------------------------------------------------------------ Summary diff -urpN -X /home/fletch/.diff.exclude 00-virgin/Documentation/i386/gdb-serial.txt 90-mjb/Documentation/i386/gdb-serial.txt --- 00-virgin/Documentation/i386/gdb-serial.txt Wed Dec 31 16:00:00 1969 +++ 90-mjb/Documentation/i386/gdb-serial.txt Wed Feb 5 22:23:05 2003 @@ -0,0 +1,386 @@ +Version +======= + +This version of the gdbstub package was developed and tested on +kernel version 2.3.48. It will not install on a 2.2 kernel. It may +not work on earlier versions of 2.3 kernels. It is possible that +it will continue to work on later versions of 2.3 and then +versions of 2.4 (I hope). + + +Debugging Setup +=============== + +Designate one machine as the "development" machine. This is the +machine on which you run your compiles and which has your source +code for the kernel. Designate a second machine as the "target" +machine. This is the machine that will run your experimental +kernel. + +The two machines will be connected together via a serial line out +one or the other of the COM ports of the PC. You will need a modem +eliminator and the appropriate cables. 
+ +On the DEVELOPMENT machine you need to apply the patch for the gdb +hooks. You have probably already done that if you are reading this +file. + +On your DEVELOPMENT machine, go to your kernel source directory and +do "make menuconfig". Go down to the kernel hacking menu item and +open it up. Enable the kernel gdb stub code by selecting that item. + +Save and exit the menuconfig program. Then do "make clean" and +"make bzImage" (or whatever target you want to make). This gets +the kernel compiled with the "-g" option set -- necessary for +debugging. + +You have just built the kernel on your DEVELOPMENT machine that you +intend to run on your TARGET machine. + +To install this new kernel, use the following installation procedure. +Remember, you are on the DEVELOPMENT machine patching the kernel source +for the kernel that you intend to run on the TARGET machine. + +Copy this kernel to your target machine using your usual procedures. +I usually arrange to copy development:/usr/src/linux/arch/i386/boot/zImage +to /vmlinuz on the TARGET machine via a LAN based NFS access. That is, +I run the cp command on the target and copy from the development machine +via the LAN. Run Lilo on the new kernel on the target machine so that it +will boot! Then boot the kernel on the target machine. + +There is a utility program named "gdbstart" in the +development:/usr/src/linux/arch/i386/kernel directory. +You should copy this program over to your target machine, probably into +/sbin. This utility program is run on the target machine to +activate the kernel hooks for the debugger. It is invoked as follows: + + gdbstart [-s speed] [-t tty-dev] + defaults: /dev/ttyS0 with speed unmodified by gdbstart + +Don't run the program just yet. We'll get to that in a bit. + +Decide on which tty port you want the machines to communicate, then +cable them up back-to-back using the null modem. COM1 is /dev/ttyS0 +and COM2 is /dev/ttyS1. 
+ +On the DEVELOPMENT machine, create a file called .gdbinit in the +directory /usr/src/linux. An example .gdbinit file looks like this: + +define rmt +set remotebaud 38400 +target remote /dev/ttyS0 +end + +Assuming that you added my gdbinit stuff to your .gdbinit, edit .gdbinit +and find the section that looks like this: + + define rmt + set remotebaud 38400 + target remote /dev/ttyS0 + end + +Change the "target" definition so that it specifies the tty port that +you intend to use. Change the "remotebaud" definition to match the +data rate that you are going to use for the com line. + +On the TARGET machine I find it helpful to create shell script file +named "debug" in the root home directory with the following contents: + + gdbstart -s 38400 -t /dev/ttyS0 < + EOF + +This runs the gdbstart program and gives it the carriage return that +it prompts for. This sets the data rate from the target machine's side. + +You are now ready to try it out. + +On your TARGET machine, freshly rebooted with your gdbstub-equipped +kernel, type "debug" in the root home directory. The system will appear +to hang with some messages on the screen from the debug stub. What +it is doing is waiting for contact from the development machine. + +On your DEVELOPMENT machine, cd /usr/src/linux and enter "gdb vmlinux". +When gdb gets the symbols loaded and prompts you, enter "rmt" (that's +the macro from the .gdbinit file that you just edited). If everything +is working correctly you should see gdb print out a few lines indicating +that a breakpoint has been taken. It will actually show a line of +code in the target kernel inside the gdbstub activation code. + +The gdb interaction should look something like this: + + linux-dev:/usr/src/linux# gdb vmlinux + GDB is free software and you are welcome to distribute copies of it + under certain conditions; type "show copying" to see the conditions. + There is absolutely no warranty for GDB; type "show warranty" for details. 
+ GDB 4.15.1 (i486-slackware-linux), + Copyright 1995 Free Software Foundation, Inc... + (gdb) rmt + breakpoint () at i386-stub.c:750 + 750 } + (gdb) + + +You can now use whatever gdb commands you like to set breakpoints. +Enter "continue" to start your target machine executing again. At this +point the target system will run at full speed until it encounters +your breakpoint or gets a segment violation in the kernel, or whatever. + + +Triggering gdbstub at Kernel Boot Time +====================================== + +The gdbstub patch now has the ability for gdb to connect to the kernel during +bootup (as opposed to waiting for the system to come all the way up and then +running the gdbstart program on the target machine). This new functionality was +added by Scott Foehner at SGI. + +To force a kernel that has been compiled with gdbstub to pause during the boot +process and wait for a connection from gdb, the parameter "gdb" should be passed +to the kernel. This can be done by typing "gdb" after the name of the kernel +on the LILO command line. The patch defaults to use ttyS1 at a baud rate of +38400. These parameters can be changed by using "gdbttyS=" and +"gdbbaud=" on the command line. + +Example: + +LILO boot: linux gdb gdbttyS=1 gdbbaud=38400 + +Note that this command is entered on the TARGET machine as it is booting +the kernel that was compiled on the DEVELOPMENT machine. + +An alternate approach is to place a line in the /etc/lilo.conf file on +your TARGET machine. Under the heading for the kernel that you intend +to boot, place a line that looks like this: + + append = "gdb gdbttyS=1 gdbbaud=38400" + +This will cause the kernel to enter the gdbstub automatically at boot +time. + +BE SURE to run "lilo" after changing the /etc/lilo.conf file. + + +The "gdbstart" Program +===================== + +This utility program is used to set up the com port and data rate +for the connection from the target system to the development system. 
+Its usage has been described above. + +This version of the patch uses the same tty ioctl for kernel versions +2.0.30 onwards. Thus, the gdbstart utility does not need to be re-compiled +to install the patch in a later version of the kernel. The ioctl added +to the kernel for this purpose is far enough "off the end" of existing +ioctls (as of 2.1.120) that it should not interfere with any new kernel +tty ioctls for quite some time (famous last words). + +The source for the gdbstart program resides in the arch/i386/kernel directory. + + +Debugging hints +=============== + +You can break into the target machine at any time from the development +machine by typing ^C. If the target machine has interrupts enabled +this will stop it in the kernel and enter the debugger. + +There is unfortunately no way of breaking into the kernel if it is +in a loop with interrupts disabled, so if this happens to you then +you need to place exploratory breakpoints or printk's into the kernel +to find out where it is looping. + +There is a copy of an e-mail in the kgdb distribution directory which +describes how to create an NMI on an ISA bus machine using a paper +clip. I have a sophisticated version of this made by wiring a push +button switch into a PC104/ISA bus adapter card. The adapter card +nicely furnishes wire wrap pins for all the ISA bus signals. + +When you are done debugging the kernel on the target machine it is +a good idea to leave it in a running state. This makes reboots +faster, bypassing the fsck. So do a gdb "continue" as the last gdb +command if this is possible. To terminate gdb itself on the development +machine and leave the target machine running, type ^Z to suspend gdb +and then kill it with "kill %1" or something similar. + +If gdbstub Does Not Work +======================== + +If it doesn't work, you will have to troubleshoot it. Do the easy things +first like double checking your cabling and data rates. 
You might +try some non-kernel based programs to see if the back-to-back connection +works properly. Just something simple like cat /etc/hosts >/dev/ttyS0 +on one machine and cat /dev/ttyS0 on the other will tell you if you +can send data from one machine to the other. There is no point in tearing +out your hair in the kernel if the line doesn't work. + +All of the real action takes place in the file +/usr/src/linux/arch/i386/kernel/gdbstub.c. That is the code on the target +machine that interacts with gdb on the development machine. In gdb you can +turn on a debug switch with the following command: + + set remotedebug + +This will print out the protocol messages that gdb is exchanging with +the target machine. + +Another place to look is /usr/src/linux/drivers/char/gdbserial.c +That is the code that talks to the serial port on the target side. +There might be a problem there. + +If you are really desperate you can use printk debugging in the +gdbstub code in the target kernel until you get it working. In particular, +there is a global variable in /usr/src/linux/arch/i386/kernel/gdbstub.c +named "remote_debug". Compile your kernel with this set to 1, rather +than 0 and the debug stub will print out lots of stuff as it does +what it does. + + +Debugging Loadable Modules +========================== + +This technique comes courtesy of Edouard Parmelan + + +When you run gdb, enter the command + +source gdbinit-modules + +This will read in a file of gdb macros that was installed in your +kernel source directory with kgdb was installed. This file implements +the following commands: + +mod-list + Lists the loaded modules in the form + +mod-print-symbols + Prints all the symbols in the indicated module. + +mod-add-symbols + Loads the symbols from the object file and associates them + with the indicated module. + +After you have loaded the module that you want to debug, use the command +mod-list to find the of your module. 
Then use that +address in the mod-add-symbols command to load your module's symbols. +From that point onward you can debug your module as if it were a part +of the kernel. + +The file gdbinit-modules also contains a command named mod-add-lis as +an example of how to construct a command of your own to load your +favorite module. The idea is to "can" the pathname of the module +in the command so you don't have to type so much. + +Threads +======= + +Each process in a target machine is seen as a gdb thread. gdb thread related +commands (info threads, thread n) can be used. + +ia-32 hardware breakpoints +========================== + +gdb stub contains support for hardware breakpoints using debugging features +of ia-32(x86) processors. These breakpoints do not need code modification. +They use debugging registers. 4 hardware breakpoints are available in ia-32 +processors. + +Each hardware breakpoint can be of one of the following three types. +1. Execution breakpoint - An Execution breakpoint is triggered when code at the + breakpoint address is executed. + + As limited number of hardware breakpoints are available, it is advisable + to use software breakpoints ( break command ) instead of execution + hardware breakpoints, unless modification of code is to be avoided. + +2. Write breakpoint - A write breakpoint is triggered when memory location at the + breakpoint address is written. + + A write or can be placed for data of variable length. Length of a write + breakpoint indicates length of the datatype to be watched. Length is 1 + for 1 byte data , 2 for 2 byte data, 3 for 4 byte data. + +3. Access breakpoint - An access breakpoint is triggered when memory location at + the breakpoint address is either read or written. + + Access breakpoints also have lengths similar to write breakpoints. + +IO breakpoints in ia-32 are not supported. 
+ +Since gdb stub at present does not use the protocol used by gdb for hardware +breakpoints, hardware breakpoints are accessed through gdb macros. gdb macros +for hardware breakpoints are described below. + +hwebrk - Places an execution breakpoint + hwebrk breakpointno address +hwwbrk - Places a write breakpoint + hwwbrk breakpointno length address +hwabrk - Places an access breakpoint + hwabrk breakpointno length address +hwrmbrk - Removes a breakpoint + hwrmbrk breakpointno +exinfo - Tells whether a software or hardware breakpoint has occurred. + Prints number of the hardware breakpoint if a hardware breakpoint has + occurred. + +Arguments required by these commands are as follows +breakpointno - 0 to 3 +length - 1 to 3 +address - Memory location in hex digits ( without 0x ) e.g c015e9bc + +MP support +========== + +When a breakpoint occurs or user issues a break ( Ctrl + C ) to gdb client, +all the processors are forced to enter the debugger. Current thread +corresponds to the thread running on the processor where breakpoint occurred. +Threads running on other processor(s) appear similar to other non running +threads in the 'info threads' output. + +ia-32 hardware debugging registers on all processors are set to same values. +Hence any hardware breakpoints may occur on any processor. + +gdb troubleshooting +=================== + +1. gdb hangs +Kill it. restart gdb. Connect to target machine. + +2. gdb cannot connect to target machine (after killing a gdb and restarting +another) +If the target machine was not inside debugger when you killed gdb, gdb cannot +connect because the target machine won't respond. +In this case echo "Ctrl+C"(ascii 3) in the serial line. +e.g. echo -e "\003" > /dev/ttyS1 +This forces that target machine into debugger after which you can connect. + +3. gdb cannot connect even after echoing Ctrl+C into serial line +Try changing serial line settings min to 1 and time to 0 +e.g. 
stty min 1 time 0 < /dev/ttyS1 +Try echoing again + +check serial line speed and set it to correct value if required +e.g. stty ispeed 115200 ospeed 115200 < /dev/ttyS1 + +Final Items +=========== + +I picked up this code from Dave Grothe and enhanced it. + +If you make some really cool modification to this stuff, or if you +fix a bug, please let me know. + +Amit S. Kale + + +(First kgdb by David Grothe ) + +(modified by Tigran Aivazian ) + Putting gdbstub into the kernel config menu. + +(modified by Scott Foehner ) + Hooks for entering gdbstub at boot time. + +(modified by Amit S. Kale ) + Threads, ia-32 hw debugging, mp support, console support, + nmi watchdog handling. diff -urpN -X /home/fletch/.diff.exclude 00-virgin/Documentation/sysrq.txt 90-mjb/Documentation/sysrq.txt --- 00-virgin/Documentation/sysrq.txt Thu Jan 2 22:04:57 2003 +++ 90-mjb/Documentation/sysrq.txt Wed Feb 5 22:23:05 2003 @@ -73,6 +73,8 @@ On other - If you know of the key combos 'l' - Send a SIGKILL to all processes, INCLUDING init. (Your system will be non-functional after this.) +'g' - Enter the kernel debugger (if configured and supported). + 'h' - Will display help ( actually any other key than those listed above will display help. 
but 'h' is easy to remember :-) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/Makefile 90-mjb/Makefile --- 00-virgin/Makefile Fri Jan 17 09:18:19 2003 +++ 90-mjb/Makefile Thu Feb 6 19:49:50 2003 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 5 SUBLEVEL = 59 -EXTRAVERSION = +EXTRAVERSION = -mjb4 # *DOCUMENTATION* # To see a list of typical targets execute "make help" @@ -47,7 +47,7 @@ TOPDIR := $(CURDIR) HOSTCC = gcc HOSTCXX = g++ -HOSTCFLAGS = -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer +HOSTCFLAGS = -Wall -Wstrict-prototypes -O2 HOSTCXXFLAGS = -O2 CROSS_COMPILE = @@ -260,8 +260,8 @@ ifdef CONFIG_MODULES export EXPORT_FLAGS := -DEXPORT_SYMTAB endif -ifndef CONFIG_FRAME_POINTER -CFLAGS += -fomit-frame-pointer +ifdef CONFIG_X86_REMOTE_DEBUG +CFLAGS += -g endif # diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/alpha/kernel/time.c 90-mjb/arch/alpha/kernel/time.c --- 00-virgin/arch/alpha/kernel/time.c Thu Jan 2 22:04:58 2003 +++ 90-mjb/arch/alpha/kernel/time.c Thu Feb 6 19:49:45 2003 @@ -51,7 +51,7 @@ u64 jiffies_64; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long wall_jiffies; /* kernel/timer.c */ static int set_rtc_mmss(unsigned long); @@ -106,7 +106,7 @@ void timer_interrupt(int irq, void *dev, alpha_do_profile(regs->pc); #endif - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); /* * Calculate how many ticks have passed since the last update, @@ -138,7 +138,7 @@ void timer_interrupt(int irq, void *dev, state.last_rtc_update = xtime.tv_sec - (tmp ? 
600 : 0); } - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } void @@ -410,18 +410,19 @@ time_init(void) void do_gettimeofday(struct timeval *tv) { - unsigned long sec, usec, lost, flags; + unsigned long sec, usec, lost, seq; unsigned long delta_cycles, delta_usec, partial_tick; - read_lock_irqsave(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); - delta_cycles = rpcc() - state.last_time; - sec = xtime.tv_sec; - usec = (xtime.tv_nsec / 1000); - partial_tick = state.partial_tick; - lost = jiffies - wall_jiffies; + delta_cycles = rpcc() - state.last_time; + sec = xtime.tv_sec; + usec = (xtime.tv_nsec / 1000); + partial_tick = state.partial_tick; + lost = jiffies - wall_jiffies; - read_unlock_irqrestore(&xtime_lock, flags); + } while (seq != fr_read_end(&xtime_lock)); #ifdef CONFIG_SMP /* Until and unless we figure out how to get cpu cycle counters @@ -463,7 +464,7 @@ do_settimeofday(struct timeval *tv) unsigned long delta_usec; long sec, usec; - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* The offset that is added into time in do_gettimeofday above must be subtracted out here to keep a coherent view of the @@ -494,7 +495,7 @@ do_settimeofday(struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/arm/kernel/time.c 90-mjb/arch/arm/kernel/time.c --- 00-virgin/arch/arm/kernel/time.c Thu Jan 2 22:04:58 2003 +++ 90-mjb/arch/arm/kernel/time.c Thu Feb 6 19:49:45 2003 @@ -34,7 +34,7 @@ u64 jiffies_64; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long wall_jiffies; /* this needs a better home */ @@ -147,19 +147,20 @@ static void do_leds(void) void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec, lost; - read_lock_irqsave(&xtime_lock, flags); - usec = gettimeoffset(); - - lost = jiffies - 
wall_jiffies; - if (lost) - usec += lost * USECS_PER_JIFFY; - - sec = xtime.tv_sec; - usec += xtime.tv_nsec / 1000; - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + usec = gettimeoffset(); + + lost = jiffies - wall_jiffies; + if (lost) + usec += lost * USECS_PER_JIFFY; + + sec = xtime.tv_sec; + usec += xtime.tv_nsec / 1000; + } while (seq != fr_read_end(&xtime_lock)); /* usec may have gone up a lot: be safe */ while (usec >= 1000000) { @@ -173,7 +174,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* * This is revolting. We need to set "xtime" correctly. However, the * value in this location is the value at the most recent update of @@ -194,7 +195,7 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } static struct irqaction timer_irq = { diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/Kconfig 90-mjb/arch/i386/Kconfig --- 00-virgin/arch/i386/Kconfig Fri Jan 17 09:18:19 2003 +++ 90-mjb/arch/i386/Kconfig Thu Feb 6 19:49:39 2003 @@ -328,11 +328,6 @@ config X86_ALIGNMENT_16 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 default y -config X86_TSC - bool - depends on MWINCHIP3D || MWINCHIP2 || MCRUSOE || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 - default y - config X86_GOOD_APIC bool depends on MK7 || MPENTIUM4 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 @@ -474,7 +469,7 @@ config NR_CPUS # Common NUMA Features config NUMA bool "Numa Memory Allocation Support" - depends on X86_NUMAQ + depends on (X86_NUMAQ || X86_SUMMIT) config DISCONTIGMEM bool @@ -486,6 +481,11 @@ config HAVE_ARCH_BOOTMEM_NODE depends on NUMA default y 
+config X86_TSC + bool + depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8) && !X86_NUMAQ + default y + config X86_MCE bool "Machine Check Exception" ---help--- @@ -660,6 +660,44 @@ config HIGHMEM64G endchoice +choice + help + On i386, a process can only virtually address 4GB of memory. This + lets you select how much of that virtual space you would like to + devoted to userspace, and how much to the kernel. + + Some userspace programs would like to address as much as possible and + have few demands of the kernel other than it get out of the way. These + users may opt to use the 3.5GB option to give their userspace program + as much room as possible. Due to alignment issues imposed by PAE, + the "3.5GB" option is unavailable if "64GB" high memory support is + enabled. + + Other users (especially those who use PAE) may be running out of + ZONE_NORMAL memory. Those users may benefit from increasing the + kernel's virtual address space size by taking it away from userspace, + which may not need all of its space. An indicator that this is + happening is when /proc/Meminfo's "LowFree:" is a small percentage of + "LowTotal:" while "HighFree:" is very large. + + If unsure, say "3GB" + prompt "User address space size" + default 1GB + +config 05GB + bool "3.5 GB" + depends on !HIGHMEM64G + +config 1GB + bool "3 GB" + +config 2GB + bool "2 GB" + +config 3GB + bool "1 GB" +endchoice + config HIGHMEM bool depends on HIGHMEM64G || HIGHMEM4G @@ -738,6 +776,25 @@ config MTRR See for more information. +choice + help + This is unrelated to your processor's speed. This variable alters + how often the system is asked to generate timer interrupts. A larger + value can lead to a more responsive system, but also causes extra + overhead from the increased number of context switches. + + If in doubt, leave it at the default of 1000. 
+ + prompt "Kernel HZ" + default 1000HZ + +config 100HZ + bool "100 Hz" + +config 1000HZ + bool "1000 Hz" +endchoice + config HAVE_DEC_LOCK bool depends on (SMP || PREEMPT) && X86_CMPXCHG @@ -1557,6 +1614,17 @@ config DEBUG_SLAB allocation as well as poisoning memory on free to catch use of freed memory. +config X86_REMOTE_DEBUG + bool "KGDB: Remote (serial) kernel debugging with gdb" + +config KGDB_THREAD + bool "KGDB: Thread analysis" + depends on X86_REMOTE_DEBUG + +config GDB_CONSOLE + bool "KGDB: Console messages through gdb" + depends on X86_REMOTE_DEBUG + config DEBUG_IOVIRT bool "Memory mapped I/O debugging" depends on DEBUG_KERNEL @@ -1582,6 +1650,26 @@ config MAGIC_SYSRQ keys are documented in . Don't say Y unless you really know what this hack does. +config EARLY_PRINTK + bool "Early console support" + default n + depends on DEBUG_KERNEL + help + Write kernel log output directly into the VGA buffer or serial port. + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. For normal operation + it is not recommended because it looks ugly and doesn't cooperate + with klogd/syslogd or the X server.You should normally N here, + unless you want to debug such a crash. + + Syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. + config DEBUG_SPINLOCK bool "Spinlock debugging" depends on DEBUG_KERNEL @@ -1612,7 +1700,9 @@ config DEBUG_SPINLOCK_SLEEP noisy if they are called with a spinlock held. 
config FRAME_POINTER - bool "Compile the kernel with frame pointers" + bool + default y if X86_REMOTE_DEBUG + default n if !X86_REMOTE_DEBUG help If you say Y here the resulting kernel image will be slightly larger and slower, but it will give very useful debugging information. diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/Makefile 90-mjb/arch/i386/Makefile --- 00-virgin/arch/i386/Makefile Fri Jan 17 09:18:19 2003 +++ 90-mjb/arch/i386/Makefile Wed Feb 5 22:23:00 2003 @@ -89,6 +89,7 @@ drivers-$(CONFIG_OPROFILE) += arch/i386 CFLAGS += $(mflags-y) AFLAGS += $(mflags-y) +AFLAGS_vmlinux.lds.o += -imacros $(TOPDIR)/include/asm-i386/page.h boot := arch/i386/boot diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/Makefile 90-mjb/arch/i386/kernel/Makefile --- 00-virgin/arch/i386/kernel/Makefile Thu Jan 2 22:04:58 2003 +++ 90-mjb/arch/i386/kernel/Makefile Thu Feb 6 19:49:40 2003 @@ -17,6 +17,7 @@ obj-$(CONFIG_MCA) += mca.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_X86_REMOTE_DEBUG) += gdbstub.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_ACPI) += acpi.o obj-$(CONFIG_ACPI_SLEEP) += acpi_wakeup.o @@ -31,6 +32,17 @@ obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_EDD) += edd.o obj-$(CONFIG_MODULES) += module.o obj-y += sysenter.o +ifdef CONFIG_NUMA +obj-$(CONFIG_X86_SUMMIT) += srat.o +endif + +ifdef CONFIG_X86_REMOTE_DEBUG +GDBSTART=gdbstart +GDBCLEAN= -rm -f gdbstart /sbin/gdbstart +else +GDBSTART= +GDBCLEAN= +endif EXTRA_AFLAGS := -traditional diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/apm.c 90-mjb/arch/i386/kernel/apm.c --- 00-virgin/arch/i386/kernel/apm.c Thu Jan 9 19:15:56 2003 +++ 90-mjb/arch/i386/kernel/apm.c Thu Feb 6 19:49:44 2003 @@ -227,7 +227,7 @@ #include -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern spinlock_t i8253_lock; extern unsigned long get_cmos_time(void); extern void machine_real_restart(unsigned char *, 
int); @@ -1264,7 +1264,7 @@ static int suspend(int vetoable) printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n"); } /* serialize with the timer interrupt */ - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* protect against access to timer chip registers */ spin_lock(&i8253_lock); @@ -1276,7 +1276,7 @@ static int suspend(int vetoable) ignore_normal_resume = 1; spin_unlock(&i8253_lock); - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); if (err == APM_NO_ERROR) err = APM_SUCCESS; @@ -1301,10 +1301,10 @@ static void standby(void) int err; /* serialize with the timer interrupt */ - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* If needed, notify drivers here */ get_time_diff(); - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); err = set_system_power_state(APM_STATE_STANDBY); if ((err != APM_SUCCESS) && (err != APM_NO_ERROR)) @@ -1393,9 +1393,9 @@ static void check_events(void) ignore_bounce = 1; if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); set_time(); - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); pm_send_all(PM_RESUME, (void *)0); queue_event(event, NULL); } @@ -1410,9 +1410,9 @@ static void check_events(void) break; case APM_UPDATE_TIME: - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); set_time(); - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); break; case APM_CRITICAL_SUSPEND: diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/entry.S 90-mjb/arch/i386/kernel/entry.S --- 00-virgin/arch/i386/kernel/entry.S Fri Jan 17 09:18:20 2003 +++ 90-mjb/arch/i386/kernel/entry.S Wed Feb 5 22:23:05 2003 @@ -218,7 +218,7 @@ need_resched: jz restore_all movl $PREEMPT_ACTIVE,TI_PRE_COUNT(%ebp) sti - call schedule + call user_schedule movl $0,TI_PRE_COUNT(%ebp) cli jmp need_resched @@ -300,7 +300,7 @@ work_pending: testb 
$_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: - call schedule + call user_schedule cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret @@ -503,6 +503,31 @@ ENTRY(coprocessor_segment_overrun) ENTRY(double_fault) pushl $do_double_fault jmp error_code + +#ifdef CONFIG_KGDB_THREAD +ENTRY(kern_schedule) + pushl %ebp + movl %esp, %ebp + pushl %ss + pushl %ebp + pushfl + pushl %cs + pushl 4(%ebp) + pushl %eax + pushl %es + pushl %ds + pushl %eax + pushl (%ebp) + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + call kern_do_schedule + movl %ebp, %esp + pop %ebp + ret +#endif ENTRY(invalid_TSS) pushl $do_invalid_TSS diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/gdbstart.c 90-mjb/arch/i386/kernel/gdbstart.c --- 00-virgin/arch/i386/kernel/gdbstart.c Wed Dec 31 16:00:00 1969 +++ 90-mjb/arch/i386/kernel/gdbstart.c Wed Feb 5 22:23:05 2003 @@ -0,0 +1,147 @@ +/* + * This program opens a tty file and issues the GDB stub activating + * ioctl on it. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +char *tty_name = "/dev/ttyS0" ; /* COM1 port */ +int speed = 9600 ; /* default speed */ +struct termios save_ts ; /* original term struct */ + +void print_usage(void) +{ + printf("gdbstub [-s speed] [-t tty-dev]\n") ; + printf(" defaults: /dev/ttyS0 with speed unmodified by this program\n"); + +} /* print_usage */ + +void tty_err(char *msg) +{ + char buf[100] ; + + strcpy(buf, msg) ; + strcat(buf, ": ") ; + strcat(buf, tty_name) ; + perror(buf) ; + exit(1) ; + +} /* tty_err */ + + +void setup_term(int fd) +{ + struct termios ts ; + int speed_code ; + + if (tcgetattr(fd, &ts) < 0) tty_err("tcgetattr") ; + + save_ts = ts ; + switch (speed) + { + case 4800: + speed_code = B4800 ; + break ; + case 9600: + speed_code = B9600 ; + break ; + case 19200: + speed_code = B19200 ; + break ; + case 38400: + speed_code = B38400 ; + break ; + case 57600: + speed_code = B57600 ; + break ; + case 115200: + speed_code = B115200 ; + break ; + case 230400: + speed_code = B230400 ; + break ; + default: + printf("Invalid speed: %d\n", speed) ; + exit(1) ; + } + + ts.c_cflag = CS8 | CREAD | CLOCAL ; + if (cfsetospeed(&ts, speed_code) < 0) tty_err("cfsetospeed") ; + if (cfsetispeed(&ts, speed_code) < 0) tty_err("cfsetispeed") ; + + if (tcsetattr(fd, TCSANOW, &ts) < 0) tty_err("tcsetattr") ; + +} /* setup_term */ + +int main(int argc, char **argv) +{ + int opt ; + int fil ; + int rslt ; + + while ((opt = getopt(argc, argv, "hs:t:")) > 0) + { + switch (opt) + { + case 's': + speed = atol(optarg) ; + break ; + case 't': + tty_name = optarg ; + break ; + case ':': + printf("Invalid option\n") ; + break ; + case '?': + case 'h': + default: + print_usage() ; + return 1; + } + } + + fil = open(tty_name, O_RDWR) ; + if (fil < 0) + { + perror(tty_name) ; + return 1; + } + + + setup_term(fil) ; + + /* + * When we issue this ioctl, control will not return until + * the debugger running on 
the remote host machine says "go". + */ + printf("\nAbout to activate GDB stub in the kernel on %s\n", tty_name) ; + printf("Hit CR to continue, kill program to abort -- ") ; + getchar() ; + sync() ; + rslt = ioctl(fil, TIOCGDB, 0) ; + if (rslt < 0) + { + perror("TIOCGDB ioctl") ; + return 1; + } + + printf("\nGDB stub successfully activated\n") ; + + for (;;) + { + pause() ; + } + + if (tcsetattr(fil, TCSANOW, &save_ts) < 0) tty_err("tcsetattr") ; + + exit(0); +} /* main */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/gdbstub.c 90-mjb/arch/i386/kernel/gdbstub.c --- 00-virgin/arch/i386/kernel/gdbstub.c Wed Dec 31 16:00:00 1969 +++ 90-mjb/arch/i386/kernel/gdbstub.c Wed Feb 5 22:23:05 2003 @@ -0,0 +1,1208 @@ +/* + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +/* + * Copyright (C) 2000-2001 VERITAS Software Corporation. + */ +/**************************************************************************** + * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ + * + * Module name: remcom.c $ + * Revision: 1.34 $ + * Date: 91/03/09 12:29:49 $ + * Contributor: Lake Stevens Instrument Division$ + * + * Description: low level support for gdb debugger. $ + * + * Considerations: only works on target hardware $ + * + * Written by: Glenn Engel $ + * Updated by: Amit Kale + * ModuleState: Experimental $ + * + * NOTES: See Below $ + * + * Modified for 386 by Jim Kingdon, Cygnus Support. 
+ * Original kgdb, compatibility with 2.1.xx kernel by David Grothe + * Integrated into 2.2.5 kernel by Tigran Aivazian + * thread support, + * support for multiple processors, + * support for ia-32(x86) hardware debugging, + * Console support, + * handling nmi watchdog + * Amit S. Kale ( akale@veritas.com ) + * + * + * To enable debugger support, two things need to happen. One, a + * call to set_debug_traps() is necessary in order to allow any breakpoints + * or error conditions to be properly intercepted and reported to gdb. + * Two, a breakpoint needs to be generated to begin communication. This + * is most easily accomplished by a call to breakpoint(). Breakpoint() + * simulates a breakpoint by executing an int 3. + * + ************* + * + * The following gdb commands are supported: + * + * command function Return value + * + * g return the value of the CPU registers hex data or ENN + * G set the value of the CPU registers OK or ENN + * + * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN + * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN + * + * c Resume at current address SNN ( signal NN) + * cAA..AA Continue at address AA..AA SNN + * + * s Step one instruction SNN + * sAA..AA Step one instruction from AA..AA SNN + * + * k kill + * + * ? What was the last sigval ? SNN (signal NN) + * + * All commands and responses are sent with a packet which includes a + * checksum. A packet consists of + * + * $#. + * + * where + * :: + * :: < two hex digits computed as modulo 256 sum of > + * + * When a packet is received, it is first acknowledged with either '+' or '-'. + * '+' indicates a successful transfer. '-' indicates a failed transfer.
+ * + * Example: + * + * Host: Reply: + * $m0,10#2a +$00010203040506070809101112131415#42 + * + ****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for linux pt_regs struct */ +#include +#ifdef CONFIG_GDB_CONSOLE +#include +#endif +#include + +/************************************************************************ + * + * external low-level support routines + */ +typedef void (*Function) (void); /* pointer to a function */ + +/* Thread reference */ +typedef unsigned char threadref[8]; + +extern int putDebugChar(int); /* write a single character */ +extern int getDebugChar(void); /* read and return a single char */ + +extern int pid_max; + +/************************************************************************/ +/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ +/* at least NUMREGBYTES*2 are needed for register packets */ +/* Longer buffer is needed to list all threads */ +#define BUFMAX 1024 + +static char initialized; /* boolean flag. != 0 means we've been initialized */ + +static const char hexchars[] = "0123456789abcdef"; + +/* Number of bytes of registers. */ +#define NUMREGBYTES 64 +/* + * Note that this register image is in a different order than + * the register image that Linux produces at interrupt time. + * + * Linux's register image is defined by struct pt_regs in ptrace.h. + * Just why GDB uses a different order is a historical mystery. 
+ */ +enum regnames { _EAX, /* 0 */ + _ECX, /* 1 */ + _EDX, /* 2 */ + _EBX, /* 3 */ + _ESP, /* 4 */ + _EBP, /* 5 */ + _ESI, /* 6 */ + _EDI, /* 7 */ + _PC /* 8 also known as eip */ , + _PS /* 9 also known as eflags */ , + _CS, /* 10 */ + _SS, /* 11 */ + _DS, /* 12 */ + _ES, /* 13 */ + _FS, /* 14 */ + _GS +}; /* 15 */ + +/*************************** ASSEMBLY CODE MACROS *************************/ +/* */ + +#define BREAKPOINT() asm(" int $3"); + +/* Put the error code here just in case the user cares. */ +int gdb_i386errcode; +/* Likewise, the vector number here (since GDB only gets the signal + number through the usual means, and that's not very specific). */ +int gdb_i386vector = -1; + +static spinlock_t slavecpulocks[KGDB_MAX_NO_CPUS]; +volatile int procindebug[KGDB_MAX_NO_CPUS]; + +#ifdef CONFIG_SMP +spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +spinlock_t kgdb_nmispinlock = SPIN_LOCK_UNLOCKED; +#else +unsigned kgdb_spinlock = 0; +unsigned kgdb_nmispinlock = 0; +#endif + +static void +kgdb_usercode(void) +{ +} + +int +hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return (ch - 'a' + 10); + if ((ch >= '0') && (ch <= '9')) + return (ch - '0'); + if ((ch >= 'A') && (ch <= 'F')) + return (ch - 'A' + 10); + return (-1); +} + +/* scan for the sequence $# */ +void +getpacket(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int i; + int count; + char ch; + + do { + /* wait around for the start character, ignore all other characters */ + while ((ch = (getDebugChar() & 0x7f)) != '$') ; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* now, read until a # or end of buffer is found */ + while (count < BUFMAX) { + ch = getDebugChar() & 0x7f; + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(getDebugChar() & 0x7f) << 4; + xmitcsum += hex(getDebugChar() & 0x7f); + + if (checksum != xmitcsum) + putDebugChar('-'); /* failed checksum */ + 
else { + putDebugChar('+'); /* successful transfer */ + /* if a sequence char is present, reply the sequence ID */ + if (buffer[2] == ':') { + putDebugChar(buffer[0]); + putDebugChar(buffer[1]); + /* remove sequence chars from buffer */ + count = strlen(buffer); + for (i = 3; i <= count; i++) + buffer[i - 3] = buffer[i]; + } + } + } + } while (checksum != xmitcsum); + +} + +/* send the packet in buffer. */ + +void +putpacket(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* $#. */ + do { + putDebugChar('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + if (!putDebugChar(ch)) + return; + checksum += ch; + count += 1; + } + + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + + } while ((getDebugChar() & 0x7f) != '+'); + +} + +static char remcomInBuffer[BUFMAX]; +static char remcomOutBuffer[BUFMAX]; +static short error; + +static void +regs_to_gdb_regs(int *gdb_regs, struct pt_regs *regs) +{ + gdb_regs[_EAX] = regs->eax; + gdb_regs[_EBX] = regs->ebx; + gdb_regs[_ECX] = regs->ecx; + gdb_regs[_EDX] = regs->edx; + gdb_regs[_ESI] = regs->esi; + gdb_regs[_EDI] = regs->edi; + gdb_regs[_EBP] = regs->ebp; + gdb_regs[_DS] = regs->xds; + gdb_regs[_ES] = regs->xes; + gdb_regs[_PS] = regs->eflags; + gdb_regs[_CS] = regs->xcs; + gdb_regs[_PC] = regs->eip; + gdb_regs[_ESP] = (int) (®s->esp); + gdb_regs[_SS] = __KERNEL_DS; + gdb_regs[_FS] = 0xFFFF; + gdb_regs[_GS] = 0xFFFF; +} /* regs_to_gdb_regs */ + +static void +gdb_regs_to_regs(int *gdb_regs, struct pt_regs *regs) +{ + regs->eax = gdb_regs[_EAX]; + regs->ebx = gdb_regs[_EBX]; + regs->ecx = gdb_regs[_ECX]; + regs->edx = gdb_regs[_EDX]; + regs->esi = gdb_regs[_ESI]; + regs->edi = gdb_regs[_EDI]; + regs->ebp = gdb_regs[_EBP]; + regs->xds = gdb_regs[_DS]; + regs->xes = gdb_regs[_ES]; + regs->eflags = gdb_regs[_PS]; + regs->xcs = gdb_regs[_CS]; + regs->eip = gdb_regs[_PC]; +#if 0 /* can't change these */ + regs->esp = gdb_regs[_ESP]; + 
regs->xss = gdb_regs[_SS]; + regs->fs = gdb_regs[_FS]; + regs->gs = gdb_regs[_GS]; +#endif + +} /* gdb_regs_to_regs */ + +/* Indicate to caller of mem2hex or hex2mem that there has been an + error. */ +static volatile int kgdb_memerr = 0; +volatile int kgdb_memerr_expected = 0; +static volatile int kgdb_memerr_cnt = 0; +static int garbage_loc = -1; + +int +get_char(char *addr) +{ + return *addr; +} + +void +set_char(char *addr, int val) +{ + *addr = val; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* return a pointer to the last char put in buf (null) */ +/* If MAY_FAULT is non-zero, then we should set kgdb_memerr in response to + a fault; if zero treat a fault like any other fault in the stub. */ +char * +mem2hex(char *mem, char *buf, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + } + for (i = 0; i < count; i++) { + + ch = get_char(mem++); + + if (may_fault && kgdb_memerr) { + *buf = 0; /* truncate buffer */ + return (buf); + } + *buf++ = hexchars[ch >> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (may_fault) + kgdb_memerr_expected = 0; + return (buf); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return a pointer to the character AFTER the last byte written */ +char * +hex2mem(char *buf, char *mem, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + } + for (i = 0; i < count; i++) { + ch = hex(*buf++) << 4; + ch = ch + hex(*buf++); + set_char(mem++, ch); + + if (may_fault && kgdb_memerr) { + return (mem); + } + } + if (may_fault) + kgdb_memerr_expected = 0; + return (mem); +} + +/**********************************************/ +/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ +/* RETURN NUMBER OF CHARS PROCESSED */ +/**********************************************/ +int +hexToInt(char **ptr, int *intValue) +{ + int 
numChars = 0; + int hexValue; + + *intValue = 0; + + while (**ptr) { + hexValue = hex(**ptr); + if (hexValue >= 0) { + *intValue = (*intValue << 4) | hexValue; + numChars++; + } else + break; + + (*ptr)++; + } + + return (numChars); +} + +#ifdef CONFIG_KGDB_THREAD +static int +stubhex(int ch) +{ + if (ch >= 'a' && ch <= 'f') + return ch - 'a' + 10; + if (ch >= '0' && ch <= '9') + return ch - '0'; + if (ch >= 'A' && ch <= 'F') + return ch - 'A' + 10; + return -1; +} + +static int +stub_unpack_int(char *buff, int fieldlength) +{ + int nibble; + int retval = 0; + + while (fieldlength) { + nibble = stubhex(*buff++); + retval |= nibble; + fieldlength--; + if (fieldlength) + retval = retval << 4; + } + return retval; +} +#endif + +static char * +pack_hex_byte(char *pkt, int byte) +{ + *pkt++ = hexchars[(byte >> 4) & 0xf]; + *pkt++ = hexchars[(byte & 0xf)]; + return pkt; +} + +#define BUF_THREAD_ID_SIZE 16 + +#ifdef CONFIG_KGDB_THREAD +static char * +pack_threadid(char *pkt, threadref * id) +{ + char *limit; + unsigned char *altid; + + altid = (unsigned char *) id; + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *altid++); + return pkt; +} + +static char * +unpack_byte(char *buf, int *value) +{ + *value = stub_unpack_int(buf, 2); + return buf + 2; +} + +static char * +unpack_threadid(char *inbuf, threadref * id) +{ + char *altref; + char *limit = inbuf + BUF_THREAD_ID_SIZE; + int x, y; + + altref = (char *) id; + + while (inbuf < limit) { + x = stubhex(*inbuf++); + y = stubhex(*inbuf++); + *altref++ = (x << 4) | y; + } + return inbuf; +} +#endif + +void +int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; + } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} + +#ifdef CONFIG_KGDB_THREAD +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + 
unsigned char *scan; + + scan = (char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} + +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } +#if 0 + thread = init_tasks[0]; + do { + if (thread->pid == pid) { + return thread; + } + thread = thread->next_task; + } while (thread != init_tasks[0]); +#endif + return NULL; +} +#endif + +struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned addr; +} breakinfo[4] = { { +enabled:0}, { +enabled:0}, { +enabled:0}, { +enabled:0}}; + +void +correct_hw_break(void) +{ + int breakno; + int correctit; + int breakbit; + unsigned dr7; + + asm volatile ("movl %%db7, %0\n":"=r" (dr7) + :); + do { + unsigned addr0, addr1, addr2, addr3; + asm volatile ("movl %%db0, %0\n" + "movl %%db1, %1\n" + "movl %%db2, %2\n" + "movl %%db3, %3\n":"=r" (addr0), "=r"(addr1), + "=r"(addr2), "=r"(addr3):); + } while (0); + correctit = 0; + for (breakno = 0; breakno < 3; breakno++) { + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + dr7 |= (((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << 16) << + (breakno << 2); + switch (breakno) { + case 0: + asm volatile ("movl %0, %%dr0\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 1: + asm volatile ("movl %0, %%dr1\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 2: + asm volatile ("movl %0, %%dr2\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 3: + asm volatile ("movl %0, %%dr3\n"::"r" + (breakinfo[breakno].addr)); + break; + } + } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + } + } + if (correctit) { + asm volatile ("movl %0, %%db7\n"::"r" (dr7)); + } +} + +int 
+remove_hw_break(unsigned breakno) +{ + if (!breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 0; + return 0; +} + +int +set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr) +{ + if (breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 1; + breakinfo[breakno].type = type; + breakinfo[breakno].len = len; + breakinfo[breakno].addr = addr; + return 0; +} + +void +gdb_wait(void *arg) +{ + unsigned flags; + int processor; + + local_irq_save(flags); + processor = smp_processor_id(); + procindebug[processor] = 1; + current->thread.kgdbregs = arg; + spin_lock(slavecpulocks + processor); + correct_hw_break(); + procindebug[processor] = 0; + local_irq_restore(flags); +} + +void +printexceptioninfo(int exceptionNo, int errorcode, char *buffer) +{ + unsigned dr6; + int i; + switch (exceptionNo) { + case 1: /* debug exception */ + break; + case 3: /* breakpoint */ + sprintf(buffer, "Software breakpoint"); + return; + default: + sprintf(buffer, "Details not available"); + return; + } + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (dr6 & 0x4000) { + sprintf(buffer, "Single step"); + return; + } + for (i = 0; i < 4; ++i) { + if (dr6 & (1 << i)) { + sprintf(buffer, "Hardware breakpoint %d", i); + return; + } + } + sprintf(buffer, "Unknown trap"); + return; +} + +/* + * This function does all command processing for interfacing to gdb. + * + * NOTE: The INT nn instruction leaves the state of the interrupt + * enable flag UNCHANGED. That means that when this routine + * is entered via a breakpoint (INT 3) instruction from code + * that has interrupts enabled, then interrupts will STILL BE + * enabled when this routine is entered. The first thing that + * we do here is disable interrupts so as to prevent recursive + * entries and bothersome serial interrupts while we are + * trying to run the serial port in polled mode.
+ * + * For kernel version 2.1.xx the cli() actually gets a spin lock so + * it is always necessary to do a restore_flags before returning + * so as to let go of that lock. + */ +int +handle_exception(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs) +{ + struct task_struct *usethread = NULL; + int addr, length; + int breakno, breaktype; + char *ptr; + int newPC; + unsigned long flags = ~0UL; + int gdb_regs[NUMREGBYTES / 4]; + int i; + int dr6; + int reboot = 0; +#ifdef CONFIG_KGDB_THREAD + int nothreads; + int maxthreads; + int threadid; + threadref thref; + struct task_struct *thread = NULL; +#endif +#define regs (*linux_regs) + + /* + * If the entry is not from the kernel then return to the Linux + * trap handler and let it process the interrupt normally. + */ + if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->xcs)) { + return (0); + } + + if (kgdb_memerr_expected) { + /* + * This fault occurred because of the get_char or set_char + * routines. These two routines use either eax or edx to + * indirectly reference the location in memory that they + * are working with. For a page fault, when we return + * the instruction will be retried, so we have to make + * sure that these registers point to valid memory.
+ */ + kgdb_memerr = 1; /* set mem error flag */ + kgdb_memerr_expected = 0; + kgdb_memerr_cnt++; /* helps in debugging */ + regs.eax = (long) &garbage_loc; /* make valid address */ + regs.edx = (long) &garbage_loc; /* make valid address */ + return (0); + } +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_nmispinlock)) +#else + if (!kgdb_nmispinlock) +#endif + { + + /* Get kgdb spinlock */ +#ifdef CONFIG_SMP + _raw_spin_lock(&kgdb_spinlock); +#else + kgdb_spinlock = 1; +#endif + + local_irq_save(flags); + + /* Disable hardware debugging while we are in kgdb */ + __asm__("movl %0,%%db7": /* no output */ + :"r"(0)); + + for (i = 0; i < NR_CPUS; i++) { + spin_lock_init(&slavecpulocks[i]); + _raw_spin_lock(&slavecpulocks[i]); + } + + if (num_online_cpus() > 1) { + /* Force other cpus in debugger */ + if (smp_call_function(gdb_wait, NULL, 0, 99) != 0) { + return (1); + } + } + + procindebug[smp_processor_id()] = 1; + } + + gdb_i386vector = exceptionVector; + gdb_i386errcode = err_code; + + /* reply to host that an exception has occurred */ + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + + putpacket(remcomOutBuffer); + + while (1 == 1) { + error = 0; + remcomOutBuffer[0] = 0; + getpacket(remcomInBuffer); + switch (remcomInBuffer[0]) { + case '?': + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + break; + case 'g': /* return the value of the CPU registers */ + if (!usethread || usethread == current) { + regs_to_gdb_regs(gdb_regs, ®s); + } else { + memset(gdb_regs, 0, NUMREGBYTES); + if (usethread->thread.kgdbregs) { + kgdb_memerr_expected = 1; + kgdb_memerr = 0; + get_char((char *) usethread->thread. + kgdbregs); + kgdb_memerr_expected = 0; + if (kgdb_memerr) { + gdb_regs[_PC] = + (int) kgdb_usercode; + } else { + regs_to_gdb_regs(gdb_regs, + usethread-> + thread. 
+ kgdbregs); + } + } else { + gdb_regs[_PC] = (int) kgdb_usercode; + } + } + mem2hex((char *) gdb_regs, remcomOutBuffer, NUMREGBYTES, + 0); + break; + case 'G': /* set the value of the CPU registers - return OK */ + hex2mem(&remcomInBuffer[1], (char *) gdb_regs, + NUMREGBYTES, 0); + if (!usethread || usethread == current) { + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "E00"); + } + break; + + /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + case 'm': + /* TRY TO READ %x,%x. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) + if (*(ptr++) == ',') + if (hexToInt(&ptr, &length)) { + ptr = 0; + mem2hex((char *) addr, + remcomOutBuffer, length, + 1); + if (kgdb_memerr) { + strcpy(remcomOutBuffer, + "E03"); + } + } + + if (ptr) { + strcpy(remcomOutBuffer, "E01"); + } + break; + + /* MAA..AA,LLLL: Write LLLL bytes at address AA.AA return OK */ + case 'M': + /* TRY TO READ '%x,%x:'. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) + if (*(ptr++) == ',') + if (hexToInt(&ptr, &length)) + if (*(ptr++) == ':') { + hex2mem(ptr, + (char *) addr, + length, 1); + + if (kgdb_memerr) { + strcpy + (remcomOutBuffer, + "E03"); + } else { + strcpy + (remcomOutBuffer, + "OK"); + } + + ptr = 0; + } + if (ptr) { + strcpy(remcomOutBuffer, "E02"); + } + break; + + /* cAA..AA Continue at address AA..AA(optional) */ + /* sAA..AA Step one instruction from AA..AA(optional) */ + case 'c': + case 's': +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_nmispinlock)) +#else + if (kgdb_nmispinlock) +#endif + { + strcpy(remcomOutBuffer, "E01"); + break; + } + + /* try to read optional parameter, pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) { + regs.eip = addr; + } + + newPC = regs.eip; + + /* clear the trace bit */ + regs.eflags &= 0xfffffeff; + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') + regs.eflags |= 
0x100; + + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (!(dr6 & 0x4000)) { + for (breakno = 0; breakno < 4; ++breakno) { + if (dr6 & (1 << breakno)) { + if (breakinfo[breakno].type == + 0) { + /* Set restore flag */ + regs.eflags |= 0x10000; + break; + } + } + } + } + correct_hw_break(); + asm volatile ("movl %0, %%db6\n"::"r" (0)); + for (i = 0; i < NR_CPUS; i++) { + _raw_spin_unlock(&slavecpulocks[i]); + } + + procindebug[smp_processor_id()] = 0; + /* Release kgdb spinlock */ +#ifdef CONFIG_SMP + _raw_spin_unlock(&kgdb_spinlock); +#else + kgdb_spinlock = 0; +#endif + if (flags != ~0UL) + local_irq_restore(flags); + return (0); + + /* kill the program */ + case 'k': + break; + + /* query */ + case 'q': + switch (remcomInBuffer[1]) { +#ifdef CONFIG_KGDB_THREAD + case 'L': + /* List threads */ + unpack_byte(remcomInBuffer + 3, &maxthreads); + unpack_threadid(remcomInBuffer + 5, &thref); + + remcomOutBuffer[0] = 'q'; + remcomOutBuffer[1] = 'M'; + remcomOutBuffer[4] = '0'; + pack_threadid(remcomOutBuffer + 5, &thref); + + threadid = threadref_to_int(&thref); + for (nothreads = 0; + nothreads < maxthreads + && threadid < pid_max; threadid++) { + thread = getthread(threadid); + if (thread) { + int_to_threadref(&thref, + threadid); + pack_threadid(remcomOutBuffer + + 21 + + nothreads * 16, + &thref); + nothreads++; + } + } + if (threadid == pid_max) { + remcomOutBuffer[4] = '1'; + } + pack_hex_byte(remcomOutBuffer + 2, nothreads); + remcomOutBuffer[21 + nothreads * 16] = '\0'; + break; + + case 'C': + /* Current thread id */ + remcomOutBuffer[0] = 'Q'; + remcomOutBuffer[1] = 'C'; + threadid = current->pid; + int_to_threadref(&thref, threadid); + pack_threadid(remcomOutBuffer + 2, &thref); + remcomOutBuffer[18] = '\0'; + break; +#endif + + case 'E': + /* Print exception info */ + printexceptioninfo(exceptionVector, err_code, + remcomOutBuffer); + break; + } + break; + +#ifdef CONFIG_KGDB_THREAD + /* task related */ + case 'H': + switch (remcomInBuffer[1]) { + 
case 'g': + ptr = &remcomInBuffer[2]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (!thread) { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + break; + } + usethread = thread; + /* follow through */ + case 'c': + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + break; + } + break; + + /* Query thread status */ + case 'T': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (thread) { + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + } else { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + } + break; +#endif + + case 'r': + reboot = 1; + strcpy(remcomOutBuffer, "OK"); + break; + case 'Y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + ptr++; + hexToInt(&ptr, &breaktype); + ptr++; + hexToInt(&ptr, &length); + ptr++; + hexToInt(&ptr, &addr); + if (set_hw_break + (breakno & 0x3, breaktype & 0x3, length & 0x3, addr) + == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + /* Remove hardware breakpoint */ + case 'y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + if (remove_hw_break(breakno & 0x3) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + } /* switch */ + + /* reply to the request */ + putpacket(remcomOutBuffer); + if (reboot == 1) { + static long no_idt[2]; + __asm__ __volatile__("lidt %0"::"m"(no_idt)); + __asm__ __volatile__("int3"); + } + } +} + +/* this function is used to set up exception handlers for tracing and + breakpoints */ +void +set_debug_traps(void) +{ + /* + * linux_debug_hook is defined in traps.c. We store a pointer + * to our own exception handler into it. + */ + linux_debug_hook = handle_exception; + + /* + * In case GDB is started before us, ack any packets (presumably + * "$?#xx") sitting there. 
*/ + putDebugChar('+'); + + initialized = 1; +} + +/* This function will generate a breakpoint exception. It is used at the + beginning of a program to sync up with a debugger and can be used + otherwise as a quick means to stop program execution and "break" into + the debugger. */ + +void +breakpoint(void) +{ + if (initialized) + BREAKPOINT(); +} + +#ifdef CONFIG_GDB_CONSOLE +char gdbconbuf[BUFMAX]; + +void +gdb_console_write(struct console *co, const char *s, unsigned count) +{ + int i; + int wcount; + char *bufptr; + + if (!gdb_initialized) { + return; + } + gdbconbuf[0] = 'O'; + bufptr = gdbconbuf + 1; + while (count > 0) { + if ((count << 1) > (BUFMAX - 2)) { + wcount = (BUFMAX - 2) >> 1; + } else { + wcount = count; + } + count -= wcount; + for (i = 0; i < wcount; i++) { + bufptr = pack_hex_byte(bufptr, s[i]); + } + *bufptr = '\0'; + s += wcount; + + putpacket(gdbconbuf); + + } +} +#endif +static int __init +kgdb_opt_gdb(char *dummy) +{ + gdb_enter = 1; + return 1; +} +static int __init +kgdb_opt_gdbttyS(char *str) +{ + gdb_ttyS = simple_strtoul(str, NULL, 10); + return 1; +} +static int __init +kgdb_opt_gdbbaud(char *str) +{ + gdb_baud = simple_strtoul(str, NULL, 10); + return 1; +} + +/* + * Sequence of these lines has to be maintained because gdb option is a prefix + * of the other two options + */ + +__setup("gdbttyS=", kgdb_opt_gdbttyS); +__setup("gdbbaud=", kgdb_opt_gdbbaud); +__setup("gdb", kgdb_opt_gdb); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/i386_ksyms.c 90-mjb/arch/i386/kernel/i386_ksyms.c --- 00-virgin/arch/i386/kernel/i386_ksyms.c Mon Jan 13 21:09:20 2003 +++ 90-mjb/arch/i386/kernel/i386_ksyms.c Thu Feb 6 19:49:50 2003 @@ -67,7 +67,6 @@ EXPORT_SYMBOL(EISA_bus); EXPORT_SYMBOL(MCA_bus); #ifdef CONFIG_DISCONTIGMEM EXPORT_SYMBOL(node_data); -EXPORT_SYMBOL(pfn_to_nid); #endif #ifdef CONFIG_X86_NUMAQ EXPORT_SYMBOL(xquad_portio); @@ -144,6 +143,20 @@ EXPORT_SYMBOL(mmx_copy_page); #ifdef CONFIG_X86_HT 
EXPORT_SYMBOL(smp_num_siblings); EXPORT_SYMBOL(cpu_sibling_map); +#endif + +#ifdef CONFIG_X86_REMOTE_DEBUG +void __this_fixmap_does_not_exist(void) +{ + BUG(); +} +EXPORT_SYMBOL(__this_fixmap_does_not_exist); + +void __br_lock_usage_bug(void) +{ + BUG(); +} +EXPORT_SYMBOL(__br_lock_usage_bug); #endif #ifdef CONFIG_SMP diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/io_apic.c 90-mjb/arch/i386/kernel/io_apic.c --- 00-virgin/arch/i386/kernel/io_apic.c Fri Jan 17 09:18:20 2003 +++ 90-mjb/arch/i386/kernel/io_apic.c Thu Feb 6 19:49:46 2003 @@ -116,40 +116,84 @@ static void __init replace_pin_at_irq(un } } -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_pin_list *entry = irq_2_pin + irq; \ - \ - for (;;) { \ - unsigned int reg; \ - pin = entry->pin; \ - if (pin == -1) \ - break; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ - if (!entry->next) \ - break; \ - entry = irq_2_pin + entry->next; \ - } \ - FINAL; \ -} - -#define DO_ACTION(name,R,ACTION, FINAL) \ - \ - static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION, FINAL) - -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ -DO_ACTION( __mask_and_edge, 0, = (reg & 0xffff7fff) | 0x00010000, ) - /* mask = 1, trigger = 0 */ -DO_ACTION( __unmask_and_level, 0, = (reg & 0xfffeffff) | 0x00008000, ) - /* mask = 0, trigger = 1 */ +/* mask = 1 */ +static void __mask_IO_APIC_irq (unsigned int irq) +{ + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + for (;;) { + unsigned int reg; + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + io_apic_modify(entry->apic, 0x10 + pin*2, reg |= 0x00010000); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + io_apic_sync(entry->apic); +} + +/* mask = 0 */ +static void 
__unmask_IO_APIC_irq (unsigned int irq) +{ + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + for (;;) { + unsigned int reg; + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + io_apic_modify(entry->apic, 0x10 + pin*2, reg &= 0xfffeffff); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +/* mask = 1, trigger = 0 */ +static void __mask_and_edge_IO_APIC_irq (unsigned int irq) +{ + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + for (;;) { + unsigned int reg; + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg = (reg & 0xffff7fff) | 0x00010000; + io_apic_modify(entry->apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +/* mask = 0, trigger = 1 */ +static void __unmask_and_level_IO_APIC_irq (unsigned int irq) +{ + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + for (;;) { + unsigned int reg; + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg = (reg & 0xfffeffff) | 0x00008000; + io_apic_modify(entry->apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} static void mask_IO_APIC_irq (unsigned int irq) { @@ -197,13 +241,23 @@ static void clear_IO_APIC (void) static void set_ioapic_affinity (unsigned int irq, unsigned long mask) { unsigned long flags; + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; /* * Only the first 8 bits are valid. 
*/ mask = mask << 24; spin_lock_irqsave(&ioapic_lock, flags); - __DO_ACTION(1, = mask, ) + for (;;) { + pin = entry->pin; + if (pin == -1) + break; + io_apic_write(entry->apic, 0x10 + 1 + pin*2, mask); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1131,7 +1185,7 @@ void disable_IO_APIC(void) * * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 */ - +#ifndef CONFIG_X86_NUMAQ static void __init setup_ioapic_ids_from_mpc (void) { struct IO_APIC_reg_00 reg_00; @@ -1225,6 +1279,9 @@ static void __init setup_ioapic_ids_from printk(" ok.\n"); } } +#else /* !CONFIG_X86_NUMAQ */ +static void __init setup_ioapic_ids_from_mpc(void) { } +#endif /* CONFIG_X86_NUMAQ */ /* * There is a nasty bug in some older SMP boards, their mptable lies diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/mpparse.c 90-mjb/arch/i386/kernel/mpparse.c --- 00-virgin/arch/i386/kernel/mpparse.c Fri Jan 17 09:18:20 2003 +++ 90-mjb/arch/i386/kernel/mpparse.c Wed Feb 5 22:23:03 2003 @@ -110,7 +110,7 @@ void __init MP_processor_info (struct mp if (!(m->mpc_cpuflag & CPU_ENABLED)) return; - apicid = mpc_apic_id(m, translation_table[mpc_record]->trans_quad); + apicid = mpc_apic_id(m, translation_table[mpc_record]); if (m->mpc_featureflag&(1<<0)) Dprintk(" Floating point unit present.\n"); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/nmi.c 90-mjb/arch/i386/kernel/nmi.c --- 00-virgin/arch/i386/kernel/nmi.c Thu Jan 2 22:04:58 2003 +++ 90-mjb/arch/i386/kernel/nmi.c Thu Feb 6 19:49:47 2003 @@ -20,11 +20,26 @@ #include #include #include +#include #include #include #include +#ifdef CONFIG_X86_REMOTE_DEBUG +extern gdb_debug_hook * linux_debug_hook; +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ + { \ + if (linux_debug_hook != (gdb_debug_hook *) NULL && !user_mode(regs)) \ + { \ + (*linux_debug_hook)(trapnr, signr, error_code, regs) ; \ + after; \ + } \ + } +#else +#define 
CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + unsigned int nmi_watchdog = NMI_NONE; static unsigned int nmi_hz = HZ; unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ @@ -63,8 +78,6 @@ extern void show_registers(struct pt_reg CRU_ESCR0 (with any non-null event selector) through a complemented max threshold. [IA32-Vol3, Section 14.9.9] */ #define MSR_P4_IQ_COUNTER0 0x30C -#define MSR_P4_IQ_CCCR0 0x36C -#define MSR_P4_CRU_ESCR0 0x3B8 #define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) #define P4_NMI_IQ_CCCR0 \ (P4_CCCR_OVF_PMI|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ @@ -363,12 +376,59 @@ void nmi_watchdog_tick (struct pt_regs * sum = irq_stat[cpu].apic_timer_irqs; if (last_irq_sums[cpu] == sum) { +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_spinlock)) +#else + if (kgdb_spinlock) +#endif + { + /* We are inside kgdb, this isn't a stuck cpu */ + alert_counter[cpu] = 0; + } else { +#ifdef CONFIG_SMP + if (spin_is_locked(&kgdb_nmispinlock)) +#else + if (kgdb_nmispinlock) +#endif + { + if (!procindebug[cpu]) { + procindebug[cpu] = 1; + current->thread.kgdbregs = regs; + while (1) { + /* nothing */ + } + } + return; + } + } +#endif /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... 
*/ alert_counter[cpu]++; if (alert_counter[cpu] == 5*nmi_hz) { +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_SMP + if (spin_trylock(&kgdb_nmispinlock)) +#else + kgdb_nmispinlock = 1; +#endif + { + procindebug[cpu] = 1; + CHK_REMOTE_DEBUG(2,SIGBUS,0,regs,) + } +#ifdef CONFIG_SMP + else { + procindebug[cpu] = 1; + current->thread.kgdbregs = regs; + while (1) { + /* nothing */ + } + } +#endif +#endif spin_lock(&nmi_print_lock); /* * We are in trouble anyway, lets at least try diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/numaq.c 90-mjb/arch/i386/kernel/numaq.c --- 00-virgin/arch/i386/kernel/numaq.c Sun Nov 17 20:29:51 2002 +++ 90-mjb/arch/i386/kernel/numaq.c Thu Feb 6 19:49:50 2003 @@ -27,6 +27,7 @@ #include #include #include +#include #include /* These are needed before the pgdat's are created */ @@ -82,19 +83,7 @@ static void __init smp_dump_qct(void) * physnode_map[8- ] = -1; */ int physnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1}; - -#define PFN_TO_ELEMENT(pfn) (pfn / PAGES_PER_ELEMENT) -#define PA_TO_ELEMENT(pa) (PFN_TO_ELEMENT(pa >> PAGE_SHIFT)) - -int pfn_to_nid(unsigned long pfn) -{ - int nid = physnode_map[PFN_TO_ELEMENT(pfn)]; - - if (nid == -1) - BUG(); /* address is not present */ - - return nid; -} +EXPORT_SYMBOL(physnode_map); /* * for each node mark the regions diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/smp.c 90-mjb/arch/i386/kernel/smp.c --- 00-virgin/arch/i386/kernel/smp.c Fri Jan 17 09:18:20 2003 +++ 90-mjb/arch/i386/kernel/smp.c Wed Feb 5 22:23:05 2003 @@ -516,10 +516,17 @@ int smp_call_function (void (*func) (voi { struct call_data_struct data; int cpus = num_online_cpus()-1; + int count = 0; + int gdb; - if (!cpus) + if (cpus <= 0) return 0; + gdb = 0; + if (wait == 99) { + wait = 0; + gdb = 1; + } data.func = func; data.info = info; atomic_set(&data.started, 0); @@ -534,12 +541,27 @@ int smp_call_function (void (*func) (voi send_IPI_allbutself(CALL_FUNCTION_VECTOR); /* Wait for 
response */ - while (atomic_read(&data.started) != cpus) + while (atomic_read(&data.started) != cpus) { + if (gdb) { + if (count++ == 2000000) { + printk("%s: timeout\n", __FUNCTION__); + break; + } + if (count == 1000000) { + printk("looks bad\n"); + printk("cpus=%d, started=%d\n", cpus, + atomic_read(&data.started)); + } + if (count > 1000000) + udelay(1); + } barrier(); + } if (wait) while (atomic_read(&data.finished) != cpus) barrier(); + spin_unlock(&call_lock); return 0; @@ -581,9 +603,9 @@ asmlinkage void smp_reschedule_interrupt ack_APIC_irq(); } -asmlinkage void smp_call_function_interrupt(void) +asmlinkage void smp_call_function_interrupt(struct pt_regs regs) { - void (*func) (void *info) = call_data->func; + void (*func) (void *info, struct pt_regs *) = (void (*)(void *, struct pt_regs*))call_data->func; void *info = call_data->info; int wait = call_data->wait; @@ -598,7 +620,7 @@ asmlinkage void smp_call_function_interr * At this point the info structure may be out of scope unless wait==1 */ irq_enter(); - (*func)(info); + (*func)(info, ®s); irq_exit(); if (wait) { diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/smpboot.c 90-mjb/arch/i386/kernel/smpboot.c --- 00-virgin/arch/i386/kernel/smpboot.c Fri Jan 17 09:18:20 2003 +++ 90-mjb/arch/i386/kernel/smpboot.c Wed Feb 5 22:22:57 2003 @@ -62,7 +62,7 @@ int smp_num_siblings = 1; int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */ /* Bitmask of currently online CPUs */ -unsigned long cpu_online_map; +unsigned long cpu_online_map = 1; static volatile unsigned long cpu_callin_map; volatile unsigned long cpu_callout_map; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/srat.c 90-mjb/arch/i386/kernel/srat.c --- 00-virgin/arch/i386/kernel/srat.c Wed Dec 31 16:00:00 1969 +++ 90-mjb/arch/i386/kernel/srat.c Thu Feb 6 19:49:40 2003 @@ -0,0 +1,471 @@ + /* + * This code is taken from 64bit discontig mem support. + * + * Copyright (C) 2002, IBM Corp. 
+ * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to + */ + +#include +#include +#include +#include +#include +#include +#include + + +#define SRAT_DEBUG + +#define NUM_KLUDGE_PAGES 4 /* Size of page descriptor kludge */ +#define PAGE_KLUDGE_START ((u32 *)empty_zero_page - NUM_KLUDGE_PAGES) + + +/* + * proximity macros and definitions + */ +#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ +#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ +#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) +#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) +#define MAX_PXM_DOMAINS 256 /* 1 byte and no promises about values */ +/* bitmap length; _PXM is at most 255 */ +#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) +static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ + +struct node_memory_chunk_s node_memory_chunk[MAXCLUMPS]; +struct node_cpuid_s node_cpuid[NR_CPUS]; + +static int srat_num_cpus; /* number of cpus */ +static int num_memory_chunks; /* total number of memory chunks */ +static unsigned long zholes_size[MAX_NUMNODES]; + +unsigned long node_start_pfn[MAX_NUMNODES]; +unsigned long node_end_pfn[MAX_NUMNODES]; + +/* extern 
unsigned char acpi_checksum(void *buffer, int length); */ + +/* Identify which cnode a physical address resides on */ +int pa_to_nid(u64 paddr) +{ + int i; + struct node_memory_chunk_s *nmcp; + + /* We've got a sorted list. Binary search here? Do we care?? */ + nmcp = node_memory_chunk; + for (i = num_memory_chunks; --i >= 0; nmcp++) + if (paddr >= nmcp->start_paddr && paddr <= nmcp->end_paddr) + return (int)nmcp->nid; + + return -1; +} + +int pfn_to_nid(unsigned long pfn) +{ + return pa_to_nid(((unsigned long long)pfn) << PAGE_SHIFT); +} + +/* Identify CPU proximity domains */ + +static void __init parse_cpu_affinity_structure(char *p) +{ + struct acpi_table_processor_affinity *cpu_affinity = + (struct acpi_table_processor_affinity *) p; + + if (!cpu_affinity->flags.enabled) + return; /* empty entry */ + + /* mark this node as "seen" in node bitmap */ + BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain); + + node_cpuid[srat_num_cpus].phys_id = cpu_affinity->apic_id; + /* nid should be overridden as logical node id later */ + node_cpuid[srat_num_cpus].pxm = cpu_affinity->proximity_domain; + srat_num_cpus++; + +#ifdef SRAT_DEBUG + printk("CPU 0x%02X in proximity domain 0x%02X\n", + cpu_affinity->apic_id, cpu_affinity->proximity_domain); +#endif +} + +/* + * Identify memory proximity domains and hot-remove capabilities. + * Fill node memory chunk list structure. 
+ */ + +static void __init parse_memory_affinity_structure (char *sratp) +{ + struct acpi_table_memory_affinity *memory_affinity = + (struct acpi_table_memory_affinity *) sratp; + u64 paddr, size; + u8 pxm; + struct node_memory_chunk_s *p, *q, *pend; + + if (!memory_affinity->flags.enabled) + return; /* empty entry */ + + /* mark this node as "seen" in node bitmap */ + BMAP_SET(pxm_bitmap, memory_affinity->proximity_domain); + + /* calculate info for memory chunk structure */ + paddr = memory_affinity->base_addr_hi; + paddr = (paddr << 32) | memory_affinity->base_addr_lo; + size = memory_affinity->length_hi; + size = (size << 32) | memory_affinity->length_lo; + pxm = memory_affinity->proximity_domain; + + if (num_memory_chunks >= MAXCLUMPS) { + printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n", + size/(1024*1024), paddr); + return; + } + + /* Insertion sort based on base address */ + pend = &node_memory_chunk[num_memory_chunks]; + for (p = &node_memory_chunk[0]; p < pend; p++) { + if (paddr < p->start_paddr) + break; + } + if (p < pend) { + for (q = pend; q >= p; q--) + *(q + 1) = *q; + } + p->start_paddr = paddr; + p->size = size; + p->end_paddr = paddr + size - 1; + p->pxm = pxm; + + num_memory_chunks++; + + +#ifdef SRAT_DEBUG + printk("Memory range 0x%llX to 0x%llX (type 0x%X) in proximity domain 0x%02X %s\n", + paddr, paddr + size - 1, + memory_affinity->memory_type, + memory_affinity->proximity_domain, + (memory_affinity->flags.hot_pluggable ? 
+ "enabled and removable" : "enabled" ) ); +#endif +} + + +/* Parse the ACPI Static Resource Affinity Table */ +static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) +{ + u8 *start, *end, *p; + int i, j, nid; + u8 pxm_to_nid_map[MAX_PXM_DOMAINS];/* _PXM to logical node ID map */ + u8 nid_to_pxm_map[MAX_NUMNODES];/* logical node ID to _PXM map */ + + start = (u8 *)(&(sratp->reserved) + 1); /* skip header */ + p = start; + end = (u8 *)sratp + sratp->header.length; +printk("In acpi20_parse_srat: sratp=0x%p, start=0x%p, end=0x%p\n", sratp, start, end); + + memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */ + memset(node_memory_chunk, 0, sizeof(node_memory_chunk)); + memset(node_cpuid, 0, sizeof(node_cpuid)); + memset(zholes_size, 0, sizeof(zholes_size)); + + /* -1 in these maps means not available */ + memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map)); + memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map)); + + num_memory_chunks = 0; + while (p < end) { + switch (*p) { + case ACPI_SRAT_PROCESSOR_AFFINITY: + parse_cpu_affinity_structure(p); + break; + case ACPI_SRAT_MEMORY_AFFINITY: + parse_memory_affinity_structure(p); + break; + default: + printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]); + break; + } + p += p[1]; + if (p[1] == 0) { + printk("acpi20_parse_srat: Entry length value is zero;" + " can't parse any further!\n"); + break; + } + } +printk("SRAT scan complete\n"); + + /* Calculate total number of nodes in system from PXM bitmap and create + * a set of sequential node IDs starting at zero. (ACPI doesn't seem + * to specify the range of _PXM values.) 
+ */ + numnodes = 0; /* init total nodes in system */ + for (i = 0; i < MAX_PXM_DOMAINS; i++) { + if (BMAP_TEST(pxm_bitmap, i)) { + pxm_to_nid_map[i] = numnodes; + nid_to_pxm_map[numnodes] = i; + node_set_online(numnodes); + ++numnodes; + } + } +printk("numnodes=%d\n", numnodes); + if (numnodes == 0) + BUG(); + + /* set cnode id in memory chunk structure */ + for (i = 0; i < num_memory_chunks; i++) + node_memory_chunk[i].nid = pxm_to_nid_map[node_memory_chunk[i].pxm]; + + /* set cnode id in cpu structure */ + for (i = 0; i < srat_num_cpus; i++) + node_cpuid[i].nid = pxm_to_nid_map[node_cpuid[i].pxm]; + + printk("pxm bitmap: "); + for (i = 0; i < sizeof(pxm_bitmap); i++) { + printk("%02X ", pxm_bitmap[i]); + } + printk("\n"); + printk("Number of logical nodes in system = %d\n", numnodes); + printk("Number of memory chunks in system = %d\n", num_memory_chunks); + + /* PAT NOTE: debug only */ + for (j = 0; j < num_memory_chunks; j++){ + printk("chunk %d nid %d start_paddr %16llx end_paddr %16llx size %16llx\n", + j, node_memory_chunk[j].nid, + node_memory_chunk[j].start_paddr, + node_memory_chunk[j].end_paddr, + node_memory_chunk[j].size); + } + printk("done with printing out the chunks\n"); + /* PAT NOTE: end debug only */ + +printk("Memory table:\n"); + + /*calculate start/size arrays*/ + for (nid = 0; nid < numnodes; nid++) { + u64 start_addr, size; + int been_here_before; + + start_addr = size = 0; + been_here_before = 0; + for (j = 0; j < num_memory_chunks; j++){ + if (node_memory_chunk[j].nid == nid) { + /* + * This should all be in pfns!!!! + * + * (1) move assignment into node_start_pfn and node_end_pfn into this function + * if node_start_pfn[nid] < (node_memory_chunk[j].start_addr >> PAGE_SHIFT) + * we've identified a hole... + * (do we need to validate that it's a hole?) + * make sure it handles multiple holes... 
so add zholes_size to zholes_size + * zholes_size[nid] = zholes_size[nid] + (node_memory_chunk[j].start_addr - node_end_pfn[nid] + * node_end_pfn[nid] gets updated to start_addr + size + * need to make sure to fill in if it's the first time through this code. + */ + if (been_here_before == 0) { + printk("found chunk for nid %d\n", nid); + + start_addr = node_memory_chunk[j].start_paddr; + size = node_memory_chunk[j].size; + + node_start_pfn[nid] = (start_addr >> PAGE_SHIFT); + node_end_pfn[nid] = ((start_addr + size) >> PAGE_SHIFT); + + been_here_before = 1; + } else { + start_addr = node_memory_chunk[j].start_paddr; + size = node_memory_chunk[j].size; + + printk("HOLE: chunk %d nid %d start_paddr %16llx end_paddr %16llx size %16llx\n", + j, node_memory_chunk[j].nid, + node_memory_chunk[j].start_paddr, + node_memory_chunk[j].end_paddr, + node_memory_chunk[j].size); + + if (node_start_pfn[nid] < (start_addr >> PAGE_SHIFT)) { + printk("found a whole on nid %d, chunk %d\n", nid, j); + zholes_size[nid] = zholes_size[nid] + + ((start_addr >> PAGE_SHIFT) - node_end_pfn[nid]); + node_end_pfn[nid] = ((start_addr + size) >> PAGE_SHIFT); + } + } + printk("%s (%d): start_pfn = 0x%08lx end_pfn = %08lx\n", + __FUNCTION__, nid, node_start_pfn[nid], node_end_pfn[nid]); + printk("%s (%d): start=0x%llX size=0x%llX\n", + __FUNCTION__, nid, start_addr, size); + } + } + printk("%s (%d): start_pfn = 0x%08lx end_pfn = %08lx\n", + __FUNCTION__, nid, node_start_pfn[nid], node_end_pfn[nid]); + } + return 0; +} + + +#define kludge_to_virt(idx) (PAGE_SIZE * ((unsigned long)((u32 *)empty_zero_page - (u32 *)pg0) - NUM_KLUDGE_PAGES + (unsigned long)(idx)) ) + +#define pde_kludge(idx, phys) (PAGE_KLUDGE_START[idx] = ((phys) & ~(PAGE_SIZE - 1)) | (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED)) + +/* + * Temporarily use the virtual area starting from PAGE_KLUDGE_START, + * to map the target physical address. 
By using this area, we can + * map up to NUM_KLUDGE_PAGES pages temporarily, i.e. until the next + * page_kludge() call. + */ +static __init void * page_kludge(unsigned long phys, unsigned long size) +{ + unsigned long base, offset, mapped_size; + int idx; + + offset = phys & (PAGE_SIZE - 1); + mapped_size = PAGE_SIZE - offset; + pde_kludge(0, phys); + base = kludge_to_virt(0); + __flush_tlb_one(base); + wbinvd(); + + printk("page_kludge(0x%lx, 0x%lx): idx=%d mapped at %lx\n", phys, size, + FIX_IO_APIC_BASE_END, base); + + /* + * Most cases can be covered by the below. + */ + idx = 0; + while (mapped_size < size) { + if (idx >= NUM_KLUDGE_PAGES) + return NULL; /* cannot handle this */ + phys += PAGE_SIZE; + pde_kludge(idx, phys); + __flush_tlb_one(kludge_to_virt(idx)); + mapped_size += PAGE_SIZE; + ++idx; + } + + return((void *)(base + offset)); +} + + +void __init get_memcfg_from_srat(void) +{ + struct acpi_table_header *header = NULL; + struct acpi_table_rsdp *rsdp = NULL; + struct acpi_table_rsdt *rsdt = NULL; + struct acpi_pointer * rsdp_address; + struct acpi_table_rsdt saved_rsdt; + int tables = 0; + int i = 0; + u32 pde_save[NUM_KLUDGE_PAGES]; + + acpi_find_root_pointer(ACPI_PHYSICAL_ADDRESSING, rsdp_address); + + if (rsdp_address->pointer_type == ACPI_PHYSICAL_POINTER) { + printk("%s: assigning address to rsdp\n", __FUNCTION__); + rsdp = (struct acpi_table_rsdp *)rsdp_address->pointer.physical; + } else { + printk("%s: rsdp_address is not a physical pointer\n", __FUNCTION__); + return; + } + if (!rsdp) { + printk("%s: Didn't find ACPI root!\n", __FUNCTION__); + return; + } + + printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision, + rsdp->oem_id); + + if (strncmp(rsdp->signature, RSDP_SIG,strlen(RSDP_SIG))) { + printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __FUNCTION__); + return; + } + + printk("%s: calling page_kludge(0x%08X, %d)\n", __FUNCTION__, + rsdp->rsdt_address, sizeof(struct acpi_table_rsdt)); + rsdt = (struct 
acpi_table_rsdt *) + page_kludge(rsdp->rsdt_address, sizeof(struct acpi_table_rsdt)); + + if (!rsdt) { + printk(KERN_WARNING "%s: ACPI: Invalid root system description tables (RSDT)\n", __FUNCTION__); + return; + } + printk("%s: page_kludge returned 0x%08X\n", __FUNCTION__, (ulong)rsdt); + + header = & rsdt->header; + + if (strncmp(header->signature, RSDT_SIG, strlen(RSDT_SIG))) { + printk(KERN_WARNING "ACPI: RSDT signature incorrect\n"); + return; + } + + /* + * The number of tables is computed by taking the + * size of all entries (header size minus total + * size of RSDT) divided by the size of each entry + * (4-byte table pointers). + */ + tables = (header->length - sizeof(struct acpi_table_header)) / 4; +printk("tables = %d\n", tables); + + memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt)); + + if (saved_rsdt.header.length > sizeof(saved_rsdt)) { + printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n", saved_rsdt.header.length); + return; + } +printk("Begin table scan....\n"); + memcpy(pde_save, PAGE_KLUDGE_START, sizeof(pde_save)); + + for (i = 0; i < tables; i++) { + /* Map in header, then map in full table length. */ + header = (struct acpi_table_header *) + page_kludge(saved_rsdt.entry[i], sizeof(struct acpi_table_header)); + if (!header) + break; + header = (struct acpi_table_header *) + page_kludge(saved_rsdt.entry[i], header->length); + if (!header) + break; + + if (strncmp((char *) &header->signature, "SRAT", 4)) + continue; +/* PATNOTE TRY THIS: acpi_table_compute_checksum() */ + /* if (acpi_checksum(header, header->length)) { */ + /* printk(KERN_WARNING "ACPI %s has invalid checksum\n", */ + /*XXX -john acpi_table_signatures[i]*/ /* i); */ + /* continue; */ + /* } */ + + acpi20_parse_srat((struct acpi_table_srat *)header); + goto out; + } + + printk("get_memcfg_from_srat: no SRAT found!\n"); + out: + /* Undo page kludge. 
*/ + memcpy(PAGE_KLUDGE_START, pde_save, sizeof(pde_save)); + __flush_tlb(); + wbinvd(); +} + +unsigned long __init get_zholes_size(int nid) +{ + if((nid >= numnodes) | (nid >= MAX_NUMNODES)) + printk("%s: nid = %d is invalid. numnodes = %d", + __FUNCTION__, nid, numnodes); + return zholes_size[nid]; +} diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/time.c 90-mjb/arch/i386/kernel/time.c --- 00-virgin/arch/i386/kernel/time.c Tue Jan 14 10:06:13 2003 +++ 90-mjb/arch/i386/kernel/time.c Thu Feb 6 19:49:44 2003 @@ -70,7 +70,7 @@ u64 jiffies_64; unsigned long cpu_khz; /* Detected as we calibrate the TSC */ -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long wall_jiffies; spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; @@ -87,19 +87,21 @@ struct timer_opts* timer = &timer_none; */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec; - read_lock_irqsave(&xtime_lock, flags); - usec = timer->get_offset(); - { - unsigned long lost = jiffies - wall_jiffies; - if (lost) - usec += lost * (1000000 / HZ); - } - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + + usec = timer->get_offset(); + { + unsigned long lost = jiffies - wall_jiffies; + if (lost) + usec += lost * (1000000 / HZ); + } + sec = xtime.tv_sec; + usec += (xtime.tv_nsec / 1000); + } while (unlikely(seq != fr_read_end(&xtime_lock))); while (usec >= 1000000) { usec -= 1000000; @@ -112,7 +114,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* * This is revolting. We need to set "xtime" correctly. 
However, the * value in this location is the value at the most recent update of @@ -133,7 +135,7 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } /* @@ -265,6 +267,34 @@ static inline void do_timer_interrupt(in #endif } + +/* Lost tick detection and compensation */ +static inline void detect_lost_tick(void) +{ + /* read time since last interrupt */ + unsigned long delta = timer->get_offset(); + static unsigned long dbg_print; + + /* check if delta is greater then two ticks */ + if(delta >= 2*(1000000/HZ)){ + + /* only print debug info first 5 times */ + if(dbg_print < 5){ + printk(KERN_WARNING "\nWarning! Detected %lu micro-second" + " gap between interrupts.\n",delta); + printk(KERN_WARNING " Compensating for %lu lost ticks.\n", + delta/(1000000/HZ)-1); + /* dump trace info */ + show_trace(NULL); + dbg_print++; + } + /* calculate number of missed ticks */ + delta = delta/(1000000/HZ)-1; + jiffies += delta; + } + +} + /* * This is the same as the above, except we _also_ save the current * Time Stamp Counter value at the time of the timer interrupt, so that @@ -279,13 +309,14 @@ void timer_interrupt(int irq, void *dev_ * the irq version of write_lock because as just said we have irq * locally disabled. 
-arca */ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); + detect_lost_tick(); timer->mark_offset(); do_timer_interrupt(irq, NULL, regs); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/timers/Makefile 90-mjb/arch/i386/kernel/timers/Makefile --- 00-virgin/arch/i386/kernel/timers/Makefile Tue Jan 14 10:06:13 2003 +++ 90-mjb/arch/i386/kernel/timers/Makefile Thu Feb 6 19:49:42 2003 @@ -4,4 +4,4 @@ obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o -obj-$(CONFIG_X86_CYCLONE) += timer_cyclone.o +obj-$(CONFIG_X86_SUMMIT) += timer_cyclone.o diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/timers/timer.c 90-mjb/arch/i386/kernel/timers/timer.c --- 00-virgin/arch/i386/kernel/timers/timer.c Thu Jan 2 22:04:58 2003 +++ 90-mjb/arch/i386/kernel/timers/timer.c Thu Feb 6 19:49:42 2003 @@ -4,9 +4,14 @@ /* list of externed timers */ extern struct timer_opts timer_pit; extern struct timer_opts timer_tsc; - +#ifdef CONFIG_X86_SUMMIT +extern struct timer_opts timer_cyclone; +#endif /* list of timers, ordered by preference, NULL terminated */ static struct timer_opts* timers[] = { +#ifdef CONFIG_X86_SUMMIT + &timer_cyclone, +#endif &timer_tsc, &timer_pit, NULL, diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/timers/timer_cyclone.c 90-mjb/arch/i386/kernel/timers/timer_cyclone.c --- 00-virgin/arch/i386/kernel/timers/timer_cyclone.c Tue Jan 14 10:06:13 2003 +++ 90-mjb/arch/i386/kernel/timers/timer_cyclone.c Thu Feb 6 19:49:42 2003 @@ -17,7 +17,7 @@ #include extern spinlock_t i8253_lock; - +extern unsigned long fast_gettimeoffset_quotient; /* Number of usecs that the last interrupt was delayed */ static int delay_at_last_interrupt; @@ -142,6 +142,28 @@ static int init_cyclone(void) printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n"); cyclone_timer = 0; return -ENODEV; + } + } + + /* init fast_gettimeoffset_quotent and cpu_khz. 
+ * XXX - This should really be done elsewhere, + * and in a more generic fashion. -johnstul@us.ibm.com + */ + if (cpu_has_tsc) { + unsigned long tsc_quotient = calibrate_tsc(); + if (tsc_quotient) { + fast_gettimeoffset_quotient = tsc_quotient; + /* report CPU clock rate in Hz. + * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = + * clock/second. Our precision is about 100 ppm. + */ + { unsigned long eax=0, edx=1000; + __asm__("divl %2" + :"=a" (cpu_khz), "=d" (edx) + :"r" (tsc_quotient), + "0" (eax), "1" (edx)); + printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); + } } } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/timers/timer_tsc.c 90-mjb/arch/i386/kernel/timers/timer_tsc.c --- 00-virgin/arch/i386/kernel/timers/timer_tsc.c Tue Jan 14 10:06:13 2003 +++ 90-mjb/arch/i386/kernel/timers/timer_tsc.c Thu Feb 6 19:49:42 2003 @@ -130,7 +130,7 @@ static void delay_tsc(unsigned long loop #define CALIBRATE_LATCH (5 * LATCH) #define CALIBRATE_TIME (5 * 1000020/HZ) -static unsigned long __init calibrate_tsc(void) +unsigned long __init calibrate_tsc(void) { /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/kernel/traps.c 90-mjb/arch/i386/kernel/traps.c --- 00-virgin/arch/i386/kernel/traps.c Mon Jan 13 21:09:20 2003 +++ 90-mjb/arch/i386/kernel/traps.c Wed Feb 5 22:23:05 2003 @@ -50,6 +50,24 @@ #include #include +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + +#ifdef CONFIG_X86_REMOTE_DEBUG +gdb_debug_hook * linux_debug_hook; +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ + { \ + if (linux_debug_hook != (gdb_debug_hook *) NULL && !user_mode(regs)) \ + { \ + (*linux_debug_hook)(trapnr, signr, error_code, regs) ; \ + after; \ + } \ + } +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + asmlinkage int system_call(void); asmlinkage void lcall7(void); asmlinkage void lcall27(void); 
@@ -252,6 +270,7 @@ void die(const char * str, struct pt_reg bust_spinlocks(1); handle_BUG(regs); printk("%s: %04lx\n", str, err & 0xffff); + CHK_REMOTE_DEBUG(1,SIGTRAP,err,regs,) show_registers(regs); bust_spinlocks(0); spin_unlock_irq(&die_lock); @@ -312,6 +331,7 @@ static inline void do_trap(int trapnr, i #define DO_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,)\ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -329,7 +349,9 @@ asmlinkage void do_##name(struct pt_regs #define DO_VM86_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,return)\ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ + return; \ } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ @@ -374,8 +396,10 @@ gp_in_vm86: return; gp_in_kernel: - if (!fixup_exception(regs)) + if (!fixup_exception(regs)) { + CHK_REMOTE_DEBUG(13,SIGSEGV,error_code,regs,); die("general protection fault", regs, error_code); + } } static void mem_parity_error(unsigned char reason, struct pt_regs * regs) @@ -537,8 +561,10 @@ asmlinkage void do_debug(struct pt_regs * allowing programs to debug themselves without the ptrace() * interface. */ +#ifndef CONFIG_X86_REMOTE_DEBUG if ((regs->xcs & 3) == 0) goto clear_TF_reenable; +#endif if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) goto clear_TF; } @@ -550,11 +576,13 @@ asmlinkage void do_debug(struct pt_regs info.si_errno = 0; info.si_code = TRAP_BRKPT; - /* If this is a kernel mode trap, save the user PC on entry to - * the kernel, that's what the debugger can make sense of. - */ - info.si_addr = ((regs->xcs & 3) == 0) ? 
(void *)tsk->thread.eip : - (void *)regs->eip; + + /* If this is a kernel mode trap, we need to reset db7 to allow us + * to continue sanely */ + if ((regs->xcs & 3) == 0) + goto clear_dr7; + + info.si_addr = (void *)regs->eip; force_sig_info(SIGTRAP, &info, tsk); /* Disable additional traps. They'll be re-enabled when @@ -564,13 +592,16 @@ clear_dr7: __asm__("movl %0,%%db7" : /* no output */ : "r" (0)); + CHK_REMOTE_DEBUG(1,SIGTRAP,error_code,regs,) return; debug_vm86: handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); return; +#ifndef CONFIG_X86_REMOTE_DEBUG clear_TF_reenable: +#endif set_tsk_thread_flag(tsk, TIF_SINGLESTEP); clear_TF: regs->eflags &= ~TF_MASK; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/mm/discontig.c 90-mjb/arch/i386/mm/discontig.c --- 00-virgin/arch/i386/mm/discontig.c Sun Nov 17 20:29:47 2002 +++ 90-mjb/arch/i386/mm/discontig.c Thu Feb 6 19:49:39 2003 @@ -48,6 +48,14 @@ extern unsigned long max_low_pfn; extern unsigned long totalram_pages; extern unsigned long totalhigh_pages; +#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) + +unsigned long node_remap_start_pfn[MAX_NUMNODES]; +unsigned long node_remap_size[MAX_NUMNODES]; +unsigned long node_remap_offset[MAX_NUMNODES]; +void *node_remap_start_vaddr[MAX_NUMNODES]; +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); + /* * Find the highest page frame number we have available for the node */ @@ -65,12 +73,13 @@ static void __init find_max_pfn_node(int */ static void __init allocate_pgdat(int nid) { - unsigned long node_datasz; - - node_datasz = PFN_UP(sizeof(struct pglist_data)); - NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); - min_low_pfn += node_datasz; - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + if (nid) + NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; + else { + NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); + min_low_pfn += PFN_UP(sizeof(pg_data_t)); + 
memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + } } /* @@ -113,14 +122,6 @@ static void __init register_bootmem_low_ } } -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - -unsigned long node_remap_start_pfn[MAX_NUMNODES]; -unsigned long node_remap_size[MAX_NUMNODES]; -unsigned long node_remap_offset[MAX_NUMNODES]; -void *node_remap_start_vaddr[MAX_NUMNODES]; -extern void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - void __init remap_numa_kva(void) { void *vaddr; @@ -145,7 +146,7 @@ static unsigned long calculate_numa_rema for (nid = 1; nid < numnodes; nid++) { /* calculate the size of the mem_map needed in bytes */ size = (node_end_pfn[nid] - node_start_pfn[nid] + 1) - * sizeof(struct page); + * sizeof(struct page) + sizeof(pg_data_t); /* convert size to large (pmd size) pages, rounding up */ size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; /* now the roundup is correct, convert to PAGE_SIZE pages */ @@ -195,9 +196,9 @@ unsigned long __init setup_memory(void) printk("Low memory ends at vaddr %08lx\n", (ulong) pfn_to_kaddr(max_low_pfn)); for (nid = 0; nid < numnodes; nid++) { - allocate_pgdat(nid); node_remap_start_vaddr[nid] = pfn_to_kaddr( highstart_pfn - node_remap_offset[nid]); + allocate_pgdat(nid); printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, (ulong) node_remap_start_vaddr[nid], (ulong) pfn_to_kaddr(highstart_pfn @@ -251,13 +252,6 @@ unsigned long __init setup_memory(void) */ find_smp_config(); - /*insert other nodes into pgdat_list*/ - for (nid = 1; nid < numnodes; nid++){ - NODE_DATA(nid)->pgdat_next = pgdat_list; - pgdat_list = NODE_DATA(nid); - } - - #ifdef CONFIG_BLK_DEV_INITRD if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (system_max_low_pfn << PAGE_SHIFT)) { @@ -281,6 +275,19 @@ unsigned long __init setup_memory(void) void __init zone_sizes_init(void) { int nid; + unsigned long zholes_size; + + /* + * Insert nodes into pgdat_list backward so they appear in order. 
+ * Clobber node 0's links and NULL out pgdat_list before starting. + */ + pgdat_list = NULL; + for (nid = numnodes - 1; nid >= 0; nid--) { + if (nid) + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + NODE_DATA(nid)->pgdat_next = pgdat_list; + pgdat_list = NODE_DATA(nid); + } for (nid = 0; nid < numnodes; nid++) { unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; @@ -307,18 +314,24 @@ void __init zone_sizes_init(void) #endif } } + zholes_size = get_zholes_size(nid); /* * We let the lmem_map for node 0 be allocated from the * normal bootmem allocator, but other nodes come from the * remapped KVA area - mbligh */ - if (nid) + if (!nid) + free_area_init_node(nid, NODE_DATA(nid), 0, zones_size, + start, (unsigned long *) zholes_size); + else { + unsigned long lmem_map; + lmem_map = (unsigned long)node_remap_start_vaddr[nid]; + lmem_map += sizeof(pg_data_t) + PAGE_SIZE - 1; + lmem_map &= PAGE_MASK; free_area_init_node(nid, NODE_DATA(nid), - node_remap_start_vaddr[nid], zones_size, - start, 0); - else - free_area_init_node(nid, NODE_DATA(nid), 0, - zones_size, start, 0); + (struct page *)lmem_map, zones_size, + start, (unsigned long *) zholes_size); + } } return; } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/mm/fault.c 90-mjb/arch/i386/mm/fault.c --- 00-virgin/arch/i386/mm/fault.c Mon Jan 13 21:09:20 2003 +++ 90-mjb/arch/i386/mm/fault.c Wed Feb 5 22:23:05 2003 @@ -2,6 +2,11 @@ * linux/arch/i386/mm/fault.c * * Copyright (C) 1995 Linus Torvalds + * + * Change History + * + * Tigran Aivazian Remote debugging support. 
+ * */ #include @@ -20,6 +25,9 @@ #include #include /* For unblank_screen() */ #include +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif #include #include @@ -193,6 +201,15 @@ asmlinkage void do_page_fault(struct pt_ if (in_atomic() || !mm) goto no_context; +#ifdef CONFIG_X86_REMOTE_DEBUG + if (kgdb_memerr_expected) { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs) ; + return; /* return w/modified regs */ + } + } +#endif + down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -291,6 +308,19 @@ bad_area: force_sig_info(SIGSEGV, &info, tsk); return; } + +#ifdef CONFIG_X86_REMOTE_DEBUG + if (kgdb_memerr_expected) { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs); + return; /* Return with modified registers */ + } + } else { + if (linux_debug_hook != (gdb_debug_hook *) NULL) { + (*linux_debug_hook)(14, SIGSEGV, error_code, regs); + } + } +#endif #ifdef CONFIG_X86_F00F_BUG /* diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/mm/init.c 90-mjb/arch/i386/mm/init.c --- 00-virgin/arch/i386/mm/init.c Mon Jan 13 21:09:20 2003 +++ 90-mjb/arch/i386/mm/init.c Thu Feb 6 19:49:49 2003 @@ -508,20 +508,36 @@ void __init mem_init(void) #endif } -#if CONFIG_X86_PAE -struct kmem_cache_s *pae_pgd_cachep; +#include + +kmem_cache_t *pmd_cache; +kmem_cache_t *pgd_cache; + +void pmd_ctor(void *, kmem_cache_t *, unsigned long); +void pgd_ctor(void *, kmem_cache_t *, unsigned long); void __init pgtable_cache_init(void) { + if (PTRS_PER_PMD > 1) { + pmd_cache = kmem_cache_create("pae_pmd", + PTRS_PER_PMD*sizeof(pmd_t), + 0, + SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, + pmd_ctor, + NULL); + + if (!pmd_cache) + panic("pgtable_cache_init(): cannot create pmd cache"); + } + /* * PAE pgds must be 16-byte aligned: */ - pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); - if (!pae_pgd_cachep) - 
panic("init_pae(): Cannot alloc pae_pgd SLAB cache"); + pgd_cache = kmem_cache_create("pgd", PTRS_PER_PGD*sizeof(pgd_t), 0, + SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, pgd_ctor, NULL); + if (!pgd_cache) + panic("pgtable_cache_init(): Cannot create pgd cache"); } -#endif /* Put this after the callers, so that it cannot be inlined */ static int do_test_wp_bit(void) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/mm/pgtable.c 90-mjb/arch/i386/mm/pgtable.c --- 00-virgin/arch/i386/mm/pgtable.c Sun Nov 17 20:29:59 2002 +++ 90-mjb/arch/i386/mm/pgtable.c Thu Feb 6 19:49:49 2003 @@ -166,61 +166,60 @@ struct page *pte_alloc_one(struct mm_str return pte; } -#if CONFIG_X86_PAE +extern kmem_cache_t *pmd_cache; +extern kmem_cache_t *pgd_cache; -pgd_t *pgd_alloc(struct mm_struct *mm) +void pmd_ctor(void *__pmd, kmem_cache_t *pmd_cache, unsigned long flags) { - int i; - pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); - - if (pgd) { - for (i = 0; i < USER_PTRS_PER_PGD; i++) { - unsigned long pmd = __get_free_page(GFP_KERNEL); - if (!pmd) - goto out_oom; - clear_page(pmd); - set_pgd(pgd + i, __pgd(1 + __pa(pmd))); - } - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } - return pgd; -out_oom: - for (i--; i >= 0; i--) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); - return NULL; + clear_page(__pmd); } -void pgd_free(pgd_t *pgd) +void pgd_ctor(void *__pgd, kmem_cache_t *pgd_cache, unsigned long flags) { - int i; + pgd_t *pgd = __pgd; - for (i = 0; i < USER_PTRS_PER_PGD; i++) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); + if (PTRS_PER_PMD == 1) + memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + memcpy(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); } -#else - pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *pgd = 
(pgd_t *)__get_free_page(GFP_KERNEL); + int i; + pgd_t *pgd = kmem_cache_alloc(pgd_cache, SLAB_KERNEL); - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + if (PTRS_PER_PMD == 1) + return pgd; + else if (!pgd) + return NULL; + + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + pmd_t *pmd = kmem_cache_alloc(pmd_cache, SLAB_KERNEL); + if (!pmd) + goto out_oom; + set_pgd(pgd + i, __pgd(1 + __pa((unsigned long long)((unsigned long)pmd)))); } return pgd; + +out_oom: + for (i--; i >= 0; --i) + kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + kmem_cache_free(pgd_cache, (void *)pgd); + return NULL; } void pgd_free(pgd_t *pgd) { - free_page((unsigned long)pgd); -} + int i; -#endif /* CONFIG_X86_PAE */ + if (PTRS_PER_PMD > 1) { + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + set_pgd(pgd + i, __pgd(0)); + } + } + kmem_cache_free(pgd_cache, (void *)pgd); +} diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/oprofile/Makefile 90-mjb/arch/i386/oprofile/Makefile --- 00-virgin/arch/i386/oprofile/Makefile Sun Dec 1 09:59:46 2002 +++ 90-mjb/arch/i386/oprofile/Makefile Thu Feb 6 19:49:47 2003 @@ -7,4 +7,4 @@ DRIVER_OBJS = $(addprefix ../../../drive oprofile-y := $(DRIVER_OBJS) init.o timer_int.o oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \ - op_model_ppro.o + op_model_ppro.o op_model_p4.o diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/oprofile/nmi_int.c 90-mjb/arch/i386/oprofile/nmi_int.c --- 00-virgin/arch/i386/oprofile/nmi_int.c Thu Jan 2 22:04:59 2003 +++ 90-mjb/arch/i386/oprofile/nmi_int.c Thu Feb 6 19:49:47 2003 @@ -214,12 +214,61 @@ struct oprofile_operations nmi_ops = { .stop = nmi_stop }; + +#if !defined(CONFIG_X86_64) + +static int __init p4_init(enum oprofile_cpu * cpu) +{ + __u8 cpu_model = current_cpu_data.x86_model; 
+ + if (cpu_model > 3) + return 0; + +#ifndef CONFIG_SMP + *cpu = OPROFILE_CPU_P4; + model = &op_p4_spec; +#else + switch (smp_num_siblings) { + case 1: + *cpu = OPROFILE_CPU_P4; + model = &op_p4_spec; + return 1; + + case 2: + *cpu = OPROFILE_CPU_P4_HT2; + model = &op_p4_ht2_spec; + return 1; + } +#endif + + printk(KERN_INFO "oprofile: P4 HyperThreading detected with > 2 threads\n"); + printk(KERN_INFO "oprofile: Reverting to timer mode.\n"); + return 0; +} + + +static int __init ppro_init(enum oprofile_cpu * cpu) +{ + __u8 cpu_model = current_cpu_data.x86_model; + + if (cpu_model > 5) { + *cpu = OPROFILE_CPU_PIII; + } else if (cpu_model > 2) { + *cpu = OPROFILE_CPU_PII; + } else { + *cpu = OPROFILE_CPU_PPRO; + } + + model = &op_ppro_spec; + return 1; +} + +#endif /* !CONFIG_X86_64 */ int __init nmi_init(struct oprofile_operations ** ops, enum oprofile_cpu * cpu) { __u8 vendor = current_cpu_data.x86_vendor; __u8 family = current_cpu_data.x86; - __u8 cpu_model = current_cpu_data.x86_model; if (!cpu_has_apic) return 0; @@ -233,23 +282,26 @@ int __init nmi_init(struct oprofile_oper *cpu = OPROFILE_CPU_ATHLON; break; -#ifndef CONFIG_X86_64 +#if !defined(CONFIG_X86_64) case X86_VENDOR_INTEL: - /* Less than a P6-class processor */ - if (family != 6) - return 0; - - if (cpu_model > 5) { - *cpu = OPROFILE_CPU_PIII; - } else if (cpu_model > 2) { - *cpu = OPROFILE_CPU_PII; - } else { - *cpu = OPROFILE_CPU_PPRO; + switch (family) { + /* Pentium IV */ + case 0xf: + if (!p4_init(cpu)) + return 0; + break; + + /* A P6-class processor */ + case 6: + if (!ppro_init(cpu)) + return 0; + break; + + default: + return 0; } - - model = &op_ppro_spec; break; -#endif +#endif /* !CONFIG_X86_64 */ default: return 0; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/oprofile/op_counter.h 90-mjb/arch/i386/oprofile/op_counter.h --- 00-virgin/arch/i386/oprofile/op_counter.h Sun Nov 17 20:29:46 2002 +++ 90-mjb/arch/i386/oprofile/op_counter.h Thu Feb 6 19:49:47 2003 @@ -10,7 +10,7 @@ 
#ifndef OP_COUNTER_H #define OP_COUNTER_H -#define OP_MAX_COUNTER 4 +#define OP_MAX_COUNTER 8 /* Per-perfctr configuration as set via * oprofilefs. diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/oprofile/op_model_p4.c 90-mjb/arch/i386/oprofile/op_model_p4.c --- 00-virgin/arch/i386/oprofile/op_model_p4.c Wed Dec 31 16:00:00 1969 +++ 90-mjb/arch/i386/oprofile/op_model_p4.c Thu Feb 6 19:49:47 2003 @@ -0,0 +1,670 @@ +/** + * @file op_model_p4.c + * P4 model-specific MSR operations + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author Graydon Hoare + */ + +#include +#include +#include +#include +#include +#include + +#include "op_x86_model.h" +#include "op_counter.h" + +#define NUM_EVENTS 39 + +#define NUM_COUNTERS_NON_HT 8 +#define NUM_ESCRS_NON_HT 45 +#define NUM_CCCRS_NON_HT 18 +#define NUM_CONTROLS_NON_HT (NUM_ESCRS_NON_HT + NUM_CCCRS_NON_HT) + +#define NUM_COUNTERS_HT2 4 +#define NUM_ESCRS_HT2 23 +#define NUM_CCCRS_HT2 9 +#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) + +static unsigned int num_counters = NUM_COUNTERS_NON_HT; +static unsigned int num_cccrs = NUM_CCCRS_NON_HT; + + +/* this has to be checked dynamically since the + hyper-threadedness of a chip is discovered at + kernel boot-time. */ +static inline void setup_num_counters(void) +{ +#ifdef CONFIG_SMP + if (smp_num_siblings == 2) { + num_counters = NUM_COUNTERS_HT2; + num_cccrs = NUM_CCCRS_HT2; + } +#endif +} + + +/* tables to simulate simplified hardware view of p4 registers */ +struct p4_counter_binding { + int virt_counter; + int counter_address; + int cccr_address; +}; + +struct p4_event_binding { + int escr_select; /* value to put in CCCR */ + int event_select; /* value to put in ESCR */ + struct { + int virt_counter; /* for this counter... */ + int escr_address; /* use this ESCR */ + } bindings[2]; +}; + +/* nb: these CTR_* defines are a duplicate of defines in + libop/op_events.c. 
*/ + + +#define CTR_BPU_0 (1 << 0) +#define CTR_MS_0 (1 << 1) +#define CTR_FLAME_0 (1 << 2) +#define CTR_IQ_4 (1 << 3) +#define CTR_BPU_2 (1 << 4) +#define CTR_MS_2 (1 << 5) +#define CTR_FLAME_2 (1 << 6) +#define CTR_IQ_5 (1 << 7) + +static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { + { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, + { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, + { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, + { CTR_IQ_4, MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_CCCR4 }, + { CTR_BPU_2, MSR_P4_BPU_PERFCTR2, MSR_P4_BPU_CCCR2 }, + { CTR_MS_2, MSR_P4_MS_PERFCTR2, MSR_P4_MS_CCCR2 }, + { CTR_FLAME_2, MSR_P4_FLAME_PERFCTR2, MSR_P4_FLAME_CCCR2 }, + { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } +}; + +/* p4 event codes in libop/op_event.h are indices into this table. */ + +static struct p4_event_binding p4_events[NUM_EVENTS] = { + + { /* BRANCH_RETIRED */ + 0x05, 0x06, + { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, + {CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* MISPRED_BRANCH_RETIRED */ + 0x04, 0x03, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* TC_DELIVER_MODE */ + 0x01, 0x01, + { { CTR_MS_0, MSR_P4_TC_ESCR0}, + { CTR_MS_2, MSR_P4_TC_ESCR1} } + }, + + { /* BPU_FETCH_REQUEST */ + 0x00, 0x03, + { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, + { CTR_BPU_2, MSR_P4_BPU_ESCR1} } + }, + + { /* ITLB_REFERENCE */ + 0x03, 0x18, + { { CTR_BPU_0, MSR_P4_ITLB_ESCR0}, + { CTR_BPU_2, MSR_P4_ITLB_ESCR1} } + }, + + { /* MEMORY_CANCEL */ + 0x05, 0x02, + { { CTR_FLAME_0, MSR_P4_DAC_ESCR0}, + { CTR_FLAME_2, MSR_P4_DAC_ESCR1} } + }, + + { /* MEMORY_COMPLETE */ + 0x02, 0x08, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* LOAD_PORT_REPLAY */ + 0x02, 0x04, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* STORE_PORT_REPLAY */ + 0x02, 0x05, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* MOB_LOAD_REPLAY 
*/ + 0x02, 0x03, + { { CTR_BPU_0, MSR_P4_MOB_ESCR0}, + { CTR_BPU_2, MSR_P4_MOB_ESCR1} } + }, + + { /* PAGE_WALK_TYPE */ + 0x04, 0x01, + { { CTR_BPU_0, MSR_P4_PMH_ESCR0}, + { CTR_BPU_2, MSR_P4_PMH_ESCR1} } + }, + + { /* BSQ_CACHE_REFERENCE */ + 0x07, 0x0c, + { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, + { CTR_BPU_2, MSR_P4_BSU_ESCR1} } + }, + + { /* IOQ_ALLOCATION */ + 0x06, 0x03, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + {-1,-1} } + }, + + { /* IOQ_ACTIVE_ENTRIES */ + 0x06, 0x1a, + { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, + {-1,-1} } + }, + + { /* FSB_DATA_ACTIVITY */ + 0x06, 0x17, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + { CTR_BPU_2, MSR_P4_FSB_ESCR1} } + }, + + { /* BSQ_ALLOCATION */ + 0x07, 0x05, + { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, + {-1,-1} } + }, + + { /* BSQ_ACTIVE_ENTRIES */ + 0x07, 0x06, + { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, + {-1,-1} } + }, + + { /* X87_ASSIST */ + 0x05, 0x03, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* SSE_INPUT_ASSIST */ + 0x01, 0x34, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* PACKED_SP_UOP */ + 0x01, 0x08, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* PACKED_DP_UOP */ + 0x01, 0x0c, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* SCALAR_SP_UOP */ + 0x01, 0x0a, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* SCALAR_DP_UOP */ + 0x01, 0x0e, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* 64BIT_MMX_UOP */ + 0x01, 0x02, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* 128BIT_MMX_UOP */ + 0x01, 0x1a, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* X87_FP_UOP */ + 0x01, 0x04, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* X87_SIMD_MOVES_UOP */ + 0x01, 0x2e, + { { CTR_FLAME_0, 
MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* MACHINE_CLEAR */ + 0x05, 0x02, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* GLOBAL_POWER_EVENTS */ + 0x06, 0x13 /* manual says 0x05 */, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + { CTR_BPU_2, MSR_P4_FSB_ESCR1} } + }, + + { /* TC_MS_XFER */ + 0x00, 0x05, + { { CTR_MS_0, MSR_P4_MS_ESCR0}, + { CTR_MS_2, MSR_P4_MS_ESCR1} } + }, + + { /* UOP_QUEUE_WRITES */ + 0x00, 0x09, + { { CTR_MS_0, MSR_P4_MS_ESCR0}, + { CTR_MS_2, MSR_P4_MS_ESCR1} } + }, + + { /* FRONT_END_EVENT */ + 0x05, 0x08, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* EXECUTION_EVENT */ + 0x05, 0x0c, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* REPLAY_EVENT */ + 0x05, 0x09, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* INSTR_RETIRED */ + 0x04, 0x02, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* UOPS_RETIRED */ + 0x04, 0x01, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* UOP_TYPE */ + 0x02, 0x02, + { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, + { CTR_IQ_5, MSR_P4_RAT_ESCR1} } + }, + + { /* RETIRED_MISPRED_BRANCH_TYPE */ + 0x02, 0x05, + { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, + { CTR_MS_2, MSR_P4_TBPU_ESCR1} } + }, + + { /* RETIRED_BRANCH_TYPE */ + 0x02, 0x04, + { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, + { CTR_MS_2, MSR_P4_TBPU_ESCR1} } + } +}; + + +#define MISC_PMC_ENABLED_P(x) ((x) & 1 << 7) + +#define ESCR_RESERVED_BITS 0x80000003 +#define ESCR_CLEAR(escr) ((escr) &= ESCR_RESERVED_BITS) +#define ESCR_SET_USR_0(escr, usr) ((escr) |= (((usr) & 1) << 2)) +#define ESCR_SET_OS_0(escr, os) ((escr) |= (((os) & 1) << 3)) +#define ESCR_SET_USR_1(escr, usr) ((escr) |= (((usr) & 1))) +#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) +#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x1f) << 25)) +#define ESCR_SET_EVENT_MASK(escr, 
mask) ((escr) |= (((mask) & 0xffff) << 9)) +#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0); +#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0); + +#define CCCR_RESERVED_BITS 0x38030FFF +#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) +#define CCCR_SET_REQUIRED_BITS(cccr) ((cccr) |= 0x00030000) +#define CCCR_SET_ESCR_SELECT(cccr, sel) ((cccr) |= (((sel) & 0x07) << 13)) +#define CCCR_SET_PMI_OVF_0(cccr) ((cccr) |= (1<<26)) +#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) +#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) +#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) +#define CCCR_READ(low, high, i) do {rdmsr (p4_counters[(i)].cccr_address, (low), (high));} while (0); +#define CCCR_WRITE(low, high, i) do {wrmsr (p4_counters[(i)].cccr_address, (low), (high));} while (0); +#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) +#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) + +#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0); +#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0); +#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) + +/* these access the underlying cccrs 1-18, not the subset of 8 bound to "virtual counters" */ +#define RAW_CCCR_READ(low, high, i) do {rdmsr (MSR_P4_BPU_CCCR0 + (i), (low), (high));} while (0); +#define RAW_CCCR_WRITE(low, high, i) do {wrmsr (MSR_P4_BPU_CCCR0 + (i), (low), (high));} while (0); + + +/* this assigns a "stagger" to the current CPU, which is used throughout + the code in this module as an extra array offset, to select the "even" + or "odd" part of all the divided resources. */ +static inline unsigned int get_stagger(void) +{ +#ifdef CONFIG_SMP + int cpu; + if (smp_num_siblings > 1) { + cpu = smp_processor_id(); + return (cpu_sibling_map[cpu] > cpu) ? 
0 : 1; + } +#endif + return 0; +} + + +/* finally, mediate access to a real hardware counter + by passing a "virtual" counter numer to this macro, + along with your stagger setting. */ +#define VIRT_CTR(stagger, i) ((i) + ((num_counters) * (stagger))) + +static unsigned long reset_value[NUM_COUNTERS_NON_HT]; + + +static void p4_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; + unsigned int addr, stag; + + setup_num_counters(); + stag = get_stagger(); + + /* the 8 counter registers we pay attention to */ + for (i = 0; i < num_counters; ++i) + msrs->counters.addrs[i] = + p4_counters[VIRT_CTR(stag, i)].counter_address; + + /* 18 CCCR registers */ + for (i=stag, addr = MSR_P4_BPU_CCCR0; + addr <= MSR_P4_IQ_CCCR5; ++i, addr += (1 + stag)) + msrs->controls.addrs[i] = addr; + + /* 43 ESCR registers */ + for (addr = MSR_P4_BSU_ESCR0; + addr <= MSR_P4_SSU_ESCR0; ++i, addr += (1 + stag)){ + msrs->controls.addrs[i] = addr; + } + + for (addr = MSR_P4_MS_ESCR0; + addr <= MSR_P4_TC_ESCR1; ++i, addr += (1 + stag)){ + msrs->controls.addrs[i] = addr; + } + + for (addr = MSR_P4_IX_ESCR0; + addr <= MSR_P4_CRU_ESCR3; ++i, addr += (1 + stag)){ + msrs->controls.addrs[i] = addr; + } + + /* there are 2 remaining non-contiguously located ESCRs */ + + if (num_counters == NUM_COUNTERS_NON_HT) { + /* standard non-HT CPUs handle both remaining ESCRs*/ + msrs->controls.addrs[i++] = MSR_P4_CRU_ESCR5; + msrs->controls.addrs[i++] = MSR_P4_CRU_ESCR4; + + } else if (stag == 0) { + /* HT CPUs give the first remainder to the even thread, as + the 32nd control register */ + msrs->controls.addrs[i++] = MSR_P4_CRU_ESCR4; + + } else { + /* and two copies of the second to the odd thread, + for the 31st and 32nd control registers */ + msrs->controls.addrs[i++] = MSR_P4_CRU_ESCR5; + msrs->controls.addrs[i++] = MSR_P4_CRU_ESCR5; + } +} + + +static void pmc_setup_one_p4_counter(unsigned int ctr) +{ + int i; + int const maxbind = 2; + unsigned int cccr = 0; + unsigned int escr = 0; + unsigned int 
high = 0; + unsigned int counter_bit; + struct p4_event_binding * ev = 0; + unsigned int stag; + + stag = get_stagger(); + + /* convert from counter *number* to counter *bit* */ + counter_bit = 1 << ctr; + + /* find our event binding structure. */ + if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { + printk(KERN_ERR + "oprofile: P4 event code 0x%lx out of range\n", + counter_config[ctr].event); + return; + } + + ev = &(p4_events[counter_config[ctr].event - 1]); + + for (i = 0; i < maxbind; i++) { + if (ev->bindings[i].virt_counter & counter_bit) { + + /* modify ESCR */ + ESCR_READ(escr, high, ev, i); + ESCR_CLEAR(escr); + if (stag == 0) { + ESCR_SET_USR_0(escr, counter_config[ctr].user); + ESCR_SET_OS_0(escr, counter_config[ctr].kernel); + } else { + ESCR_SET_USR_1(escr, counter_config[ctr].user); + ESCR_SET_OS_1(escr, counter_config[ctr].kernel); + } + ESCR_SET_EVENT_SELECT(escr, ev->event_select); + ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); + ESCR_WRITE(escr, high, ev, i); + + /* modify CCCR */ + CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); + CCCR_CLEAR(cccr); + CCCR_SET_REQUIRED_BITS(cccr); + CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); + if (stag == 0) { + CCCR_SET_PMI_OVF_0(cccr); + } else { + CCCR_SET_PMI_OVF_1(cccr); + } + CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); + return; + } + } +} + + +static void p4_setup_ctrs(struct op_msrs const * const msrs) +{ + unsigned int i; + unsigned int low, high; + unsigned int addr; + unsigned int stag; + + stag = get_stagger(); + + rdmsr(MSR_IA32_MISC_ENABLE, low, high); + if (! 
MISC_PMC_ENABLED_P(low)) { + printk(KERN_ERR "oprofile: P4 PMC not available\n"); + return; + } + + /* clear all cccrs (including those outside our concern) */ + for (i = stag ; i < num_cccrs ; i += (1 + stag)) { + RAW_CCCR_READ(low, high, i); + CCCR_CLEAR(low); + CCCR_SET_REQUIRED_BITS(low); + RAW_CCCR_WRITE(low, high, i); + } + + /* clear all escrs (including those outside out concern) */ + for (addr = MSR_P4_BSU_ESCR0 + stag; + addr <= MSR_P4_SSU_ESCR0; addr += (1 + stag)){ + wrmsr(addr, 0, 0); + } + + for (addr = MSR_P4_MS_ESCR0 + stag; + addr <= MSR_P4_TC_ESCR1; addr += (1 + stag)){ + wrmsr(addr, 0, 0); + } + + for (addr = MSR_P4_IX_ESCR0 + stag; + addr <= MSR_P4_CRU_ESCR3; addr += (1 + stag)){ + wrmsr(addr, 0, 0); + } + + if (num_counters == NUM_COUNTERS_NON_HT) { + wrmsr(MSR_P4_CRU_ESCR4, 0, 0); + wrmsr(MSR_P4_CRU_ESCR5, 0, 0); + } else if (stag == 0) { + wrmsr(MSR_P4_CRU_ESCR4, 0, 0); + } else { + wrmsr(MSR_P4_CRU_ESCR5, 0, 0); + } + + /* setup all counters */ + for (i = 0 ; i < num_counters ; ++i) { + if (counter_config[i].event) { + reset_value[i] = counter_config[i].count; + pmc_setup_one_p4_counter(i); + CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); + } else { + reset_value[i] = 0; + } + } +} + + +static int p4_check_ctrs(unsigned int const cpu, + struct op_msrs const * const msrs, + struct pt_regs * const regs) +{ + unsigned long ctr, low, high, stag, real; + int i; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + + if (!counter_config[i].event) + continue; + + /* + * there is some eccentricity in the hardware which + * requires that we perform 2 extra corrections: + * + * - check both the CCCR:OVF flag for overflow and the + * counter high bit for un-flagged overflows. + * + * - write the counter back twice to ensure it gets + * updated properly. 
+ * + * the former seems to be related to extra NMIs happening + * during the current NMI; the latter is reported as errata + * N15 in intel doc 249199-029, pentium 4 specification + * update, though their suggested work-around does not + * appear to solve the problem. + */ + + real = VIRT_CTR(stag, i); + + CCCR_READ(low, high, real); + CTR_READ(ctr, high, real); + if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { + oprofile_add_sample(regs->eip, i, cpu); + CTR_WRITE(reset_value[i], real); + CCCR_CLEAR_OVF(low); + CCCR_WRITE(low, high, real); + CTR_WRITE(reset_value[i], real); + /* P4 quirk: you have to re-unmask the apic vector */ + apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + return 1; + } + } + + /* P4 quirk: you have to re-unmask the apic vector */ + apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + return 0; +} + + +static void p4_start(struct op_msrs const * const msrs) +{ + unsigned int low, high, stag; + int i; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + if (!reset_value[i]) continue; + CCCR_READ(low, high, VIRT_CTR(stag, i)); + CCCR_SET_ENABLE(low); + CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + } +} + + +static void p4_stop(struct op_msrs const * const msrs) +{ + unsigned int low, high, stag; + int i; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + CCCR_READ(low, high, VIRT_CTR(stag, i)); + CCCR_SET_DISABLE(low); + CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + } +} + + +#ifdef CONFIG_SMP +struct op_x86_model_spec const op_p4_ht2_spec = { + .num_counters = NUM_COUNTERS_HT2, + .num_controls = NUM_CONTROLS_HT2, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + .check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop +}; +#endif + +struct op_x86_model_spec const op_p4_spec = { + .num_counters = NUM_COUNTERS_NON_HT, + .num_controls = NUM_CONTROLS_NON_HT, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + 
.check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop +}; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/oprofile/op_x86_model.h 90-mjb/arch/i386/oprofile/op_x86_model.h --- 00-virgin/arch/i386/oprofile/op_x86_model.h Sun Nov 17 20:29:28 2002 +++ 90-mjb/arch/i386/oprofile/op_x86_model.h Thu Feb 6 19:49:47 2003 @@ -11,8 +11,8 @@ #ifndef OP_X86_MODEL_H #define OP_X86_MODEL_H -/* will need re-working for Pentium IV */ -#define MAX_MSR 4 +/* Pentium IV needs all these */ +#define MAX_MSR 63 struct op_saved_msr { unsigned int high; @@ -47,6 +47,8 @@ struct op_x86_model_spec { }; extern struct op_x86_model_spec const op_ppro_spec; +extern struct op_x86_model_spec const op_p4_spec; +extern struct op_x86_model_spec const op_p4_ht2_spec; extern struct op_x86_model_spec const op_athlon_spec; #endif /* OP_X86_MODEL_H */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/pci/numa.c 90-mjb/arch/i386/pci/numa.c --- 00-virgin/arch/i386/pci/numa.c Thu Jan 9 19:15:56 2003 +++ 90-mjb/arch/i386/pci/numa.c Thu Feb 6 19:49:49 2003 @@ -17,7 +17,7 @@ static int __pci_conf1_mq_read (int seg, { unsigned long flags; - if (!value || (bus > 255) || (dev > 31) || (fn > 7) || (reg > 255)) + if (!value || (bus > MAX_MP_BUSSES) || (dev > 31) || (fn > 7) || (reg > 255)) return -EINVAL; spin_lock_irqsave(&pci_config_lock, flags); @@ -45,7 +45,7 @@ static int __pci_conf1_mq_write (int seg { unsigned long flags; - if ((bus > 255) || (dev > 31) || (fn > 7) || (reg > 255)) + if ((bus > MAX_MP_BUSSES) || (dev > 31) || (fn > 7) || (reg > 255)) return -EINVAL; spin_lock_irqsave(&pci_config_lock, flags); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/i386/vmlinux.lds.S 90-mjb/arch/i386/vmlinux.lds.S --- 00-virgin/arch/i386/vmlinux.lds.S Fri Jan 17 09:18:20 2003 +++ 90-mjb/arch/i386/vmlinux.lds.S Wed Feb 5 22:23:00 2003 @@ -10,7 +10,7 @@ ENTRY(_start) jiffies = jiffies_64; SECTIONS { - . = 0xC0000000 + 0x100000; + . 
= __PAGE_OFFSET + 0x100000; /* read-only */ _text = .; /* Text and read-only data */ .text : { diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/ia64/kernel/time.c 90-mjb/arch/ia64/kernel/time.c --- 00-virgin/arch/ia64/kernel/time.c Sun Nov 17 20:29:28 2002 +++ 90-mjb/arch/ia64/kernel/time.c Thu Feb 6 19:49:45 2003 @@ -24,7 +24,7 @@ #include #include -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long wall_jiffies; extern unsigned long last_time_offset; @@ -89,7 +89,7 @@ gettimeoffset (void) void do_settimeofday (struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); { /* * This is revolting. We need to set "xtime" correctly. However, the value @@ -112,21 +112,21 @@ do_settimeofday (struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; } - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } void do_gettimeofday (struct timeval *tv) { - unsigned long flags, usec, sec, old; + unsigned long seq, usec, sec, old; - read_lock_irqsave(&xtime_lock, flags); - { + do { + seq = fr_read_begin(&xtime_lock); usec = gettimeoffset(); /* - * Ensure time never goes backwards, even when ITC on different CPUs are - * not perfectly synchronized. + * Ensure time never goes backwards, even when ITC on + * different CPUs are not perfectly synchronized. */ do { old = last_time_offset; @@ -138,8 +138,8 @@ do_gettimeofday (struct timeval *tv) sec = xtime.tv_sec; usec += xtime.tv_nsec / 1000; - } - read_unlock_irqrestore(&xtime_lock, flags); + } while (seq != fr_read_end(&xtime_lock)); + while (usec >= 1000000) { usec -= 1000000; @@ -182,10 +182,10 @@ timer_interrupt(int irq, void *dev_id, s * another CPU. We need to avoid to SMP race by acquiring the * xtime_lock. 
*/ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); do_timer(regs); local_cpu_data->itm_next = new_itm; - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } else local_cpu_data->itm_next = new_itm; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/m68k/kernel/time.c 90-mjb/arch/m68k/kernel/time.c --- 00-virgin/arch/m68k/kernel/time.c Sun Nov 17 20:29:29 2002 +++ 90-mjb/arch/m68k/kernel/time.c Thu Feb 6 19:49:45 2003 @@ -129,7 +129,7 @@ void time_init(void) mach_sched_init(timer_interrupt); } -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; /* * This version of gettimeofday has near microsecond resolution. @@ -137,17 +137,20 @@ extern rwlock_t xtime_lock; void do_gettimeofday(struct timeval *tv) { extern unsigned long wall_jiffies; - unsigned long flags; + unsigned long seq; unsigned long usec, sec, lost; - read_lock_irqsave(&xtime_lock, flags); - usec = mach_gettimeoffset(); - lost = jiffies - wall_jiffies; - if (lost) - usec += lost * (1000000/HZ); - sec = xtime.tv_sec; - usec += xtime.tv_nsec/1000; - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + + usec = mach_gettimeoffset(); + lost = jiffies - wall_jiffies; + if (lost) + usec += lost * (1000000/HZ); + sec = xtime.tv_sec; + usec += xtime.tv_nsec/1000; + } while (seq != fr_read_end(&xtime_lock)); + while (usec >= 1000000) { usec -= 1000000; @@ -162,7 +165,7 @@ void do_settimeofday(struct timeval *tv) { extern unsigned long wall_jiffies; - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* This is revolting. We need to set the xtime.tv_nsec * correctly. However, the value in this location is * is value at the last tick. 
@@ -183,5 +186,5 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/m68knommu/kernel/time.c 90-mjb/arch/m68knommu/kernel/time.c --- 00-virgin/arch/m68knommu/kernel/time.c Sun Nov 17 20:29:49 2002 +++ 90-mjb/arch/m68knommu/kernel/time.c Thu Feb 6 19:49:45 2003 @@ -126,21 +126,24 @@ void time_init(void) mach_sched_init(timer_interrupt); } -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; /* * This version of gettimeofday has near microsecond resolution. */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec; - read_lock_irqsave(&xtime_lock, flags); - usec = mach_gettimeoffset ? mach_gettimeoffset() : 0; - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + + usec = mach_gettimeoffset ? mach_gettimeoffset() : 0; + sec = xtime.tv_sec; + usec += (xtime.tv_nsec / 1000); + } while (seq != fr_read_end(&xtime_lock)); + while (usec >= 1000000) { usec -= 1000000; @@ -153,7 +156,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec * correctly. However, the value in this location is * is value at the last tick. 
@@ -174,5 +177,5 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips/au1000/common/time.c 90-mjb/arch/mips/au1000/common/time.c --- 00-virgin/arch/mips/au1000/common/time.c Sun Nov 17 20:29:31 2002 +++ 90-mjb/arch/mips/au1000/common/time.c Thu Feb 6 19:49:45 2003 @@ -44,7 +44,7 @@ unsigned long uart_baud_base; static unsigned long r4k_offset; /* Amount to increment compare reg each time */ static unsigned long r4k_cur; /* What counter should be at next timer irq */ -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; #define ALLINTS (IE_IRQ0 | IE_IRQ1 | IE_IRQ2 | IE_IRQ3 | IE_IRQ4 | IE_IRQ5) @@ -150,10 +150,10 @@ void __init time_init(void) set_cp0_status(ALLINTS); /* Read time from the RTC chipset. */ - write_lock_irqsave (&xtime_lock, flags); + fr_write_lock_irqsave (&xtime_lock, flags); xtime.tv_sec = get_mips_time(); xtime.tv_usec = 0; - write_unlock_irqrestore(&xtime_lock, flags); + fr_write_unlock_irqrestore(&xtime_lock, flags); } /* This is for machines which generate the exact clock. */ @@ -229,20 +229,23 @@ static unsigned long do_fast_gettimeoffs void do_gettimeofday(struct timeval *tv) { - unsigned int flags; + unsigned long seq; - read_lock_irqsave (&xtime_lock, flags); - *tv = xtime; - tv->tv_usec += do_fast_gettimeoffset(); + do { + seq = fr_read_begin(&xtime_lock); - /* - * xtime is atomically updated in timer_bh. jiffies - wall_jiffies - * is nonzero if the timer bottom half hasnt executed yet. - */ - if (jiffies - wall_jiffies) - tv->tv_usec += USECS_PER_JIFFY; + *tv = xtime; + tv->tv_usec += do_fast_gettimeoffset(); + + /* + * xtime is atomically updated in timer_bh. jiffies - wall_jiffies + * is nonzero if the timer bottom half hasnt executed yet. 
+ */ + if (jiffies - wall_jiffies) + tv->tv_usec += USECS_PER_JIFFY; + + } while (seq != fr_read_end(&xtime_lock)); - read_unlock_irqrestore (&xtime_lock, flags); if (tv->tv_usec >= 1000000) { tv->tv_usec -= 1000000; @@ -252,7 +255,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq (&xtime_lock); + fr_write_lock_irq (&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec correctly. * However, the value in this location is is value at the last tick. @@ -272,7 +275,7 @@ void do_settimeofday(struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq (&xtime_lock); + fr_write_unlock_irq (&xtime_lock); } /* diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips/baget/time.c 90-mjb/arch/mips/baget/time.c --- 00-virgin/arch/mips/baget/time.c Sun Nov 17 20:29:20 2002 +++ 90-mjb/arch/mips/baget/time.c Thu Feb 6 19:49:45 2003 @@ -23,7 +23,7 @@ #include -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; /* * To have precision clock, we need to fix available clock frequency @@ -79,20 +79,21 @@ void __init time_init(void) void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; - read_lock_irqsave (&xtime_lock, flags); - *tv = xtime; - read_unlock_irqrestore (&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + *tv = xtime; + } while (seq != fr_read_end(&xtime_lock)); } void do_settimeofday(struct timeval *tv) { - write_lock_irq (&xtime_lock); + fr_write_lock_irq (&xtime_lock); xtime = *tv; time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq (&xtime_lock); + fr_write_unlock_irq (&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips/dec/time.c 90-mjb/arch/mips/dec/time.c --- 00-virgin/arch/mips/dec/time.c Thu Jan 2 22:05:00 2003 +++ 90-mjb/arch/mips/dec/time.c Thu Feb 6 19:49:45 2003 
@@ -35,7 +35,7 @@ extern void (*board_time_init)(struct irqaction *irq); extern volatile unsigned long wall_jiffies; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; /* * Change this if you have some constant time drift @@ -210,20 +210,22 @@ static unsigned long (*do_gettimeoffset) */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; - read_lock_irqsave(&xtime_lock, flags); - *tv = xtime; - tv->tv_usec += do_gettimeoffset(); + do { + seq = fr_read_begin(&xtime_lock); + *tv = xtime; + tv->tv_usec += do_gettimeoffset(); - /* - * xtime is atomically updated in timer_bh. jiffies - wall_jiffies - * is nonzero if the timer bottom half hasnt executed yet. - */ - if (jiffies - wall_jiffies) - tv->tv_usec += USECS_PER_JIFFY; + /* + * xtime is atomically updated in timer_bh. jiffies - wall_jiffies + * is nonzero if the timer bottom half hasnt executed yet. + */ + if (jiffies - wall_jiffies) + tv->tv_usec += USECS_PER_JIFFY; + + } while (seq != fr_read_end(&xtime_lock)); - read_unlock_irqrestore(&xtime_lock, flags); if (tv->tv_usec >= 1000000) { tv->tv_usec -= 1000000; @@ -233,7 +235,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec * correctly. However, the value in this location is @@ -254,7 +256,7 @@ void do_settimeofday(struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } /* @@ -330,6 +332,7 @@ static inline void timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { volatile unsigned char dummy; + unsigned long seq; dummy = CMOS_READ(RTC_REG_C); /* ACK RTC Interrupt */ @@ -357,23 +360,27 @@ timer_interrupt(int irq, void *dev_id, s * CMOS clock accordingly every ~11 minutes. 
Set_rtc_mmss() has to be * called as close as possible to 500 ms before the new second starts. */ - read_lock(&xtime_lock); - if ((time_status & STA_UNSYNC) == 0 - && xtime.tv_sec > last_rtc_update + 660 - && xtime.tv_usec >= 500000 - tick / 2 - && xtime.tv_usec <= 500000 + tick / 2) { - if (set_rtc_mmss(xtime.tv_sec) == 0) - last_rtc_update = xtime.tv_sec; - else - /* do it again in 60 s */ - last_rtc_update = xtime.tv_sec - 600; - } + do { + seq = fr_read_begin(&xtime_lock); + + if ((time_status & STA_UNSYNC) == 0 + && xtime.tv_sec > last_rtc_update + 660 + && xtime.tv_usec >= 500000 - tick / 2 + && xtime.tv_usec <= 500000 + tick / 2) { + if (set_rtc_mmss(xtime.tv_sec) == 0) + last_rtc_update = xtime.tv_sec; + else + /* do it again in 60 s */ + last_rtc_update = xtime.tv_sec - 600; + } + } while (seq != fr_read_end(&xtime_lock)); + /* As we return to user mode fire off the other CPU schedulers.. this is basically because we don't yet share IRQ's around. This message is rigged to be safe on the 386 - basically it's a hack, so don't look closely for now.. 
*/ /*smp_message_pass(MSG_ALL_BUT_SELF, MSG_RESCHEDULE, 0L, 0); */ - read_unlock(&xtime_lock); + } static void r4k_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) @@ -470,10 +477,10 @@ void __init time_init(void) real_year = CMOS_READ(RTC_DEC_YEAR); year += real_year - 72 + 2000; - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); xtime.tv_sec = mktime(year, mon, day, hour, min, sec); xtime.tv_usec = 0; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); if (mips_cpu.options & MIPS_CPU_COUNTER) { write_32bit_cp0_register(CP0_COUNT, 0); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips/ite-boards/generic/time.c 90-mjb/arch/mips/ite-boards/generic/time.c --- 00-virgin/arch/mips/ite-boards/generic/time.c Sun Nov 17 20:29:32 2002 +++ 90-mjb/arch/mips/ite-boards/generic/time.c Thu Feb 6 19:49:45 2003 @@ -38,7 +38,7 @@ extern void enable_cpu_timer(void); extern volatile unsigned long wall_jiffies; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; unsigned long missed_heart_beats = 0; static long last_rtc_update = 0; @@ -119,6 +119,8 @@ static int set_rtc_mmss(unsigned long no */ void mips_timer_interrupt(struct pt_regs *regs) { + unsigned long seq; + if (r4k_offset == 0) goto null; @@ -133,18 +135,22 @@ void mips_timer_interrupt(struct pt_regs * within 500ms before the * next second starts, * thus the following code. 
*/ - read_lock(&xtime_lock); - if ((time_status & STA_UNSYNC) == 0 - && xtime.tv_sec > last_rtc_update + 660 - && xtime.tv_usec >= 500000 - (tick >> 1) - && xtime.tv_usec <= 500000 + (tick >> 1)) - if (set_rtc_mmss(xtime.tv_sec) == 0) - last_rtc_update = xtime.tv_sec; - else { - /* do it again in 60 s */ - last_rtc_update = xtime.tv_sec - 600; - } - read_unlock(&xtime_lock); + do { + seq = fr_read_begin(&xtime_lock); + + + if ((time_status & STA_UNSYNC) == 0 + && xtime.tv_sec > last_rtc_update + 660 + && xtime.tv_usec >= 500000 - (tick >> 1) + && xtime.tv_usec <= 500000 + (tick >> 1)) + if (set_rtc_mmss(xtime.tv_sec) == 0) + last_rtc_update = xtime.tv_sec; + else { + /* do it again in 60 s */ + last_rtc_update = xtime.tv_sec - 600; + } + + } while (seq != fr_read_end(&xtime_lock)); r4k_cur += r4k_offset; ack_r4ktimer(r4k_cur); @@ -247,10 +253,10 @@ void __init time_init(void) enable_cpu_timer(); /* Read time from the RTC chipset. */ - write_lock_irqsave (&xtime_lock, flags); + fr_write_lock_irqsave (&xtime_lock, flags); xtime.tv_sec = get_mips_time(); xtime.tv_usec = 0; - write_unlock_irqrestore(&xtime_lock, flags); + fr_write_unlock_irqrestore(&xtime_lock, flags); } /* This is for machines which generate the exact clock. */ @@ -332,20 +338,23 @@ static unsigned long do_fast_gettimeoffs void do_gettimeofday(struct timeval *tv) { - unsigned int flags; + unsigned int seq; - read_lock_irqsave (&xtime_lock, flags); - *tv = xtime; - tv->tv_usec += do_fast_gettimeoffset(); + do { + seq = fr_read_begin(&xtime_lock); - /* - * xtime is atomically updated in timer_bh. jiffies - wall_jiffies - * is nonzero if the timer bottom half hasnt executed yet. - */ - if (jiffies - wall_jiffies) - tv->tv_usec += USECS_PER_JIFFY; + *tv = xtime; + tv->tv_usec += do_fast_gettimeoffset(); + + /* + * xtime is atomically updated in timer_bh. + * jiffies - wall_jiffies + * is nonzero if the timer bottom half hasnt executed yet. 
+ */ + if (jiffies - wall_jiffies) + tv->tv_usec += USECS_PER_JIFFY; - read_unlock_irqrestore (&xtime_lock, flags); + } while (seq != fr_read_end(&xtime_lock)); if (tv->tv_usec >= 1000000) { tv->tv_usec -= 1000000; @@ -355,7 +364,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq (&xtime_lock); + fr_write_lock_irq (&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec correctly. * However, the value in this location is is value at the last tick. @@ -375,5 +384,5 @@ void do_settimeofday(struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq (&xtime_lock); + fr_write_unlock_irq (&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips/kernel/sysirix.c 90-mjb/arch/mips/kernel/sysirix.c --- 00-virgin/arch/mips/kernel/sysirix.c Thu Jan 2 22:05:00 2003 +++ 90-mjb/arch/mips/kernel/sysirix.c Thu Feb 6 19:49:45 2003 @@ -615,19 +615,19 @@ asmlinkage int irix_getgid(struct pt_reg return current->gid; } -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; asmlinkage int irix_stime(int value) { if (!capable(CAP_SYS_TIME)) return -EPERM; - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); xtime.tv_sec = value; xtime.tv_usec = 0; time_maxerror = MAXPHASE; time_esterror = MAXPHASE; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); return 0; } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips/kernel/time.c 90-mjb/arch/mips/kernel/time.c --- 00-virgin/arch/mips/kernel/time.c Sun Nov 17 20:29:32 2002 +++ 90-mjb/arch/mips/kernel/time.c Thu Feb 6 19:49:45 2003 @@ -37,7 +37,7 @@ u64 jiffies_64; /* * forward reference */ -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern volatile unsigned long wall_jiffies; /* @@ -62,20 +62,23 @@ int (*rtc_set_time)(unsigned long) = nul */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; - read_lock_irqsave 
(&xtime_lock, flags); - *tv = xtime; - tv->tv_usec += do_gettimeoffset(); + do { + seq = fr_read_begin(&xtime_lock); - /* - * xtime is atomically updated in timer_bh. jiffies - wall_jiffies - * is nonzero if the timer bottom half hasnt executed yet. - */ - if (jiffies - wall_jiffies) - tv->tv_usec += USECS_PER_JIFFY; + *tv = xtime; + tv->tv_usec += do_gettimeoffset(); + + /* + * xtime is atomically updated in timer_bh. + * jiffies - wall_jiffies + * is nonzero if the timer bottom half hasnt executed yet. + */ + if (jiffies - wall_jiffies) + tv->tv_usec += USECS_PER_JIFFY; + } while (seq != fr_read_end(&xtime_lock)); - read_unlock_irqrestore (&xtime_lock, flags); if (tv->tv_usec >= 1000000) { tv->tv_usec -= 1000000; @@ -85,7 +88,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq (&xtime_lock); + fr_write_lock_irq (&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec * correctly. However, the value in this location is @@ -105,7 +108,7 @@ void do_settimeofday(struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq (&xtime_lock); + fr_write_unlock_irq (&xtime_lock); } @@ -291,6 +294,8 @@ unsigned long calibrate_div64_gettimeoff */ void timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { + unsigned long seq; + if (mips_cpu.options & MIPS_CPU_COUNTER) { unsigned int count; @@ -340,19 +345,21 @@ void timer_interrupt(int irq, void *dev_ * CMOS clock accordingly every ~11 minutes. rtc_set_time() has to be * called as close as possible to 500 ms before the new second starts. 
*/ - read_lock (&xtime_lock); - if ((time_status & STA_UNSYNC) == 0 && - xtime.tv_sec > last_rtc_update + 660 && - xtime.tv_usec >= 500000 - ((unsigned) tick) / 2 && - xtime.tv_usec <= 500000 + ((unsigned) tick) / 2) { - if (rtc_set_time(xtime.tv_sec) == 0) { - last_rtc_update = xtime.tv_sec; - } else { - last_rtc_update = xtime.tv_sec - 600; - /* do it again in 60 s */ + do { + seq = fr_read_begin(&xtime_lock); + + if ((time_status & STA_UNSYNC) == 0 && + xtime.tv_sec > last_rtc_update + 660 && + xtime.tv_usec >= 500000 - ((unsigned) tick) / 2 && + xtime.tv_usec <= 500000 + ((unsigned) tick) / 2) { + if (rtc_set_time(xtime.tv_sec) == 0) { + last_rtc_update = xtime.tv_sec; + } else { + last_rtc_update = xtime.tv_sec - 600; + /* do it again in 60 s */ + } } - } - read_unlock (&xtime_lock); + } while (seq != fr_read_end(&xtime_lock)); /* * If jiffies has overflowed in this timer_interrupt we must diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips/mips-boards/generic/time.c 90-mjb/arch/mips/mips-boards/generic/time.c --- 00-virgin/arch/mips/mips-boards/generic/time.c Sun Nov 17 20:29:49 2002 +++ 90-mjb/arch/mips/mips-boards/generic/time.c Thu Feb 6 19:49:45 2003 @@ -45,7 +45,7 @@ unsigned long missed_heart_beats = 0; static unsigned long r4k_offset; /* Amount to increment compare reg each time */ static unsigned long r4k_cur; /* What counter should be at next timer irq */ -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; #define ALLINTS (IE_IRQ0 | IE_IRQ1 | IE_IRQ2 | IE_IRQ3 | IE_IRQ4 | IE_IRQ5) @@ -134,6 +134,7 @@ static int set_rtc_mmss(unsigned long no void mips_timer_interrupt(struct pt_regs *regs) { int irq = 7; + unsigned long seq; if (r4k_offset == 0) goto null; @@ -149,18 +150,21 @@ void mips_timer_interrupt(struct pt_regs * within 500ms before the * next second starts, * thus the following code. 
*/ - read_lock(&xtime_lock); - if ((time_status & STA_UNSYNC) == 0 - && xtime.tv_sec > last_rtc_update + 660 - && xtime.tv_usec >= 500000 - (tick >> 1) - && xtime.tv_usec <= 500000 + (tick >> 1)) - if (set_rtc_mmss(xtime.tv_sec) == 0) - last_rtc_update = xtime.tv_sec; - else - /* do it again in 60 s */ - last_rtc_update = xtime.tv_sec - 600; - read_unlock(&xtime_lock); + do { + seq = fr_read_begin(&xtime_lock); + + if ((time_status & STA_UNSYNC) == 0 + && xtime.tv_sec > last_rtc_update + 660 + && xtime.tv_usec >= 500000 - (tick >> 1) + && xtime.tv_usec <= 500000 + (tick >> 1)) + if (set_rtc_mmss(xtime.tv_sec) == 0) + last_rtc_update = xtime.tv_sec; + else + /* do it again in 60 s */ + last_rtc_update = xtime.tv_sec - 600; + } while (seq != fr_read_end(&xtime_lock)); + if ((timer_tick_count++ % HZ) == 0) { mips_display_message(&display_string[display_count++]); if (display_count == MAX_DISPLAY_COUNT) @@ -267,10 +271,10 @@ void __init time_init(void) change_cp0_status(ST0_IM, ALLINTS); /* Read time from the RTC chipset. */ - write_lock_irqsave (&xtime_lock, flags); + fr_write_lock_irqsave (&xtime_lock, flags); xtime.tv_sec = get_mips_time(); xtime.tv_usec = 0; - write_unlock_irqrestore(&xtime_lock, flags); + fr_write_unlock_irqrestore(&xtime_lock, flags); } /* This is for machines which generate the exact clock. */ @@ -363,20 +367,23 @@ static unsigned long do_fast_gettimeoffs void do_gettimeofday(struct timeval *tv) { - unsigned int flags; + unsigned long seq; + + do { + seq = fr_read_begin(&xtime_lock); - read_lock_irqsave (&xtime_lock, flags); - *tv = xtime; - tv->tv_usec += do_fast_gettimeoffset(); + *tv = xtime; + tv->tv_usec += do_fast_gettimeoffset(); - /* - * xtime is atomically updated in timer_bh. jiffies - wall_jiffies - * is nonzero if the timer bottom half hasnt executed yet. - */ - if (jiffies - wall_jiffies) - tv->tv_usec += USECS_PER_JIFFY; + /* + * xtime is atomically updated in timer_bh. 
+ * jiffies - wall_jiffies + * is nonzero if the timer bottom half hasnt executed yet. + */ + if (jiffies - wall_jiffies) + tv->tv_usec += USECS_PER_JIFFY; - read_unlock_irqrestore (&xtime_lock, flags); + } while (seq != fr_read_end(&xtime_lock)); if (tv->tv_usec >= 1000000) { tv->tv_usec -= 1000000; @@ -386,7 +393,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq (&xtime_lock); + fr_write_lock_irq (&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec correctly. * However, the value in this location is is value at the last tick. @@ -406,5 +413,5 @@ void do_settimeofday(struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq (&xtime_lock); + fr_write_unlock_irq (&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips/philips/nino/time.c 90-mjb/arch/mips/philips/nino/time.c --- 00-virgin/arch/mips/philips/nino/time.c Sun Nov 17 20:29:20 2002 +++ 90-mjb/arch/mips/philips/nino/time.c Thu Feb 6 19:49:45 2003 @@ -24,7 +24,7 @@ #include extern volatile unsigned long wall_jiffies; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; static struct timeval xbase; @@ -61,30 +61,31 @@ void inline readRTC(unsigned long *high, */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long high, low; - read_lock_irqsave(&xtime_lock, flags); - // 40 bit RTC, driven by 32khz source: - // +-----------+-----------------------------------------+ - // | HHHH.HHHH | LLLL.LLLL.LLLL.LLLL.LMMM.MMMM.MMMM.MMMM | - // +-----------+-----------------------------------------+ - readRTC(&high,&low); - tv->tv_sec = (high << 17) | (low >> 15); - tv->tv_usec = (low % 32768) * 1953 / 64; - tv->tv_sec += xbase.tv_sec; - tv->tv_usec += xbase.tv_usec; + do { + seq = fr_read_begin(&xtime_lock); - tv->tv_usec += do_gettimeoffset(); - - /* - * xtime is atomically updated in timer_bh. 
lost_ticks is - * nonzero if the timer bottom half hasnt executed yet. - */ - if (jiffies - wall_jiffies) - tv->tv_usec += USECS_PER_JIFFY; - - read_unlock_irqrestore(&xtime_lock, flags); + // 40 bit RTC, driven by 32khz source: + // +-----------+-----------------------------------------+ + // | HHHH.HHHH | LLLL.LLLL.LLLL.LLLL.LMMM.MMMM.MMMM.MMMM | + // +-----------+-----------------------------------------+ + readRTC(&high,&low); + tv->tv_sec = (high << 17) | (low >> 15); + tv->tv_usec = (low % 32768) * 1953 / 64; + tv->tv_sec += xbase.tv_sec; + tv->tv_usec += xbase.tv_usec; + + tv->tv_usec += do_gettimeoffset(); + + /* + * xtime is atomically updated in timer_bh. lost_ticks is + * nonzero if the timer bottom half hasnt executed yet. + */ + if (jiffies - wall_jiffies) + tv->tv_usec += USECS_PER_JIFFY; + } while (seq != fr_read_end(&xtime_lock)); if (tv->tv_usec >= 1000000) { tv->tv_usec -= 1000000; @@ -94,7 +95,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec * correctly. However, the value in this location is * is value at the last tick. 
@@ -118,7 +119,7 @@ void do_settimeofday(struct timeval *tv) time_state = TIME_BAD; time_maxerror = MAXPHASE; time_esterror = MAXPHASE; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } static int set_rtc_mmss(unsigned long nowtime) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips64/mips-boards/generic/time.c 90-mjb/arch/mips64/mips-boards/generic/time.c --- 00-virgin/arch/mips64/mips-boards/generic/time.c Sun Nov 17 20:29:27 2002 +++ 90-mjb/arch/mips64/mips-boards/generic/time.c Thu Feb 6 19:49:45 2003 @@ -44,7 +44,7 @@ unsigned long missed_heart_beats = 0; static unsigned long r4k_offset; /* Amount to increment compare reg each time */ static unsigned long r4k_cur; /* What counter should be at next timer irq */ -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; #define ALLINTS (IE_IRQ0 | IE_IRQ1 | IE_IRQ2 | IE_IRQ3 | IE_IRQ4 | IE_IRQ5) @@ -133,6 +133,7 @@ static int set_rtc_mmss(unsigned long no void mips_timer_interrupt(struct pt_regs *regs) { int irq = 7; + unsigned long seq; if (r4k_offset == 0) goto null; @@ -148,17 +149,20 @@ void mips_timer_interrupt(struct pt_regs * within 500ms before the * next second starts, * thus the following code. 
*/ - read_lock(&xtime_lock); - if ((time_status & STA_UNSYNC) == 0 - && xtime.tv_sec > last_rtc_update + 660 - && xtime.tv_usec >= 500000 - (tick >> 1) - && xtime.tv_usec <= 500000 + (tick >> 1)) - if (set_rtc_mmss(xtime.tv_sec) == 0) - last_rtc_update = xtime.tv_sec; - else - /* do it again in 60 s */ - last_rtc_update = xtime.tv_sec - 600; - read_unlock(&xtime_lock); + do { + seq = fr_read_begin(&xtime_lock); + + if ((time_status & STA_UNSYNC) == 0 + && xtime.tv_sec > last_rtc_update + 660 + && xtime.tv_usec >= 500000 - (tick >> 1) + && xtime.tv_usec <= 500000 + (tick >> 1)) + if (set_rtc_mmss(xtime.tv_sec) == 0) + last_rtc_update = xtime.tv_sec; + else + /* do it again in 60 s */ + last_rtc_update = xtime.tv_sec - 600; + } while (seq != fr_read_end(&xtime_lock)); + if ((timer_tick_count++ % HZ) == 0) { mips_display_message(&display_string[display_count++]); @@ -266,10 +270,10 @@ void __init time_init(void) set_cp0_status(ST0_IM, ALLINTS); /* Read time from the RTC chipset. */ - write_lock_irqsave (&xtime_lock, flags); + fr_write_lock_irqsave (&xtime_lock, flags); xtime.tv_sec = get_mips_time(); xtime.tv_usec = 0; - write_unlock_irqrestore(&xtime_lock, flags); + fr_write_unlock_irqrestore(&xtime_lock, flags); } /* This is for machines which generate the exact clock. */ @@ -352,20 +356,24 @@ static unsigned long do_fast_gettimeoffs void do_gettimeofday(struct timeval *tv) { - unsigned int flags; + unsigned long seq; - read_lock_irqsave (&xtime_lock, flags); - *tv = xtime; - tv->tv_usec += do_fast_gettimeoffset(); + do { + seq = fr_read_begin(&xtime_lock); - /* - * xtime is atomically updated in timer_bh. jiffies - wall_jiffies - * is nonzero if the timer bottom half hasnt executed yet. - */ - if (jiffies - wall_jiffies) - tv->tv_usec += USECS_PER_JIFFY; + *tv = xtime; + tv->tv_usec += do_fast_gettimeoffset(); + + /* + * xtime is atomically updated in timer_bh. + * jiffies - wall_jiffies + * is nonzero if the timer bottom half hasnt executed yet. 
+ */ + if (jiffies - wall_jiffies) + tv->tv_usec += USECS_PER_JIFFY; + + } while (seq != fr_read_end(&xtime_lock)); - read_unlock_irqrestore (&xtime_lock, flags); if (tv->tv_usec >= 1000000) { tv->tv_usec -= 1000000; @@ -375,7 +383,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq (&xtime_lock); + fr_write_lock_irq (&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec correctly. * However, the value in this location is is value at the last tick. @@ -395,5 +403,5 @@ void do_settimeofday(struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq (&xtime_lock); + fr_write_unlock_irq (&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips64/sgi-ip22/ip22-timer.c 90-mjb/arch/mips64/sgi-ip22/ip22-timer.c --- 00-virgin/arch/mips64/sgi-ip22/ip22-timer.c Sun Nov 17 20:29:46 2002 +++ 90-mjb/arch/mips64/sgi-ip22/ip22-timer.c Thu Feb 6 19:49:45 2003 @@ -32,7 +32,7 @@ static unsigned long r4k_offset; /* Amount to increment compare reg each time */ static unsigned long r4k_cur; /* What counter should be at next timer irq */ -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; static inline void ack_r4ktimer(unsigned long newval) { @@ -86,7 +86,7 @@ void indy_timer_interrupt(struct pt_regs unsigned long count; int irq = 7; - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); /* Ack timer and compute new compare. */ count = read_32bit_cp0_register(CP0_COUNT); /* This has races. 
*/ @@ -116,7 +116,7 @@ void indy_timer_interrupt(struct pt_regs /* do it again in 60s */ last_rtc_update = xtime.tv_sec - 600; } - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } static unsigned long dosample(volatile unsigned char *tcwp, @@ -224,10 +224,10 @@ void __init indy_timer_init(void) set_cp0_status(ST0_IM, ALLINTS); sti(); - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); xtime.tv_sec = get_indy_time(); /* Read time from RTC. */ xtime.tv_usec = 0; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } void indy_8254timer_irq(void) @@ -243,20 +243,21 @@ void indy_8254timer_irq(void) void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; - read_lock_irqsave(&xtime_lock, flags); - *tv = xtime; - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + *tv = xtime; + } while (seq != fr_read_end(&xtime_lock)); } void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); xtime = *tv; time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/mips64/sgi-ip27/ip27-timer.c 90-mjb/arch/mips64/sgi-ip27/ip27-timer.c --- 00-virgin/arch/mips64/sgi-ip27/ip27-timer.c Thu Jan 2 22:05:00 2003 +++ 90-mjb/arch/mips64/sgi-ip27/ip27-timer.c Thu Feb 6 19:49:45 2003 @@ -40,7 +40,7 @@ static unsigned long ct_cur[NR_CPUS]; /* What counter should be at next timer irq */ static long last_rtc_update; /* Last time the rtc clock got updated */ -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern volatile unsigned long wall_jiffies; @@ -94,7 +94,7 @@ void rt_timer_interrupt(struct pt_regs * int cpuA = ((cputoslice(cpu)) == 0); int irq = 7; /* XXX Assign number */ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); 
again: LOCAL_HUB_S(cpuA ? PI_RT_PEND_A : PI_RT_PEND_B, 0); /* Ack */ @@ -145,7 +145,7 @@ again: } } - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); if (softirq_pending(cpu)) do_softirq(); @@ -160,19 +160,21 @@ unsigned long inline do_gettimeoffset(vo void do_gettimeofday(struct timeval *tv) { - unsigned long flags; unsigned long usec, sec; + unsigned long seq; - read_lock_irqsave(&xtime_lock, flags); - usec = do_gettimeoffset(); - { - unsigned long lost = jiffies - wall_jiffies; - if (lost) - usec += lost * (1000000 / HZ); - } - sec = xtime.tv_sec; - usec += xtime.tv_usec; - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + + usec = do_gettimeoffset(); + { + unsigned long lost = jiffies - wall_jiffies; + if (lost) + usec += lost * (1000000 / HZ); + } + sec = xtime.tv_sec; + usec += xtime.tv_usec; + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; @@ -185,7 +187,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); tv->tv_usec -= do_gettimeoffset(); tv->tv_usec -= (jiffies - wall_jiffies) * (1000000 / HZ); @@ -199,7 +201,7 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } /* Includes for ioc3_init(). 
*/ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/parisc/kernel/sys_parisc32.c 90-mjb/arch/parisc/kernel/sys_parisc32.c --- 00-virgin/arch/parisc/kernel/sys_parisc32.c Tue Jan 14 10:06:14 2003 +++ 90-mjb/arch/parisc/kernel/sys_parisc32.c Thu Feb 6 19:49:45 2003 @@ -2428,22 +2428,25 @@ struct sysinfo32 { asmlinkage int sys32_sysinfo(struct sysinfo32 *info) { struct sysinfo val; + unsigned long seq; int err; - extern rwlock_t xtime_lock; + extern frlock_t xtime_lock; /* We don't need a memset here because we copy the * struct to userspace once element at a time. */ - read_lock_irq(&xtime_lock); - val.uptime = jiffies / HZ; + do { + seq = fr_read_begin(&xtime_lock); - val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + val.uptime = jiffies / HZ; + + val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); + val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); + val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); - val.procs = nr_threads; - read_unlock_irq(&xtime_lock); + val.procs = nr_threads; + } while (seq != fr_read_end(&xtime_lock)); si_meminfo(&val); si_swapinfo(&val); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/parisc/kernel/time.c 90-mjb/arch/parisc/kernel/time.c --- 00-virgin/arch/parisc/kernel/time.c Tue Jan 14 10:06:14 2003 +++ 90-mjb/arch/parisc/kernel/time.c Thu Feb 6 19:49:45 2003 @@ -36,7 +36,7 @@ u64 jiffies_64; /* xtime and wall_jiffies keep wall-clock time */ extern unsigned long wall_jiffies; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; static long clocktick; /* timer cycles per tick */ static long halftick; @@ -115,9 +115,9 @@ void timer_interrupt(int irq, void *dev_ smp_do_timer(regs); #endif if (cpu == 0) { - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); do_timer(regs); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } } @@ -172,16 +172,18 @@ gettimeoffset (void) void 
do_gettimeofday (struct timeval *tv) { - unsigned long flags, usec, sec; + unsigned long seq, usec, sec; - read_lock_irqsave(&xtime_lock, flags); - { - usec = gettimeoffset(); + do { + seq = fr_read_begin(&xtime_lock); + + { + usec = gettimeoffset(); - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - } - read_unlock_irqrestore(&xtime_lock, flags); + sec = xtime.tv_sec; + usec += (xtime.tv_nsec / 1000); + } + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; @@ -195,7 +197,7 @@ do_gettimeofday (struct timeval *tv) void do_settimeofday (struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); { /* * This is revolting. We need to set "xtime" @@ -219,7 +221,7 @@ do_settimeofday (struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; } - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } @@ -241,10 +243,10 @@ void __init time_init(void) mtctl(next_tick, 16); if(pdc_tod_read(&tod_data) == 0) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); xtime.tv_sec = tod_data.tod_sec; xtime.tv_nsec = tod_data.tod_usec * 1000; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } else { printk(KERN_ERR "Error reading tod clock\n"); xtime.tv_sec = 0; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/ppc/kernel/time.c 90-mjb/arch/ppc/kernel/time.c --- 00-virgin/arch/ppc/kernel/time.c Thu Jan 9 19:15:58 2003 +++ 90-mjb/arch/ppc/kernel/time.c Thu Feb 6 19:49:45 2003 @@ -76,7 +76,7 @@ extern struct timezone sys_tz; /* keep track of when we need to update the rtc */ time_t last_rtc_update; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; /* The decrementer counts down by 128 every 128ns on a 601. 
*/ #define DECREMENTER_COUNT_601 (1000000000 / HZ) @@ -161,7 +161,7 @@ void timer_interrupt(struct pt_regs * re continue; /* We are in an interrupt, no need to save/restore flags */ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); tb_last_stamp = jiffy_stamp; do_timer(regs); @@ -191,7 +191,7 @@ void timer_interrupt(struct pt_regs * re /* Try again one minute later */ last_rtc_update += 60; } - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } if ( !disarm_decr[smp_processor_id()] ) set_dec(next_dec); @@ -212,22 +212,23 @@ void timer_interrupt(struct pt_regs * re */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned delta, lost_ticks, usec, sec; - read_lock_irqsave(&xtime_lock, flags); - sec = xtime.tv_sec; - usec = (xtime.tv_nsec / 1000); - delta = tb_ticks_since(tb_last_stamp); + do { + seq = fr_read_begin(&xtime_lock); + sec = xtime.tv_sec; + usec = (xtime.tv_nsec / 1000); + delta = tb_ticks_since(tb_last_stamp); #ifdef CONFIG_SMP - /* As long as timebases are not in sync, gettimeofday can only - * have jiffy resolution on SMP. - */ - if (!smp_tb_synchronized) - delta = 0; + /* As long as timebases are not in sync, gettimeofday can only + * have jiffy resolution on SMP. + */ + if (!smp_tb_synchronized) + delta = 0; #endif /* CONFIG_SMP */ - lost_ticks = jiffies - wall_jiffies; - read_unlock_irqrestore(&xtime_lock, flags); + lost_ticks = jiffies - wall_jiffies; + } while (seq != fr_read_end(&xtime_lock)); usec += mulhwu(tb_to_us, tb_ticks_per_jiffy * lost_ticks + delta); while (usec >= 1000000) { @@ -243,7 +244,7 @@ void do_settimeofday(struct timeval *tv) unsigned long flags; int tb_delta, new_usec, new_sec; - write_lock_irqsave(&xtime_lock, flags); + fr_write_lock_irqsave(&xtime_lock, flags); /* Updating the RTC is not the job of this code. If the time is * stepped under NTP, the RTC will be update after STA_UNSYNC * is cleared. 
Tool like clock/hwclock either copy the RTC @@ -283,7 +284,7 @@ void do_settimeofday(struct timeval *tv) time_state = TIME_ERROR; /* p. 24, (a) */ time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irqrestore(&xtime_lock, flags); + fr_write_unlock_irqrestore(&xtime_lock, flags); } /* This function is only called on the boot processor */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/ppc/platforms/pmac_time.c 90-mjb/arch/ppc/platforms/pmac_time.c --- 00-virgin/arch/ppc/platforms/pmac_time.c Sun Nov 17 20:29:29 2002 +++ 90-mjb/arch/ppc/platforms/pmac_time.c Thu Feb 6 19:49:45 2003 @@ -29,7 +29,7 @@ #include #include -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; /* Apparently the RTC stores seconds since 1 Jan 1904 */ #define RTC_OFFSET 2082844800 @@ -218,16 +218,17 @@ time_sleep_notify(struct pmu_sleep_notif switch (when) { case PBOOK_SLEEP_NOW: - read_lock_irqsave(&xtime_lock, flags); - time_diff = xtime.tv_sec - pmac_get_rtc_time(); - read_unlock_irqrestore(&xtime_lock, flags); + do { + flags = fr_read_begin(&xtime_lock); + time_diff = xtime.tv_sec - pmac_get_rtc_time(); + } while (flags != fr_read_end(&xtime_lock)); break; case PBOOK_WAKE: - write_lock_irqsave(&xtime_lock, flags); + fr_write_lock_irqsave(&xtime_lock, flags); xtime.tv_sec = pmac_get_rtc_time() + time_diff; xtime.tv_nsec = 0; last_rtc_update = xtime.tv_sec; - write_unlock_irqrestore(&xtime_lock, flags); + fr_write_unlock_irqrestore(&xtime_lock, flags); break; } return PBOOK_SLEEP_OK; } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/ppc64/kernel/time.c 90-mjb/arch/ppc64/kernel/time.c --- 00-virgin/arch/ppc64/kernel/time.c Thu Jan 9 19:16:00 2003 +++ 90-mjb/arch/ppc64/kernel/time.c Thu Feb 6 19:49:45 2003 @@ -69,7 +69,7 @@ u64 jiffies_64; /* keep track of when we need to update the rtc */ time_t last_rtc_update; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern int piranha_simulator; #ifdef CONFIG_PPC_ISERIES unsigned long
iSeries_recal_titan = 0; @@ -284,12 +284,12 @@ int timer_interrupt(struct pt_regs * reg smp_local_timer_interrupt(regs); #endif if (cpu == boot_cpuid) { - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); tb_last_stamp = lpaca->next_jiffy_update_tb; do_timer(regs); timer_sync_xtime( cur_tb ); timer_check_rtc(); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); if ( adjusting_time && (time_adjust == 0) ) ppc_adjtimex(); } @@ -348,7 +348,7 @@ void do_settimeofday(struct timeval *tv) long int tb_delta, new_usec, new_sec; unsigned long new_xsec; - write_lock_irqsave(&xtime_lock, flags); + fr_write_lock_irqsave(&xtime_lock, flags); /* Updating the RTC is not the job of this code. If the time is * stepped under NTP, the RTC will be update after STA_UNSYNC * is cleared. Tool like clock/hwclock either copy the RTC @@ -399,7 +399,7 @@ void do_settimeofday(struct timeval *tv) do_gtod.tb_orig_stamp = tb_last_stamp; } - write_unlock_irqrestore(&xtime_lock, flags); + fr_write_unlock_irqrestore(&xtime_lock, flags); } /* @@ -465,7 +465,7 @@ void __init time_init(void) #endif ppc_md.get_boot_time(&tm); - write_lock_irqsave(&xtime_lock, flags); + fr_write_lock_irqsave(&xtime_lock, flags); xtime.tv_sec = mktime(tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec); tb_last_stamp = get_tb(); @@ -484,7 +484,7 @@ void __init time_init(void) xtime.tv_nsec = 0; last_rtc_update = xtime.tv_sec; - write_unlock_irqrestore(&xtime_lock, flags); + fr_write_unlock_irqrestore(&xtime_lock, flags); /* Not exact, but the timer interrupt takes care of this */ set_dec(tb_ticks_per_jiffy); @@ -587,7 +587,7 @@ void ppc_adjtimex(void) new_tb_to_xs = divres.result_low; new_xsec = mulhdu( tb_ticks, new_tb_to_xs ); - write_lock_irqsave( &xtime_lock, flags ); + fr_write_lock_irqsave( &xtime_lock, flags ); old_xsec = mulhdu( tb_ticks, do_gtod.varp->tb_to_xs ); new_stamp_xsec = do_gtod.varp->stamp_xsec + old_xsec - new_xsec; @@ -609,7 +609,7 @@ void ppc_adjtimex(void) 
do_gtod.varp = temp_varp; do_gtod.var_idx = temp_idx; - write_unlock_irqrestore( &xtime_lock, flags ); + fr_write_unlock_irqrestore( &xtime_lock, flags ); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/s390/kernel/time.c 90-mjb/arch/s390/kernel/time.c --- 00-virgin/arch/s390/kernel/time.c Sun Dec 1 09:59:46 2002 +++ 90-mjb/arch/s390/kernel/time.c Thu Feb 6 19:49:45 2003 @@ -52,7 +52,7 @@ static ext_int_info_t ext_int_info_timer static uint64_t xtime_cc; static uint64_t init_timer_cc; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long wall_jiffies; void tod_to_timeval(__u64 todval, struct timespec *xtime) @@ -82,13 +82,15 @@ static inline unsigned long do_gettimeof */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec; - read_lock_irqsave(&xtime_lock, flags); - sec = xtime.tv_sec; - usec = xtime.tv_nsec / 1000 + do_gettimeoffset(); - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + + sec = xtime.tv_sec; + usec = xtime.tv_nsec / 1000 + do_gettimeoffset(); + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; @@ -102,7 +104,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* This is revolting. We need to set the xtime.tv_nsec * correctly. However, the value in this location is * is value at the last tick. @@ -122,7 +124,7 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } static inline __u32 div64_32(__u64 dividend, __u32 divisor) @@ -166,7 +168,7 @@ static void do_comparator_interrupt(stru * Do not rely on the boot cpu to do the calls to do_timer. * Spread it over all cpus instead. 
*/ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); if (S390_lowcore.jiffy_timer > xtime_cc) { __u32 xticks; @@ -181,7 +183,7 @@ static void do_comparator_interrupt(stru while (xticks--) do_timer(regs); } - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); while (ticks--) update_process_times(user_mode(regs)); #else diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/s390x/kernel/time.c 90-mjb/arch/s390x/kernel/time.c --- 00-virgin/arch/s390x/kernel/time.c Sun Dec 1 09:59:46 2002 +++ 90-mjb/arch/s390x/kernel/time.c Thu Feb 6 19:49:45 2003 @@ -51,7 +51,7 @@ static ext_int_info_t ext_int_info_timer static uint64_t xtime_cc; static uint64_t init_timer_cc; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long wall_jiffies; void tod_to_timeval(__u64 todval, struct timespec *xtime) @@ -77,13 +77,14 @@ static inline unsigned long do_gettimeof */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec; - read_lock_irqsave(&xtime_lock, flags); - sec = xtime.tv_sec; - usec = xtime.tv_nsec + do_gettimeoffset(); - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + sec = xtime.tv_sec; + usec = xtime.tv_nsec + do_gettimeoffset(); + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; @@ -97,7 +98,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* This is revolting. We need to set the xtime.tv_usec * correctly. However, the value in this location is * is value at the last tick. 
@@ -117,7 +118,7 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } /* @@ -152,7 +153,7 @@ static void do_comparator_interrupt(stru * Do not rely on the boot cpu to do the calls to do_timer. * Spread it over all cpus instead. */ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); if (S390_lowcore.jiffy_timer > xtime_cc) { __u32 xticks; @@ -167,7 +168,7 @@ static void do_comparator_interrupt(stru while (xticks--) do_timer(regs); } - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); while (ticks--) update_process_times(user_mode(regs)); #else diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/sh/kernel/time.c 90-mjb/arch/sh/kernel/time.c --- 00-virgin/arch/sh/kernel/time.c Sun Nov 17 20:29:20 2002 +++ 90-mjb/arch/sh/kernel/time.c Thu Feb 6 19:49:45 2003 @@ -72,7 +72,7 @@ u64 jiffies_64; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long wall_jiffies; #define TICK_SIZE tick @@ -127,19 +127,20 @@ static unsigned long do_gettimeoffset(vo void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec; - read_lock_irqsave(&xtime_lock, flags); - usec = do_gettimeoffset(); - { - unsigned long lost = jiffies - wall_jiffies; - if (lost) - usec += lost * (1000000 / HZ); - } - sec = xtime.tv_sec; - usec += xtime.tv_usec; - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + usec = do_gettimeoffset(); + { + unsigned long lost = jiffies - wall_jiffies; + if (lost) + usec += lost * (1000000 / HZ); + } + sec = xtime.tv_sec; + usec += xtime.tv_usec; + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; @@ -152,7 +153,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + 
fr_write_lock_irq(&xtime_lock); /* * This is revolting. We need to set "xtime" correctly. However, the * value in this location is the value at the most recent update of @@ -172,7 +173,7 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } /* last time the RTC clock got updated */ @@ -231,9 +232,9 @@ static void timer_interrupt(int irq, voi * the irq version of write_lock because as just said we have irq * locally disabled. -arca */ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); do_timer_interrupt(irq, NULL, regs); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } static unsigned int __init get_timer_frequency(void) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/sparc/kernel/pcic.c 90-mjb/arch/sparc/kernel/pcic.c --- 00-virgin/arch/sparc/kernel/pcic.c Sun Nov 17 20:29:46 2002 +++ 90-mjb/arch/sparc/kernel/pcic.c Thu Feb 6 19:49:45 2003 @@ -34,7 +34,7 @@ #include #include -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; #ifndef CONFIG_PCI @@ -739,10 +739,10 @@ static void pcic_clear_clock_irq(void) static void pcic_timer_handler (int irq, void *h, struct pt_regs *regs) { - write_lock(&xtime_lock); /* Dummy, to show that we remember */ + fr_write_lock(&xtime_lock); /* Dummy, to show that we remember */ pcic_clear_clock_irq(); do_timer(regs); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } #define USECS_PER_JIFFY 10000 /* We have 100HZ "standard" timer for sparc */ @@ -794,19 +794,20 @@ extern unsigned long wall_jiffies; static void pci_do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec; - read_lock_irqsave(&xtime_lock, flags); - usec = do_gettimeoffset(); - { - unsigned long lost = jiffies - wall_jiffies; - if (lost) - usec += lost * (1000000 / HZ); - } - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 
1000); - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + usec = do_gettimeoffset(); + { + unsigned long lost = jiffies - wall_jiffies; + if (lost) + usec += lost * (1000000 / HZ); + } + sec = xtime.tv_sec; + usec += (xtime.tv_nsec / 1000); + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/sparc/kernel/time.c 90-mjb/arch/sparc/kernel/time.c --- 00-virgin/arch/sparc/kernel/time.c Sun Nov 17 20:29:47 2002 +++ 90-mjb/arch/sparc/kernel/time.c Thu Feb 6 19:49:45 2003 @@ -42,7 +42,7 @@ #include #include -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long wall_jiffies; @@ -131,7 +131,7 @@ void timer_interrupt(int irq, void *dev_ #endif /* Protect counter clear so that do_gettimeoffset works */ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); #ifdef CONFIG_SUN4 if((idprom->id_machtype == (SM_SUN4 | SM_4_260)) || (idprom->id_machtype == (SM_SUN4 | SM_4_110))) { @@ -155,7 +155,7 @@ void timer_interrupt(int irq, void *dev_ else last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */ } - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } /* Kick start a stopped clock (procedure from the Sun NVRAM/hostid FAQ). 
*/ @@ -469,19 +469,20 @@ extern __inline__ unsigned long do_getti */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec; - read_lock_irqsave(&xtime_lock, flags); - usec = do_gettimeoffset(); - { - unsigned long lost = jiffies - wall_jiffies; - if (lost) - usec += lost * (1000000 / HZ); - } - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + usec = do_gettimeoffset(); + { + unsigned long lost = jiffies - wall_jiffies; + if (lost) + usec += lost * (1000000 / HZ); + } + sec = xtime.tv_sec; + usec += (xtime.tv_nsec / 1000); + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; @@ -494,9 +495,9 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); bus_do_settimeofday(tv); - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } static void sbus_do_settimeofday(struct timeval *tv) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/sparc64/kernel/rtrap.S 90-mjb/arch/sparc64/kernel/rtrap.S --- 00-virgin/arch/sparc64/kernel/rtrap.S Sun Nov 17 20:29:45 2002 +++ 90-mjb/arch/sparc64/kernel/rtrap.S Wed Feb 5 22:23:05 2003 @@ -33,7 +33,7 @@ __handle_softirq: ba,a,pt %xcc, __handle_softirq_continue nop __handle_preemption: - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate ba,pt %xcc, __handle_preemption_continue wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate @@ -48,7 +48,7 @@ __handle_user_windows: be,pt %xcc, 1f nop - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate ldx [%g6 + TI_FLAGS], %l0 @@ -92,7 +92,7 @@ __handle_perfctrs: be,pt %xcc, 1f nop - call schedule + call user_schedule wrpr %g0, RTRAP_PSTATE, %pstate wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate ldx [%g6 + TI_FLAGS], %l0 @@ -271,7 +271,7 @@ to_kernel: 
brnz %l5, kern_fpucheck sethi %hi(PREEMPT_ACTIVE), %l6 stw %l6, [%g6 + TI_PRE_COUNT] - call schedule + call user_schedule nop ba,pt %xcc, rtrap stw %g0, [%g6 + TI_PRE_COUNT] diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/sparc64/kernel/time.c 90-mjb/arch/sparc64/kernel/time.c --- 00-virgin/arch/sparc64/kernel/time.c Thu Jan 2 22:05:01 2003 +++ 90-mjb/arch/sparc64/kernel/time.c Thu Feb 6 19:49:45 2003 @@ -37,7 +37,7 @@ #include #include -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; spinlock_t mostek_lock = SPIN_LOCK_UNLOCKED; spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; @@ -134,7 +134,7 @@ static void timer_interrupt(int irq, voi { unsigned long ticks, pstate; - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); do { #ifndef CONFIG_SMP @@ -196,13 +196,13 @@ static void timer_interrupt(int irq, voi timer_check_rtc(); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } #ifdef CONFIG_SMP void timer_tick_interrupt(struct pt_regs *regs) { - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); do_timer(regs); @@ -225,7 +225,7 @@ void timer_tick_interrupt(struct pt_regs timer_check_rtc(); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } #endif @@ -665,7 +665,7 @@ void do_settimeofday(struct timeval *tv) if (this_is_starfire) return; - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); /* * This is revolting. We need to set "xtime" correctly. However, the * value in this location is the value at the most recent update of @@ -686,7 +686,7 @@ void do_settimeofday(struct timeval *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } /* Ok, my cute asm atomicity trick doesn't work anymore. 
@@ -695,19 +695,20 @@ void do_settimeofday(struct timeval *tv) */ void do_gettimeofday(struct timeval *tv) { - unsigned long flags; + unsigned long seq; unsigned long usec, sec; - read_lock_irqsave(&xtime_lock, flags); - usec = do_gettimeoffset(); - { - unsigned long lost = jiffies - wall_jiffies; - if (lost) - usec += lost * (1000000 / HZ); - } - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - read_unlock_irqrestore(&xtime_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + usec = do_gettimeoffset(); + { + unsigned long lost = jiffies - wall_jiffies; + if (lost) + usec += lost * (1000000 / HZ); + } + sec = xtime.tv_sec; + usec += (xtime.tv_nsec / 1000); + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/um/kernel/time_kern.c 90-mjb/arch/um/kernel/time_kern.c --- 00-virgin/arch/um/kernel/time_kern.c Thu Jan 2 22:05:02 2003 +++ 90-mjb/arch/um/kernel/time_kern.c Thu Feb 6 19:49:45 2003 @@ -21,7 +21,7 @@ u64 jiffies_64; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; int hz(void) { @@ -57,9 +57,9 @@ void boot_timer_handler(int sig) void um_timer(int irq, void *dev, struct pt_regs *regs) { do_timer(regs); - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); timer(); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } long um_time(int * tloc) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/v850/kernel/time.c 90-mjb/arch/v850/kernel/time.c --- 00-virgin/arch/v850/kernel/time.c Mon Dec 23 23:01:49 2002 +++ 90-mjb/arch/v850/kernel/time.c Thu Feb 6 19:49:45 2003 @@ -107,7 +107,7 @@ static void timer_interrupt (int irq, vo #endif /* 0 */ } -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; /* * This version of gettimeofday has near microsecond resolution. 
@@ -118,23 +118,25 @@ void do_gettimeofday (struct timeval *tv extern volatile unsigned long lost_ticks; unsigned long lost; #endif - unsigned long flags; unsigned long usec, sec; + unsigned long seq; + + do { + seq = fr_read_begin(&xtime_lock); - read_lock_irqsave (&xtime_lock, flags); #if 0 - usec = mach_gettimeoffset ? mach_gettimeoffset () : 0; + usec = mach_gettimeoffset ? mach_gettimeoffset () : 0; #else - usec = 0; + usec = 0; #endif #if 0 /* DAVIDM later if possible */ - lost = lost_ticks; - if (lost) - usec += lost * (1000000/HZ); + lost = lost_ticks; + if (lost) + usec += lost * (1000000/HZ); #endif - sec = xtime.tv_sec; - usec += xtime.tv_nsec / 1000; - read_unlock_irqrestore (&xtime_lock, flags); + sec = xtime.tv_sec; + usec += xtime.tv_nsec / 1000; + } while (seq != fr_read_end(&xtime_lock)); while (usec >= 1000000) { usec -= 1000000; @@ -147,7 +149,7 @@ void do_gettimeofday (struct timeval *tv void do_settimeofday (struct timeval *tv) { - write_lock_irq (&xtime_lock); + fr_write_lock_irq (&xtime_lock); /* This is revolting. We need to set the xtime.tv_nsec * correctly. 
However, the value in this location is @@ -172,7 +174,7 @@ void do_settimeofday (struct timeval *tv time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq (&xtime_lock); + fr_write_unlock_irq (&xtime_lock); } static int timer_dev_id; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/x86_64/kernel/early_printk.c 90-mjb/arch/x86_64/kernel/early_printk.c --- 00-virgin/arch/x86_64/kernel/early_printk.c Sun Nov 17 20:29:50 2002 +++ 90-mjb/arch/x86_64/kernel/early_printk.c Wed Dec 31 16:00:00 1969 @@ -1,218 +0,0 @@ -#include -#include -#include -#include -#include - -/* Simple VGA output */ - -#define VGABASE 0xffffffff800b8000UL - -#define MAX_YPOS 25 -#define MAX_XPOS 80 - -static int current_ypos = 1, current_xpos = 0; - -static void early_vga_write(struct console *con, const char *str, unsigned n) -{ - char c; - int i, k, j; - - while ((c = *str++) != '\0' && n-- > 0) { - if (current_ypos >= MAX_YPOS) { - /* scroll 1 line up */ - for(k = 1, j = 0; k < MAX_YPOS; k++, j++) { - for(i = 0; i < MAX_XPOS; i++) { - writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), - VGABASE + 2*(MAX_XPOS*j + i)); - } - } - for(i = 0; i < MAX_XPOS; i++) { - writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); - } - current_ypos = MAX_YPOS-1; - } - if (c == '\n') { - current_xpos = 0; - current_ypos++; - } else if (c != '\r') { - writew(((0x7 << 8) | (unsigned short) c), - VGABASE + 2*(MAX_XPOS*current_ypos + current_xpos++)); - if (current_xpos >= MAX_XPOS) { - current_xpos = 0; - current_ypos++; - } - } - } -} - -static struct console early_vga_console = { - .name = "earlyvga", - .write = early_vga_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -/* Serial functions losely based on a similar package from Klaus P. 
Gerlicher */ - -int early_serial_base = 0x3f8; /* ttyS0 */ - -#define XMTRDY 0x20 - -#define DLAB 0x80 - -#define TXR 0 /* Transmit register (WRITE) */ -#define RXR 0 /* Receive register (READ) */ -#define IER 1 /* Interrupt Enable */ -#define IIR 2 /* Interrupt ID */ -#define FCR 2 /* FIFO control */ -#define LCR 3 /* Line control */ -#define MCR 4 /* Modem control */ -#define LSR 5 /* Line Status */ -#define MSR 6 /* Modem Status */ -#define DLL 0 /* Divisor Latch Low */ -#define DLH 1 /* Divisor latch High */ - -static int early_serial_putc(unsigned char ch) -{ - unsigned timeout = 0xffff; - while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) - rep_nop(); - outb(ch, early_serial_base + TXR); - return timeout ? 0 : -1; -} - -static void early_serial_write(struct console *con, const char *s, unsigned n) -{ - while (*s && n-- > 0) { - early_serial_putc(*s); - if (*s == '\n') - early_serial_putc('\r'); - s++; - } -} - -static __init void early_serial_init(char *opt) -{ - unsigned char c; - unsigned divisor, baud = 38400; - char *s, *e; - - if (*opt == ',') - ++opt; - - s = strsep(&opt, ","); - if (s != NULL) { - unsigned port; - if (!strncmp(s,"0x",2)) - early_serial_base = simple_strtoul(s, &e, 16); - else { - static int bases[] = { 0x3f8, 0x2f8 }; - if (!strncmp(s,"ttyS",4)) - s+=4; - port = simple_strtoul(s, &e, 10); - if (port > 1 || s == e) - port = 0; - early_serial_base = bases[port]; - } - } - - outb(0x3, early_serial_base + LCR); /* 8n1 */ - outb(0, early_serial_base + IER); /* no interrupt */ - outb(0, early_serial_base + FCR); /* no fifo */ - outb(0x3, early_serial_base + MCR); /* DTR + RTS */ - - s = strsep(&opt, ","); - if (s != NULL) { - baud = simple_strtoul(s, &e, 0); - if (baud == 0 || s == e) - baud = 38400; - } - - divisor = 115200 / baud; - c = inb(early_serial_base + LCR); - outb(c | DLAB, early_serial_base + LCR); - outb(divisor & 0xff, early_serial_base + DLL); - outb((divisor >> 8) & 0xff, early_serial_base + DLH); - outb(c & 
~DLAB, early_serial_base + LCR); -} - -static struct console early_serial_console = { - .name = "earlyser", - .write = early_serial_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -/* Direct interface for emergencies */ -struct console *early_console = &early_vga_console; -static int early_console_initialized = 0; - -void early_printk(const char *fmt, ...) -{ - char buf[512]; - int n; - va_list ap; - va_start(ap,fmt); - n = vsnprintf(buf,512,fmt,ap); - early_console->write(early_console,buf,n); - va_end(ap); -} - -static int keep_early; - -int __init setup_early_printk(char *opt) -{ - char *space; - char buf[256]; - - if (early_console_initialized) - return -1; - - strncpy(buf,opt,256); - buf[255] = 0; - space = strchr(buf, ' '); - if (space) - *space = 0; - - if (strstr(buf,"keep")) - keep_early = 1; - - if (!strncmp(buf, "serial", 6)) { - early_serial_init(buf + 6); - early_console = &early_serial_console; - } else if (!strncmp(buf, "ttyS", 4)) { - early_serial_init(buf); - early_console = &early_serial_console; - } else if (!strncmp(buf, "vga", 3)) { - early_console = &early_vga_console; - } else { - early_console = NULL; - return -1; - } - early_console_initialized = 1; - register_console(early_console); - return 0; -} - -void __init disable_early_printk(void) -{ - if (!early_console_initialized || !early_console) - return; - if (!keep_early) { - printk("disabling early console...\n"); - unregister_console(early_console); - early_console_initialized = 0; - } else { - printk("keeping early console.\n"); - } -} - -/* syntax: earlyprintk=vga - earlyprintk=serial[,ttySn[,baudrate]] - Append ,keep to not disable it when the real console takes over. - Only vga or serial at a time, not both. - Currently only ttyS0 and ttyS1 are supported. - Interaction with the standard serial driver is not very good. - The VGA output is eventually overwritten by the real console. 
*/ -__setup("earlyprintk=", setup_early_printk); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/x86_64/kernel/entry.S 90-mjb/arch/x86_64/kernel/entry.S --- 00-virgin/arch/x86_64/kernel/entry.S Fri Jan 17 09:18:25 2003 +++ 90-mjb/arch/x86_64/kernel/entry.S Wed Feb 5 22:23:05 2003 @@ -187,7 +187,7 @@ sysret_careful: jnc sysret_signal sti pushq %rdi - call schedule + call user_schedule popq %rdi jmp sysret_check @@ -256,7 +256,7 @@ int_careful: jnc int_very_careful sti pushq %rdi - call schedule + call user_schedule popq %rdi jmp int_with_check @@ -420,7 +420,7 @@ retint_careful: jnc retint_signal sti pushq %rdi - call schedule + call user_schedule popq %rdi GET_THREAD_INFO(%rcx) cli @@ -454,7 +454,7 @@ retint_kernel: jc retint_restore_args movl $PREEMPT_ACTIVE,threadinfo_preempt_count(%rcx) sti - call schedule + call user_schedule cli GET_THREAD_INFO(%rcx) movl $0,threadinfo_preempt_count(%rcx) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/x86_64/kernel/head64.c 90-mjb/arch/x86_64/kernel/head64.c --- 00-virgin/arch/x86_64/kernel/head64.c Sun Nov 17 20:29:32 2002 +++ 90-mjb/arch/x86_64/kernel/head64.c Wed Feb 5 22:22:58 2003 @@ -70,7 +70,7 @@ static void __init setup_boot_cpu_data(v boot_cpu_data.x86_mask = eax & 0xf; } -extern void start_kernel(void), pda_init(int), setup_early_printk(char *); +extern void start_kernel(void), pda_init(int); extern int disable_apic; void __init x86_64_start_kernel(char * real_mode_data) @@ -80,9 +80,6 @@ void __init x86_64_start_kernel(char * r clear_bss(); pda_init(0); copy_bootdata(real_mode_data); - s = strstr(saved_command_line, "earlyprintk="); - if (s != NULL) - setup_early_printk(s+12); #ifdef CONFIG_X86_IO_APIC if (strstr(saved_command_line, "disableapic")) disable_apic = 1; diff -urpN -X /home/fletch/.diff.exclude 00-virgin/arch/x86_64/kernel/time.c 90-mjb/arch/x86_64/kernel/time.c --- 00-virgin/arch/x86_64/kernel/time.c Thu Jan 2 22:05:03 2003 +++ 90-mjb/arch/x86_64/kernel/time.c Thu Feb 6 19:49:45 2003 @@ 
-27,7 +27,7 @@ u64 jiffies_64; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; unsigned int cpu_khz; /* TSC clocks / usec, not used here */ @@ -70,21 +70,22 @@ inline unsigned int do_gettimeoffset(voi void do_gettimeofday(struct timeval *tv) { - unsigned long flags, t; + unsigned long flags, t, seq; unsigned int sec, usec; - read_lock_irqsave(&xtime_lock, flags); - spin_lock(&time_offset_lock); + spin_lock_irqsave(&time_offset_lock, flags); + do { + seq = fr_read_begin(&xtime_lock); + + sec = xtime.tv_sec; + usec = xtime.tv_nsec / 1000; + + t = (jiffies - wall_jiffies) * (1000000L / HZ) + do_gettimeoffset(); + if (t > timeoffset) timeoffset = t; + usec += timeoffset; - sec = xtime.tv_sec; - usec = xtime.tv_nsec / 1000; - - t = (jiffies - wall_jiffies) * (1000000L / HZ) + do_gettimeoffset(); - if (t > timeoffset) timeoffset = t; - usec += timeoffset; - - spin_unlock(&time_offset_lock); - read_unlock_irqrestore(&xtime_lock, flags); + } while (seq != fr_read_end(&xtime_lock)); + spin_unlock_irqrestore(&time_offset_lock, flags); tv->tv_sec = sec + usec / 1000000; tv->tv_usec = usec % 1000000; @@ -98,7 +99,7 @@ void do_gettimeofday(struct timeval *tv) void do_settimeofday(struct timeval *tv) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); vxtime_lock(); tv->tv_usec -= do_gettimeoffset() + @@ -118,7 +119,7 @@ void do_settimeofday(struct timeval *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } /* @@ -201,7 +202,7 @@ static void timer_interrupt(int irq, voi * variables, because both do_timer() and us change them -arca+vojtech */ - write_lock(&xtime_lock); + fr_write_lock(&xtime_lock); vxtime_lock(); { @@ -250,7 +251,7 @@ static void timer_interrupt(int irq, voi } vxtime_unlock(); - write_unlock(&xtime_lock); + fr_write_unlock(&xtime_lock); } unsigned long get_cmos_time(void) diff -urpN -X 
/home/fletch/.diff.exclude 00-virgin/drivers/acpi/events/evevent.c 90-mjb/drivers/acpi/events/evevent.c --- 00-virgin/drivers/acpi/events/evevent.c Mon Jan 13 21:09:09 2003 +++ 90-mjb/drivers/acpi/events/evevent.c Thu Feb 6 19:49:40 2003 @@ -103,6 +103,10 @@ acpi_ev_handler_initialize ( ACPI_FUNCTION_TRACE ("ev_handler_initialize"); +#ifdef CONFIG_X86_SUMMIT +/*horrible horrible hack to avoid interrupt storm*/ +return_ACPI_STATUS (0); +#endif /* Install the SCI handler */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/drivers/char/Makefile 90-mjb/drivers/char/Makefile --- 00-virgin/drivers/char/Makefile Tue Jan 14 10:06:15 2003 +++ 90-mjb/drivers/char/Makefile Wed Feb 5 22:23:05 2003 @@ -32,6 +32,7 @@ obj-$(CONFIG_COMPUTONE) += ip2.o ip2main obj-$(CONFIG_RISCOM8) += riscom8.o obj-$(CONFIG_ISI) += isicom.o obj-$(CONFIG_ESPSERIAL) += esp.o +obj-$(CONFIG_X86_REMOTE_DEBUG) += gdbserial.o obj-$(CONFIG_SYNCLINK) += synclink.o obj-$(CONFIG_SYNCLINKMP) += synclinkmp.o obj-$(CONFIG_N_HDLC) += n_hdlc.o diff -urpN -X /home/fletch/.diff.exclude 00-virgin/drivers/char/gdbserial.c 90-mjb/drivers/char/gdbserial.c --- 00-virgin/drivers/char/gdbserial.c Wed Dec 31 16:00:00 1969 +++ 90-mjb/drivers/char/gdbserial.c Wed Feb 5 22:23:05 2003 @@ -0,0 +1,274 @@ +/* + * Serial interface GDB stub + * + * Written (hacked together) by David Grothe (dave@gcom.com) + * + * Modified by Scott Foehner (sfoehner@engr.sgi.com) to allow connect + * on boot-up + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#undef PRNT /* define for debug printing */ + +#define GDB_BUF_SIZE 512 /* power of 2, please */ + +static char gdb_buf[GDB_BUF_SIZE]; +static int gdb_buf_in_inx; +static atomic_t gdb_buf_in_cnt; +static int gdb_buf_out_inx; + +extern void set_debug_traps(void); /* GDB 
routine */ +extern int gdb_serial_setup(int ttyS, int baud, int *port, int *irq); +extern void shutdown_for_gdb(struct async_struct *info); + /* in serial.c */ + +int gdb_irq; +int gdb_port; +int gdb_ttyS = 1; /* Default: ttyS1 */ +int gdb_baud = 38400; +int gdb_enter = 0; /* Default: do not do gdb_hook on boot */ +int gdb_initialized = 0; + +static int initialized = -1; + +/* + * Get a byte from the hardware data buffer and return it + */ +static int +read_data_bfr(void) +{ + if (inb(gdb_port + UART_LSR) & UART_LSR_DR) + return (inb(gdb_port + UART_RX)); + + return (-1); + +} /* read_data_bfr */ + +/* + * Get a char if available, return -1 if nothing available. + * Empty the receive buffer first, then look at the interface hardware. + */ +static int +read_char(void) +{ + if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ + int chr; + + chr = gdb_buf[gdb_buf_out_inx++]; + gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); + atomic_dec(&gdb_buf_in_cnt); + return (chr); + } + + return (read_data_bfr()); /* read from hardware */ + +} /* read_char */ + +/* + * Wait until the interface can accept a char, then write it. + */ +static void +write_char(int chr) +{ + while (!(inb(gdb_port + UART_LSR) & UART_LSR_THRE)) ; + + outb(chr, gdb_port + UART_TX); + +} /* write_char */ + +/* + * This is the receiver interrupt routine for the GDB stub. + * It will receive a limited number of characters of input + * from the gdb host machine and save them up in a buffer. + * + * When the gdb stub routine getDebugChar() is called it + * draws characters out of the buffer until it is empty and + * then reads directly from the serial port. + * + * We do not attempt to write chars from the interrupt routine + * since the stubs do all of that via putDebugChar() which + * writes one byte after waiting for the interface to become + * ready. + * + * The debug stubs like to run with interrupts disabled since, + * after all, they run as a consequence of a breakpoint in + * the kernel. 
+ * + * Perhaps someone who knows more about the tty driver than I + * care to learn can make this work for any low level serial + * driver. + */ +static void +gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + int chr; + int iir; + + do { + chr = read_data_bfr(); + iir = inb(gdb_port + UART_IIR); +#ifdef PRNT + printk("gdb_interrupt: chr=%02x '%c' after read iir=%02x\n", + chr, chr > ' ' && chr < 0x7F ? chr : ' ', iir); +#endif + if (chr < 0) + continue; + + if (chr == 3) { /* Ctrl-C means remote interrupt */ + breakpoint(); + continue; + } + + if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { /* buffer overflow, clear it */ + gdb_buf_in_inx = 0; + atomic_set(&gdb_buf_in_cnt, 0); + gdb_buf_out_inx = 0; + break; + } + + gdb_buf[gdb_buf_in_inx++] = chr; + gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); + atomic_inc(&gdb_buf_in_cnt); + } + while (iir & UART_IIR_RDI); + +} /* gdb_interrupt */ + +/* + * Just a NULL routine for testing. + */ +void +gdb_null(void) +{ +} /* gdb_null */ + +extern int serial8250_init(void); + +int +gdb_hook(void) +{ + int retval; + +#ifdef CONFIG_SMP + if (NR_CPUS > KGDB_MAX_NO_CPUS) { + printk + ("kgdb: too many cpus. Cannot enable debugger with more than 8 cpus\n"); + return (-1); + } +#endif + + /* + * Call first time just to get the ser ptr + */ + + serial8250_init(); + + if (gdb_serial_setup(gdb_ttyS, gdb_baud, &gdb_port, &gdb_irq)) { + printk("gdb_serial_setup() error"); + return (-1); + } + + retval = request_irq(gdb_irq, + gdb_interrupt, SA_INTERRUPT, "GDB-stub", NULL); + if (retval == 0) + initialized = 1; + else { + initialized = 0; + printk("gdb_hook: request_irq(irq=%d) failed: %d\n", gdb_irq, + retval); + } + + /* + * Call GDB routine to setup the exception vectors for the debugger + */ + set_debug_traps(); + + /* + * Call the breakpoint() routine in GDB to start the debugging + * session. + */ + printk("Waiting for connection from remote gdb... 
"); + breakpoint(); + gdb_null(); + + printk("Connected.\n"); + + gdb_initialized = 1; + return (0); + +} /* gdb_hook */ + +/* + * getDebugChar + * + * This is a GDB stub routine. It waits for a character from the + * serial interface and then returns it. If there is no serial + * interface connection then it returns a bogus value which will + * almost certainly cause the system to hang. + */ +int +getDebugChar(void) +{ + volatile int chr; + +#ifdef PRNT + printk("getDebugChar: "); +#endif + + while ((chr = read_char()) < 0) + touch_nmi_watchdog(); + +#ifdef PRNT + printk("%c\n", chr > ' ' && chr < 0x7F ? chr : ' '); +#endif + return (chr); + +} /* getDebugChar */ + +/* + * putDebugChar + * + * This is a GDB stub routine. It waits until the interface is ready + * to transmit a char and then sends it. If there is no serial + * interface connection then it simply returns to its caller, having + * pretended to send the char. + */ +void +putDebugChar(int chr) +{ +#ifdef PRNT + printk("putDebugChar: chr=%02x '%c'\n", chr, + chr > ' ' && chr < 0x7F ? 
chr : ' '); +#endif + + write_char(chr); /* this routine will wait */ + +} /* putDebugChar */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/drivers/char/sysrq.c 90-mjb/drivers/char/sysrq.c --- 00-virgin/drivers/char/sysrq.c Thu Jan 2 22:05:04 2003 +++ 90-mjb/drivers/char/sysrq.c Wed Feb 5 22:23:05 2003 @@ -107,6 +107,18 @@ static struct sysrq_key_op sysrq_reboot_ .action_msg = "Resetting", }; +#ifdef CONFIG_X86_REMOTE_DEBUG +static void sysrq_handle_gdb(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) { + int gdb_hook(void); + gdb_hook(); +} +static struct sysrq_key_op sysrq_gdb_op = { + handler: sysrq_handle_gdb, + help_msg: "Gdb", + action_msg: "Entering debugger", +}; +#endif /* SYNC SYSRQ HANDLERS BLOCK */ @@ -357,7 +369,11 @@ static struct sysrq_key_op *sysrq_key_ta /* d */ NULL, /* e */ &sysrq_term_op, /* f */ NULL, +#ifdef CONFIG_X86_REMOTE_DEBUG +/* g */ &sysrq_gdb_op, +#else /* CONFIG_X86_REMOTE_DEBUG */ /* g */ NULL, +#endif /* CONFIG_X86_REMOTE_DEBUG */ /* h */ NULL, /* i */ &sysrq_kill_op, /* j */ NULL, diff -urpN -X /home/fletch/.diff.exclude 00-virgin/drivers/char/tty_io.c 90-mjb/drivers/char/tty_io.c --- 00-virgin/drivers/char/tty_io.c Fri Jan 17 09:18:26 2003 +++ 90-mjb/drivers/char/tty_io.c Wed Feb 5 22:23:05 2003 @@ -91,6 +91,9 @@ #include #include #include +#ifdef CONFIG_GDB_CONSOLE +#include +#endif #include #include @@ -2211,6 +2214,9 @@ void __init console_init(void) #endif #ifdef CONFIG_VT con_init(); +#endif +#ifdef CONFIG_GDB_CONSOLE + gdb_console_init(); #endif #ifdef CONFIG_AU1000_SERIAL_CONSOLE au1000_serial_console_init(); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/drivers/net/starfire.c 90-mjb/drivers/net/starfire.c --- 00-virgin/drivers/net/starfire.c Thu Jan 9 19:16:08 2003 +++ 90-mjb/drivers/net/starfire.c Thu Feb 6 19:49:47 2003 @@ -101,15 +101,35 @@ - Better stats and error handling (Ion Badulescu) - Use new pci_set_mwi() PCI API function (jgarzik) -TODO: - - implement tx_timeout() properly + LK1.3.7 (Ion 
Badulescu) + - minimal implementation of tx_timeout() + - correctly shutdown the Rx/Tx engines in netdev_close() + - added calls to netif_carrier_on/off + (patch from Stefan Rompf ) - VLAN support + + LK1.3.8 (Ion Badulescu) + - adjust DMA burst size on sparc64 + - 64-bit support + - reworked zerocopy support for 64-bit buffers + - working and usable interrupt mitigation/latency + - reduced Tx interrupt frequency for lower interrupt overhead + + LK1.3.9 (Ion Badulescu) + - bugfix for mcast filter + - enable the right kind of Tx interrupts (TxDMADone, not TxDone) + + LK1.4.0 (Ion Badulescu) + - NAPI support + +TODO: bugfixes (no bugs known as of right now) */ #define DRV_NAME "starfire" -#define DRV_VERSION "1.03+LK1.3.6" -#define DRV_RELDATE "March 7, 2002" +#define DRV_VERSION "1.03+LK1.4.0" +#define DRV_RELDATE "December 23, 2002" +#include #include #include #include @@ -118,7 +138,6 @@ TODO: #include #include #include -#include #include /* Processor type for cache alignment. */ #include #include @@ -128,17 +147,14 @@ TODO: * firmware files) does not allow one to redistribute them. Thus, we can't * include the firmware with this driver. * - * However, an end-user is allowed to download and use it, after - * converting it to C header files using starfire_firmware.pl. + * However, should a legal-to-use firmware become available, + * the driver developer would need only to obtain the firmware in the + * form of a C header file. * Once that's done, the #undef below must be changed into a #define * for this driver to really use the firmware. Note that Rx/Tx * hardware TCP checksumming is not possible without the firmware. * - * If Adaptec could allow redistribution of the firmware (even in binary - * format), life would become a lot easier. Unfortunately, I've lost my - * Adaptec contacts, so progress on this front is rather unlikely to - * occur. If anybody from Adaptec reads this and can help with this matter, - * please let me know... 
+ * WANTED: legal firmware to include with this GPL'd driver. */ #undef HAS_FIRMWARE /* @@ -157,11 +173,16 @@ TODO: #include "starfire_firmware.h" #endif /* HAS_FIRMWARE */ +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) +#define VLAN_SUPPORT +#endif + /* The user-configurable values. These may be modified when a driver module is loaded.*/ /* Used for tuning interrupt latency vs. overhead. */ -static int interrupt_mitigation; +static int intr_latency; +static int small_frames; static int debug = 1; /* 1 normal messages, 0 quiet .. 7 verbose. */ static int max_interrupt_work = 20; @@ -169,6 +190,12 @@ static int mtu; /* Maximum number of multicast addresses to filter (vs. rx-all-multicast). The Starfire has a 512 element hash table based on the Ethernet CRC. */ static int multicast_filter_limit = 512; +/* Whether to do TCP/UDP checksums in hardware */ +#ifdef HAS_FIRMWARE +static int enable_hw_cksum = 1; +#else +static int enable_hw_cksum = 0; +#endif #define PKT_BUF_SZ 1536 /* Size of each temporary Rx buffer.*/ /* @@ -181,7 +208,9 @@ static int multicast_filter_limit = 512; * packets as the starfire doesn't allow for misaligned DMAs ;-( * 23/10/2000 - Jes * - * The Alpha and the Sparc don't allow unaligned loads, either. -Ion + * The Alpha and the Sparc don't like unaligned loads, either. On Sparc64, + * at least, having unaligned frames leads to a rather serious performance + * penalty. -Ion */ #if defined(__ia64__) || defined(__alpha__) || defined(__sparc__) static int rx_copybreak = PKT_BUF_SZ; @@ -189,9 +218,17 @@ static int rx_copybreak = PKT_BUF_SZ; static int rx_copybreak /* = 0 */; #endif +/* PCI DMA burst size -- on sparc64 we want to force it to 64 bytes, on the others the default of 128 is fine. */ +#ifdef __sparc__ +#define DMA_BURST_SIZE 64 +#else +#define DMA_BURST_SIZE 128 +#endif + /* Used to pass the media type, etc. Both 'options[]' and 'full_duplex[]' exist for driver interoperability. 
The media type is usually passed in 'options[]'. + These variables are deprecated, use ethtool instead. -Ion */ #define MAX_UNITS 8 /* More are supported, limit only on options */ static int options[MAX_UNITS] = {0, }; @@ -201,33 +238,55 @@ static int full_duplex[MAX_UNITS] = {0, /* The "native" ring sizes are either 256 or 2048. However in some modes a descriptor may be marked to wrap the ring earlier. - The driver allocates a single page for each descriptor ring, constraining - the maximum size in an architecture-dependent way. */ #define RX_RING_SIZE 256 #define TX_RING_SIZE 32 /* The completion queues are fixed at 1024 entries i.e. 4K or 8KB. */ #define DONE_Q_SIZE 1024 +/* All queues must be aligned on a 256-byte boundary */ +#define QUEUE_ALIGN 256 + +#if RX_RING_SIZE > 256 +#define RX_Q_ENTRIES Rx2048QEntries +#else +#define RX_Q_ENTRIES Rx256QEntries +#endif /* Operational parameters that usually are not changed. */ /* Time in jiffies before concluding the transmitter is hung. */ #define TX_TIMEOUT (2 * HZ) -#ifdef ZEROCOPY -#if MAX_SKB_FRAGS <= 6 -#define MAX_STARFIRE_FRAGS 6 -#else /* MAX_STARFIRE_FRAGS > 6 */ -#warning This driver will not work with more than 6 skb fragments. -#warning Turning off zerocopy support. -#undef ZEROCOPY -#endif /* MAX_STARFIRE_FRAGS > 6 */ -#endif /* ZEROCOPY */ +/* + * This SUCKS. + * We need a much better method to determine if dma_addr_t is 64-bit. + */ +#if (defined(__i386__) && defined(CONFIG_HIGHMEM) && (LINUX_VERSION_CODE > 0x20500 || defined(CONFIG_HIGHMEM64G))) || defined(__x86_64__) || defined (__ia64__) || defined(__mips64__) || (defined(__mips__) && defined(CONFIG_HIGHMEM) && defined(CONFIG_64BIT_PHYS_ADDR)) +/* 64-bit dma_addr_t */ +#define ADDR_64BITS /* This chip uses 64 bit addresses. 
*/ +#define cpu_to_dma(x) cpu_to_le64(x) +#define dma_to_cpu(x) le64_to_cpu(x) +#define RX_DESC_Q_ADDR_SIZE RxDescQAddr64bit +#define TX_DESC_Q_ADDR_SIZE TxDescQAddr64bit +#define RX_COMPL_Q_ADDR_SIZE RxComplQAddr64bit +#define TX_COMPL_Q_ADDR_SIZE TxComplQAddr64bit +#define RX_DESC_ADDR_SIZE RxDescAddr64bit +#else /* 32-bit dma_addr_t */ +#define cpu_to_dma(x) cpu_to_le32(x) +#define dma_to_cpu(x) le32_to_cpu(x) +#define RX_DESC_Q_ADDR_SIZE RxDescQAddr32bit +#define TX_DESC_Q_ADDR_SIZE TxDescQAddr32bit +#define RX_COMPL_Q_ADDR_SIZE RxComplQAddr32bit +#define TX_COMPL_Q_ADDR_SIZE TxComplQAddr32bit +#define RX_DESC_ADDR_SIZE RxDescAddr32bit +#endif -#ifdef ZEROCOPY +#ifdef MAX_SKB_FRAGS #define skb_first_frag_len(skb) skb_headlen(skb) -#else /* not ZEROCOPY */ +#define skb_num_frags(skb) (skb_shinfo(skb)->nr_frags + 1) +#else /* not MAX_SKB_FRAGS */ #define skb_first_frag_len(skb) (skb->len) -#endif /* not ZEROCOPY */ +#define skb_num_frags(skb) 1 +#endif /* not MAX_SKB_FRAGS */ /* 2.2.x compatibility code */ #if LINUX_VERSION_CODE < 0x20300 @@ -236,9 +295,12 @@ static int full_duplex[MAX_UNITS] = {0, #else /* LINUX_VERSION_CODE > 0x20300 */ +#include #include #include +#include + #define COMPAT_MOD_INC_USE_COUNT #define COMPAT_MOD_DEC_USE_COUNT @@ -253,6 +315,40 @@ static int full_duplex[MAX_UNITS] = {0, #define PCI_SLOT_NAME(pci_dev) (pci_dev)->slot_name #endif /* LINUX_VERSION_CODE > 0x20300 */ + +#ifdef HAVE_NETDEV_POLL +#define init_poll(dev) \ + dev->poll = &netdev_poll; \ + dev->weight = max_interrupt_work; +#define netdev_rx(dev, ioaddr) \ +do { \ + u32 intr_enable; \ + if (netif_rx_schedule_prep(dev)) { \ + __netif_rx_schedule(dev); \ + intr_enable = readl(ioaddr + IntrEnable); \ + intr_enable &= ~(IntrRxDone | IntrRxEmpty); \ + writel(intr_enable, ioaddr + IntrEnable); \ + } else { \ + /* Paranoia check */ \ + intr_enable = readl(ioaddr + IntrEnable); \ + if (intr_enable & (IntrRxDone | IntrRxEmpty)) { \ + printk("%s: interrupt while in polling mode!\n", 
dev->name); \ + intr_enable &= ~(IntrRxDone | IntrRxEmpty); \ + writel(intr_enable, ioaddr + IntrEnable); \ + } \ + } \ +} while (0) +static int netdev_poll(struct net_device *dev, int *budget); +#else /* not HAVE_NETDEV_POLL */ +#define init_poll(dev) +#define netif_receive_skb(skb) netif_rx(skb) +#define vlan_hwaccel_receive_skb(skb, vlgrp, vlid) vlan_hwaccel_rx(skb, vlgrp, vlid) +#define netdev_rx(dev, ioaddr) \ +do { \ + int quota = np->dirty_rx + RX_RING_SIZE - np->cur_rx; \ + __netdev_rx(dev, "a);\ +} while (0) +#endif /* not HAVE_NETDEV_POLL */ /* end of compatibility code */ @@ -269,15 +365,20 @@ MODULE_PARM(max_interrupt_work, "i"); MODULE_PARM(mtu, "i"); MODULE_PARM(debug, "i"); MODULE_PARM(rx_copybreak, "i"); -MODULE_PARM(interrupt_mitigation, "i"); +MODULE_PARM(intr_latency, "i"); +MODULE_PARM(small_frames, "i"); MODULE_PARM(options, "1-" __MODULE_STRING(MAX_UNITS) "i"); MODULE_PARM(full_duplex, "1-" __MODULE_STRING(MAX_UNITS) "i"); -MODULE_PARM_DESC(max_interrupt_work, "Starfire maximum events handled per interrupt"); -MODULE_PARM_DESC(mtu, "Starfire MTU (all boards)"); -MODULE_PARM_DESC(debug, "Starfire debug level (0-6)"); -MODULE_PARM_DESC(rx_copybreak, "Starfire copy breakpoint for copy-only-tiny-frames"); -MODULE_PARM_DESC(options, "Starfire: Bits 0-3: media type, bit 17: full duplex"); -MODULE_PARM_DESC(full_duplex, "Starfire full duplex setting(s) (1)"); +MODULE_PARM(enable_hw_cksum, "i"); +MODULE_PARM_DESC(max_interrupt_work, "Maximum events handled per interrupt"); +MODULE_PARM_DESC(mtu, "MTU (all boards)"); +MODULE_PARM_DESC(debug, "Debug level (0-6)"); +MODULE_PARM_DESC(rx_copybreak, "Copy breakpoint for copy-only-tiny-frames"); +MODULE_PARM_DESC(intr_latency, "Maximum interrupt latency, in microseconds"); +MODULE_PARM_DESC(small_frames, "Maximum size of receive frames that bypass interrupt latency (0,64,128,256,512)"); +MODULE_PARM_DESC(options, "Deprecated: Bits 0-3: media type, bit 17: full duplex"); +MODULE_PARM_DESC(full_duplex, 
"Deprecated: Forced full-duplex setting (0/1)"); +MODULE_PARM_DESC(enable_hw_cksum, "Enable/disable hardware cksum support (0/1)"); /* Theory of Operation @@ -363,13 +464,6 @@ IVc. Errata enum chip_capability_flags {CanHaveMII=1, }; -#define PCI_IOTYPE (PCI_USES_MASTER | PCI_USES_MEM | PCI_ADDR0) - -#if 0 -#define ADDR_64BITS 1 /* This chip uses 64 bit addresses. */ -#endif - -#define HAS_IP_COPYSUM 1 enum chipset { CH_6915 = 0, @@ -401,7 +495,7 @@ static struct chip_info { enum register_offsets { PCIDeviceConfig=0x50040, GenCtrl=0x50070, IntrTimerCtrl=0x50074, IntrClear=0x50080, IntrStatus=0x50084, IntrEnable=0x50088, - MIICtrl=0x52000, StationAddr=0x50120, EEPROMCtrl=0x51000, + MIICtrl=0x52000, TxStationAddr=0x50120, EEPROMCtrl=0x51000, GPIOCtrl=0x5008C, TxDescCtrl=0x50090, TxRingPtr=0x50098, HiPriTxRingPtr=0x50094, /* Low and High priority. */ TxRingHiAddr=0x5009C, /* 64 bit address extension. */ @@ -412,11 +506,16 @@ enum register_offsets { CompletionQConsumerIdx=0x500C4, RxDMACtrl=0x500D0, RxDescQCtrl=0x500D4, RxDescQHiAddr=0x500DC, RxDescQAddr=0x500E0, RxDescQIdx=0x500E8, RxDMAStatus=0x500F0, RxFilterMode=0x500F4, - TxMode=0x55000, PerfFilterTable=0x56000, HashTable=0x56100, + TxMode=0x55000, VlanType=0x55064, + PerfFilterTable=0x56000, HashTable=0x56100, TxGfpMem=0x58000, RxGfpMem=0x5a000, }; -/* Bits in the interrupt status/mask registers. */ +/* + * Bits in the interrupt status/mask registers. + * Warning: setting Intr[Ab]NormalSummary in the IntrEnable register + * enables all the interrupt sources that are or'ed into those status bits. + */ enum intr_status_bits { IntrLinkChange=0xf0000000, IntrStatsMax=0x08000000, IntrAbnormalSummary=0x02000000, IntrGeneralTimer=0x01000000, @@ -441,7 +540,16 @@ enum intr_status_bits { /* Bits in the RxFilterMode register. 
*/ enum rx_mode_bits { AcceptBroadcast=0x04, AcceptAllMulticast=0x02, AcceptAll=0x01, - AcceptMulticast=0x10, AcceptMyPhys=0xE040, + AcceptMulticast=0x10, PerfectFilter=0x40, HashFilter=0x30, + PerfectFilterVlan=0x80, MinVLANPrio=0xE000, VlanMode=0x0200, + WakeupOnGFP=0x0800, +}; + +/* Bits in the TxMode register */ +enum tx_mode_bits { + MiiSoftReset=0x8000, MIILoopback=0x4000, + TxFlowEnable=0x0800, RxFlowEnable=0x0400, + PadEnable=0x04, FullDuplex=0x02, HugeFrame=0x01, }; /* Bits in the TxDescCtrl register. */ @@ -450,7 +558,8 @@ enum tx_ctrl_bits { TxDescSpace128=0x30, TxDescSpace256=0x40, TxDescType0=0x00, TxDescType1=0x01, TxDescType2=0x02, TxDescType3=0x03, TxDescType4=0x04, - TxNoDMACompletion=0x08, TxDescQ64bit=0x80, + TxNoDMACompletion=0x08, + TxDescQAddr64bit=0x80, TxDescQAddr32bit=0, TxHiPriFIFOThreshShift=24, TxPadLenShift=16, TxDMABurstSizeShift=8, }; @@ -458,81 +567,144 @@ enum tx_ctrl_bits { /* Bits in the RxDescQCtrl register. */ enum rx_ctrl_bits { RxBufferLenShift=16, RxMinDescrThreshShift=0, - RxPrefetchMode=0x8000, Rx2048QEntries=0x4000, - RxVariableQ=0x2000, RxDesc64bit=0x1000, - RxDescQAddr64bit=0x0100, + RxPrefetchMode=0x8000, RxVariableQ=0x2000, + Rx2048QEntries=0x4000, Rx256QEntries=0, + RxDescAddr64bit=0x1000, RxDescAddr32bit=0, + RxDescQAddr64bit=0x0100, RxDescQAddr32bit=0, RxDescSpace4=0x000, RxDescSpace8=0x100, RxDescSpace16=0x200, RxDescSpace32=0x300, RxDescSpace64=0x400, RxDescSpace128=0x500, RxConsumerWrEn=0x80, }; +/* Bits in the RxDMACtrl register. 
*/ +enum rx_dmactrl_bits { + RxReportBadFrames=0x80000000, RxDMAShortFrames=0x40000000, + RxDMABadFrames=0x20000000, RxDMACrcErrorFrames=0x10000000, + RxDMAControlFrame=0x08000000, RxDMAPauseFrame=0x04000000, + RxChecksumIgnore=0, RxChecksumRejectTCPUDP=0x02000000, + RxChecksumRejectTCPOnly=0x01000000, + RxCompletionQ2Enable=0x800000, + RxDMAQ2Disable=0, RxDMAQ2FPOnly=0x100000, + RxDMAQ2SmallPkt=0x200000, RxDMAQ2HighPrio=0x300000, + RxDMAQ2NonIP=0x400000, + RxUseBackupQueue=0x080000, RxDMACRC=0x040000, + RxEarlyIntThreshShift=12, RxHighPrioThreshShift=8, + RxBurstSizeShift=0, +}; + /* Bits in the RxCompletionAddr register */ enum rx_compl_bits { - RxComplQAddr64bit=0x80, TxComplProducerWrEn=0x40, + RxComplQAddr64bit=0x80, RxComplQAddr32bit=0, + RxComplProducerWrEn=0x40, RxComplType0=0x00, RxComplType1=0x10, RxComplType2=0x20, RxComplType3=0x30, RxComplThreshShift=0, }; +/* Bits in the TxCompletionAddr register */ +enum tx_compl_bits { + TxComplQAddr64bit=0x80, TxComplQAddr32bit=0, + TxComplProducerWrEn=0x40, + TxComplIntrStatus=0x20, + CommonQueueMode=0x10, + TxComplThreshShift=0, +}; + +/* Bits in the GenCtrl register */ +enum gen_ctrl_bits { + RxEnable=0x05, TxEnable=0x0a, + RxGFPEnable=0x10, TxGFPEnable=0x20, +}; + +/* Bits in the IntrTimerCtrl register */ +enum intr_ctrl_bits { + Timer10X=0x800, EnableIntrMasking=0x60, SmallFrameBypass=0x100, + SmallFrame64=0, SmallFrame128=0x200, SmallFrame256=0x400, SmallFrame512=0x600, + IntrLatencyMask=0x1f, +}; + /* The Rx and Tx buffer descriptors. */ struct starfire_rx_desc { - u32 rxaddr; /* Optionally 64 bits. */ + dma_addr_t rxaddr; }; enum rx_desc_bits { RxDescValid=1, RxDescEndRing=2, }; -/* Completion queue entry. - You must update the page allocation, init_ring and the shift count in rx() - if using a larger format. */ -#ifdef HAS_FIRMWARE -#define csum_rx_status -#endif /* HAS_FIRMWARE */ -struct rx_done_desc { +/* Completion queue entry. */ +struct short_rx_done_desc { + u32 status; /* Low 16 bits is length. 
*/ +}; +struct basic_rx_done_desc { u32 status; /* Low 16 bits is length. */ -#ifdef csum_rx_status - u32 status2; /* Low 16 bits is csum */ -#endif /* csum_rx_status */ -#ifdef full_rx_status - u32 status2; + u16 vlanid; + u16 status2; +}; +struct csum_rx_done_desc { + u32 status; /* Low 16 bits is length. */ + u16 csum; /* Partial checksum */ + u16 status2; +}; +struct full_rx_done_desc { + u32 status; /* Low 16 bits is length. */ + u16 status3; + u16 status2; u16 vlanid; u16 csum; /* partial checksum */ u32 timestamp; -#endif /* full_rx_status */ }; +/* XXX: this is ugly and I'm not sure it's worth the trouble -Ion */ +#ifdef HAS_FIRMWARE +#ifdef VLAN_SUPPORT +typedef struct full_rx_done_desc rx_done_desc; +#define RxComplType RxComplType3 +#else /* not VLAN_SUPPORT */ +typedef struct csum_rx_done_desc rx_done_desc; +#define RxComplType RxComplType2 +#endif /* not VLAN_SUPPORT */ +#else /* not HAS_FIRMWARE */ +#ifdef VLAN_SUPPORT +typedef struct basic_rx_done_desc rx_done_desc; +#define RxComplType RxComplType1 +#else /* not VLAN_SUPPORT */ +typedef struct short_rx_done_desc rx_done_desc; +#define RxComplType RxComplType0 +#endif /* not VLAN_SUPPORT */ +#endif /* not HAS_FIRMWARE */ + enum rx_done_bits { RxOK=0x20000000, RxFIFOErr=0x10000000, RxBufQ2=0x08000000, }; -#ifdef ZEROCOPY -/* Type 0 Tx descriptor. */ -/* If more fragments are needed, don't forget to change the - descriptor spacing as well! */ -struct starfire_tx_desc { - u32 status; - u32 nbufs; - u32 first_addr; - u16 first_len; - u16 total_len; - struct { - u32 addr; - u32 len; - } frag[MAX_STARFIRE_FRAGS]; -}; -#else /* not ZEROCOPY */ /* Type 1 Tx descriptor. */ -struct starfire_tx_desc { +struct starfire_tx_desc_1 { + u32 status; /* Upper bits are status, lower 16 length. */ + u32 addr; +}; + +/* Type 2 Tx descriptor. */ +struct starfire_tx_desc_2 { u32 status; /* Upper bits are status, lower 16 length. 
*/ - u32 first_addr; + u32 reserved; + u64 addr; }; -#endif /* not ZEROCOPY */ + +#ifdef ADDR_64BITS +typedef struct starfire_tx_desc_2 starfire_tx_desc; +#define TX_DESC_TYPE TxDescType2 +#else /* not ADDR_64BITS */ +typedef struct starfire_tx_desc_1 starfire_tx_desc; +#define TX_DESC_TYPE TxDescType1 +#endif /* not ADDR_64BITS */ +#define TX_DESC_SPACING TxDescSpaceUnlim + enum tx_desc_bits { TxDescID=0xB0000000, TxCRCEn=0x01000000, TxDescIntr=0x08000000, TxRingWrap=0x04000000, TxCalTCP=0x02000000, }; -struct tx_done_report { +struct tx_done_desc { u32 status; /* timestamp, index. */ #if 0 u32 intrstatus; /* interrupt status */ @@ -545,41 +717,45 @@ struct rx_ring_info { }; struct tx_ring_info { struct sk_buff *skb; - dma_addr_t first_mapping; -#ifdef ZEROCOPY - dma_addr_t frag_mapping[MAX_STARFIRE_FRAGS]; -#endif /* ZEROCOPY */ + dma_addr_t mapping; + unsigned int used_slots; }; #define PHY_CNT 2 struct netdev_private { /* Descriptor rings first for alignment. */ struct starfire_rx_desc *rx_ring; - struct starfire_tx_desc *tx_ring; + starfire_tx_desc *tx_ring; dma_addr_t rx_ring_dma; dma_addr_t tx_ring_dma; /* The addresses of rx/tx-in-place skbuffs. */ struct rx_ring_info rx_info[RX_RING_SIZE]; struct tx_ring_info tx_info[TX_RING_SIZE]; /* Pointers to completion queues (full pages). */ - struct rx_done_desc *rx_done_q; + rx_done_desc *rx_done_q; dma_addr_t rx_done_q_dma; unsigned int rx_done; - struct tx_done_report *tx_done_q; + struct tx_done_desc *tx_done_q; dma_addr_t tx_done_q_dma; unsigned int tx_done; struct net_device_stats stats; struct pci_dev *pci_dev; +#ifdef VLAN_SUPPORT + struct vlan_group *vlgrp; +#endif + void *queue_mem; + dma_addr_t queue_mem_dma; + size_t queue_mem_size; + /* Frequently used values: keep some adjacent for cache effect. */ spinlock_t lock; unsigned int cur_rx, dirty_rx; /* Producer/consumer ring indices */ - unsigned int cur_tx, dirty_tx; + unsigned int cur_tx, dirty_tx, reap_tx; unsigned int rx_buf_sz; /* Based on MTU+slack. 
*/ - unsigned int tx_full:1, /* The Tx queue is full. */ /* These values keep track of the transceiver/media in use. */ - speed100:1; /* Set if speed == 100MBit. */ - unsigned int intr_mitigation; + int speed100; /* Set if speed == 100MBit. */ u32 tx_mode; + u32 intr_timer_ctrl; u8 tx_threshold; /* MII transceiver section. */ struct mii_if_info mii_if; /* MII lib hooks/info */ @@ -597,7 +773,8 @@ static void init_ring(struct net_device static int start_tx(struct sk_buff *skb, struct net_device *dev); static void intr_handler(int irq, void *dev_instance, struct pt_regs *regs); static void netdev_error(struct net_device *dev, int intr_status); -static int netdev_rx(struct net_device *dev); +static int __netdev_rx(struct net_device *dev, int *quota); +static void refill_rx_ring(struct net_device *dev); static void netdev_error(struct net_device *dev, int intr_status); static void set_rx_mode(struct net_device *dev); static struct net_device_stats *get_stats(struct net_device *dev); @@ -606,6 +783,44 @@ static int netdev_close(struct net_devic static void netdev_media_change(struct net_device *dev); +#ifdef VLAN_SUPPORT +static void netdev_vlan_rx_register(struct net_device *dev, struct vlan_group *grp) +{ + struct netdev_private *np = dev->priv; + + spin_lock(&np->lock); + if (debug > 2) + printk("%s: Setting vlgrp to %p\n", dev->name, grp); + np->vlgrp = grp; + set_rx_mode(dev); + spin_unlock(&np->lock); +} + +static void netdev_vlan_rx_add_vid(struct net_device *dev, unsigned short vid) +{ + struct netdev_private *np = dev->priv; + + spin_lock(&np->lock); + if (debug > 1) + printk("%s: Adding vlanid %d to vlan filter\n", dev->name, vid); + set_rx_mode(dev); + spin_unlock(&np->lock); +} + +static void netdev_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid) +{ + struct netdev_private *np = dev->priv; + + spin_lock(&np->lock); + if (debug > 1) + printk("%s: removing vlanid %d from vlan filter\n", dev->name, vid); + if (np->vlgrp) + 
np->vlgrp->vlan_devices[vid] = NULL; + set_rx_mode(dev); + spin_unlock(&np->lock); +} +#endif /* VLAN_SUPPORT */ + static int __devinit starfire_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) @@ -617,10 +832,6 @@ static int __devinit starfire_init_one(s long ioaddr; int drv_flags, io_size; int boguscnt; -#ifndef HAVE_PCI_SET_MWI - u16 cmd; - u8 cache; -#endif /* when built into the kernel, we only print version if device is found */ #ifndef MODULE @@ -637,13 +848,13 @@ static int __devinit starfire_init_one(s ioaddr = pci_resource_start(pdev, 0); io_size = pci_resource_len(pdev, 0); if (!ioaddr || ((pci_resource_flags(pdev, 0) & IORESOURCE_MEM) == 0)) { - printk (KERN_ERR DRV_NAME " %d: no PCI MEM resources, aborting\n", card_idx); + printk(KERN_ERR DRV_NAME " %d: no PCI MEM resources, aborting\n", card_idx); return -ENODEV; } dev = alloc_etherdev(sizeof(*np)); if (!dev) { - printk (KERN_ERR DRV_NAME " %d: cannot alloc etherdev, aborting\n", card_idx); + printk(KERN_ERR DRV_NAME " %d: cannot alloc etherdev, aborting\n", card_idx); return -ENOMEM; } SET_MODULE_OWNER(dev); @@ -651,7 +862,7 @@ static int __devinit starfire_init_one(s irq = pdev->irq; if (pci_request_regions (pdev, dev->name)) { - printk (KERN_ERR DRV_NAME " %d: cannot reserve PCI resources, aborting\n", card_idx); + printk(KERN_ERR DRV_NAME " %d: cannot reserve PCI resources, aborting\n", card_idx); goto err_out_free_netdev; } @@ -659,7 +870,7 @@ static int __devinit starfire_init_one(s #if !defined(CONFIG_SPARC64) || LINUX_VERSION_CODE > 0x20300 ioaddr = (long) ioremap(ioaddr, io_size); if (!ioaddr) { - printk (KERN_ERR DRV_NAME " %d: cannot remap 0x%x @ 0x%lx, aborting\n", + printk(KERN_ERR DRV_NAME " %d: cannot remap %#x @ %#lx, aborting\n", card_idx, io_size, ioaddr); goto err_out_free_res; } @@ -667,29 +878,26 @@ static int __devinit starfire_init_one(s pci_set_master(pdev); -#ifdef HAVE_PCI_SET_MWI - pci_set_mwi(pdev); -#else /* enable MWI -- it vastly improves Rx performance on 
sparc64 */ - pci_read_config_word(pdev, PCI_COMMAND, &cmd); - cmd |= PCI_COMMAND_INVALIDATE; - pci_write_config_word(pdev, PCI_COMMAND, cmd); - - /* set PCI cache size */ - pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &cache); - if ((cache << 2) != SMP_CACHE_BYTES) { - printk(KERN_INFO " PCI cache line size set incorrectly " - "(%i bytes) by BIOS/FW, correcting to %i\n", - (cache << 2), SMP_CACHE_BYTES); - pci_write_config_byte(pdev, PCI_CACHE_LINE_SIZE, - SMP_CACHE_BYTES >> 2); - } -#endif + pci_set_mwi(pdev); +#ifdef MAX_SKB_FRAGS + dev->features |= NETIF_F_SG; +#endif /* MAX_SKB_FRAGS */ #ifdef ZEROCOPY - /* Starfire can do SG and TCP/UDP checksumming */ - dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM; + /* Starfire can do TCP/UDP checksumming */ + if (enable_hw_cksum) + dev->features |= NETIF_F_IP_CSUM; #endif /* ZEROCOPY */ +#ifdef VLAN_SUPPORT + dev->features |= NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_FILTER; + dev->vlan_rx_register = netdev_vlan_rx_register; + dev->vlan_rx_add_vid = netdev_vlan_rx_add_vid; + dev->vlan_rx_kill_vid = netdev_vlan_rx_kill_vid; +#endif /* VLAN_RX_KILL_VID */ +#ifdef ADDR_64BITS + dev->features |= NETIF_F_HIGHDMA; +#endif /* ADDR_64BITS */ /* Serial EEPROM reads are hidden by the hardware. */ for (i = 0; i < 6; i++) @@ -704,7 +912,7 @@ static int __devinit starfire_init_one(s #endif /* Issue soft reset */ - writel(0x8000, ioaddr + TxMode); + writel(MiiSoftReset, ioaddr + TxMode); udelay(1000); writel(0, ioaddr + TxMode); @@ -750,15 +958,40 @@ static int __devinit starfire_init_one(s np->mii_if.full_duplex = 1; if (np->mii_if.full_duplex) - np->mii_if.force_media = 0; - else np->mii_if.force_media = 1; + else + np->mii_if.force_media = 0; np->speed100 = 1; + /* timer resolution is 128 * 0.8us */ + np->intr_timer_ctrl = (((intr_latency * 10) / 1024) & IntrLatencyMask) | + Timer10X | EnableIntrMasking; + + if (small_frames > 0) { + np->intr_timer_ctrl |= SmallFrameBypass; + switch (small_frames) { + case 1 ... 
64: + np->intr_timer_ctrl |= SmallFrame64; + break; + case 65 ... 128: + np->intr_timer_ctrl |= SmallFrame128; + break; + case 129 ... 256: + np->intr_timer_ctrl |= SmallFrame256; + break; + default: + np->intr_timer_ctrl |= SmallFrame512; + if (small_frames > 512) + printk("Adjusting small_frames down to 512\n"); + break; + } + } + /* The chip-specific entries in the device structure. */ dev->open = &netdev_open; dev->hard_start_xmit = &start_tx; init_tx_timer(dev, tx_timeout, TX_TIMEOUT); + init_poll(dev); dev->stop = &netdev_close; dev->get_stats = &get_stats; dev->set_multicast_list = &set_rx_mode; @@ -767,11 +1000,10 @@ static int __devinit starfire_init_one(s if (mtu) dev->mtu = mtu; - i = register_netdev(dev); - if (i) + if (register_netdev(dev)) goto err_out_cleardev; - printk(KERN_INFO "%s: %s at 0x%lx, ", + printk(KERN_INFO "%s: %s at %#lx, ", dev->name, netdrv_tbl[chip_idx].name, ioaddr); for (i = 0; i < 5; i++) printk("%2.2x:", dev->dev_addr[i]); @@ -796,7 +1028,7 @@ static int __devinit starfire_init_one(s np->phys[phy_idx++] = phy; np->mii_if.advertising = mdio_read(dev, phy, MII_ADVERTISE); printk(KERN_INFO "%s: MII PHY found at address %d, status " - "0x%4.4x advertising %4.4x.\n", + "%#4.4x advertising %#4.4x.\n", dev->name, phy, mii_status, np->mii_if.advertising); /* there can be only one PHY on-board */ break; @@ -809,14 +1041,8 @@ static int __devinit starfire_init_one(s memset(&np->mii_if, 0, sizeof(np->mii_if)); } -#ifdef ZEROCOPY - printk(KERN_INFO "%s: scatter-gather and hardware TCP cksumming enabled.\n", - dev->name); -#else /* not ZEROCOPY */ - printk(KERN_INFO "%s: scatter-gather and hardware TCP cksumming disabled.\n", - dev->name); -#endif /* not ZEROCOPY */ - + printk(KERN_INFO "%s: scatter-gather and hardware TCP cksumming %s.\n", + dev->name, enable_hw_cksum ? 
"enabled" : "disabled"); return 0; err_out_cleardev: @@ -825,7 +1051,6 @@ err_out_cleardev: err_out_free_res: pci_release_regions (pdev); err_out_free_netdev: - unregister_netdev(dev); kfree(dev); return -ENODEV; } @@ -861,6 +1086,7 @@ static int netdev_open(struct net_device struct netdev_private *np = dev->priv; long ioaddr = dev->base_addr; int i, retval; + size_t tx_done_q_size, rx_done_q_size, tx_ring_size, rx_ring_size; /* Do we ever need to reset the chip??? */ @@ -878,62 +1104,58 @@ static int netdev_open(struct net_device if (debug > 1) printk(KERN_DEBUG "%s: netdev_open() irq %d.\n", dev->name, dev->irq); - /* Allocate the various queues, failing gracefully. */ - if (np->tx_done_q == 0) - np->tx_done_q = pci_alloc_consistent(np->pci_dev, PAGE_SIZE, &np->tx_done_q_dma); - if (np->rx_done_q == 0) - np->rx_done_q = pci_alloc_consistent(np->pci_dev, sizeof(struct rx_done_desc) * DONE_Q_SIZE, &np->rx_done_q_dma); - if (np->tx_ring == 0) - np->tx_ring = pci_alloc_consistent(np->pci_dev, PAGE_SIZE, &np->tx_ring_dma); - if (np->rx_ring == 0) - np->rx_ring = pci_alloc_consistent(np->pci_dev, PAGE_SIZE, &np->rx_ring_dma); - if (np->tx_done_q == 0 || np->rx_done_q == 0 - || np->rx_ring == 0 || np->tx_ring == 0) { - if (np->tx_done_q) - pci_free_consistent(np->pci_dev, PAGE_SIZE, - np->tx_done_q, np->tx_done_q_dma); - if (np->rx_done_q) - pci_free_consistent(np->pci_dev, sizeof(struct rx_done_desc) * DONE_Q_SIZE, - np->rx_done_q, np->rx_done_q_dma); - if (np->tx_ring) - pci_free_consistent(np->pci_dev, PAGE_SIZE, - np->tx_ring, np->tx_ring_dma); - if (np->rx_ring) - pci_free_consistent(np->pci_dev, PAGE_SIZE, - np->rx_ring, np->rx_ring_dma); + /* Allocate the various queues. 
*/ + tx_done_q_size = ((sizeof(struct tx_done_desc) * DONE_Q_SIZE + QUEUE_ALIGN - 1) / QUEUE_ALIGN) * QUEUE_ALIGN; + rx_done_q_size = ((sizeof(rx_done_desc) * DONE_Q_SIZE + QUEUE_ALIGN - 1) / QUEUE_ALIGN) * QUEUE_ALIGN; + tx_ring_size = ((sizeof(starfire_tx_desc) * TX_RING_SIZE + QUEUE_ALIGN - 1) / QUEUE_ALIGN) * QUEUE_ALIGN; + rx_ring_size = sizeof(struct starfire_rx_desc) * RX_RING_SIZE; + np->queue_mem_size = tx_done_q_size + rx_done_q_size + tx_ring_size + rx_ring_size; + np->queue_mem = pci_alloc_consistent(np->pci_dev, np->queue_mem_size, &np->queue_mem_dma); + if (np->queue_mem == 0) { COMPAT_MOD_DEC_USE_COUNT; return -ENOMEM; } + np->tx_done_q = np->queue_mem; + np->tx_done_q_dma = np->queue_mem_dma; + np->rx_done_q = (void *) np->tx_done_q + tx_done_q_size; + np->rx_done_q_dma = np->tx_done_q_dma + tx_done_q_size; + np->tx_ring = (void *) np->rx_done_q + rx_done_q_size; + np->tx_ring_dma = np->rx_done_q_dma + rx_done_q_size; + np->rx_ring = (void *) np->tx_ring + tx_ring_size; + np->rx_ring_dma = np->tx_ring_dma + tx_ring_size; + + /* Start with no carrier, it gets adjusted later */ netif_carrier_off(dev); init_ring(dev); /* Set the size of the Rx buffers. */ writel((np->rx_buf_sz << RxBufferLenShift) | (0 << RxMinDescrThreshShift) | RxPrefetchMode | RxVariableQ | + RX_Q_ENTRIES | + RX_DESC_Q_ADDR_SIZE | RX_DESC_ADDR_SIZE | RxDescSpace4, ioaddr + RxDescQCtrl); -#ifdef ZEROCOPY - /* Set Tx descriptor to type 0 and spacing to 64 bytes. */ - writel((2 << TxHiPriFIFOThreshShift) | - (0 << TxPadLenShift) | - (4 << TxDMABurstSizeShift) | - TxDescSpace64 | TxDescType0, - ioaddr + TxDescCtrl); -#else /* not ZEROCOPY */ - /* Set Tx descriptor to type 1 and padding to 0 bytes. */ + /* Set up the Rx DMA controller. 
*/ + writel(RxChecksumIgnore | + (0 << RxEarlyIntThreshShift) | + (6 << RxHighPrioThreshShift) | + ((DMA_BURST_SIZE / 32) << RxBurstSizeShift), + ioaddr + RxDMACtrl); + + /* Set Tx descriptor */ writel((2 << TxHiPriFIFOThreshShift) | (0 << TxPadLenShift) | - (4 << TxDMABurstSizeShift) | - TxDescSpaceUnlim | TxDescType1, + ((DMA_BURST_SIZE / 32) << TxDMABurstSizeShift) | + TX_DESC_Q_ADDR_SIZE | + TX_DESC_SPACING | TX_DESC_TYPE, ioaddr + TxDescCtrl); -#endif /* not ZEROCOPY */ -#if defined(ADDR_64BITS) && defined(__alpha__) - /* XXX We really need a 64-bit PCI dma interfaces too... -DaveM */ - writel(np->rx_ring_dma >> 32, ioaddr + RxDescQHiAddr); - writel(np->tx_ring_dma >> 32, ioaddr + TxRingHiAddr); +#if defined(ADDR_64BITS) + writel(np->queue_mem_dma >> 32, ioaddr + RxDescQHiAddr); + writel(np->queue_mem_dma >> 32, ioaddr + TxRingHiAddr); + writel(np->queue_mem_dma >> 32, ioaddr + CompletionHiAddr); #else writel(0, ioaddr + RxDescQHiAddr); writel(0, ioaddr + TxRingHiAddr); @@ -943,32 +1165,23 @@ static int netdev_open(struct net_device writel(np->tx_ring_dma, ioaddr + TxRingPtr); writel(np->tx_done_q_dma, ioaddr + TxCompletionAddr); -#ifdef full_rx_status - writel(np->rx_done_q_dma | - RxComplType3 | - (0 << RxComplThreshShift), - ioaddr + RxCompletionAddr); -#else /* not full_rx_status */ -#ifdef csum_rx_status - writel(np->rx_done_q_dma | - RxComplType2 | - (0 << RxComplThreshShift), - ioaddr + RxCompletionAddr); -#else /* not csum_rx_status */ writel(np->rx_done_q_dma | - RxComplType0 | + RxComplType | (0 << RxComplThreshShift), ioaddr + RxCompletionAddr); -#endif /* not csum_rx_status */ -#endif /* not full_rx_status */ if (debug > 1) printk(KERN_DEBUG "%s: Filling in the station address.\n", dev->name); - /* Fill both the unused Tx SA register and the Rx perfect filter. */ + /* Fill both the Tx SA register and the Rx perfect filter. 
*/ for (i = 0; i < 6; i++) - writeb(dev->dev_addr[i], ioaddr + StationAddr + 5 - i); - for (i = 0; i < 16; i++) { + writeb(dev->dev_addr[i], ioaddr + TxStationAddr + 5 - i); + /* The first entry is special because it bypasses the VLAN filter. + Don't use it. */ + writew(0, ioaddr + PerfFilterTable); + writew(0, ioaddr + PerfFilterTable + 4); + writew(0, ioaddr + PerfFilterTable + 8); + for (i = 1; i < 16; i++) { u16 *eaddrs = (u16 *)dev->dev_addr; long setup_frm = ioaddr + PerfFilterTable + i * 16; writew(cpu_to_be16(eaddrs[2]), setup_frm); setup_frm += 4; @@ -978,16 +1191,14 @@ static int netdev_open(struct net_device /* Initialize other registers. */ /* Configure the PCI bus bursts and FIFO thresholds. */ - np->tx_mode = 0x0C04; /* modified when link is up. */ - writel(0x8000 | np->tx_mode, ioaddr + TxMode); + np->tx_mode = TxFlowEnable|RxFlowEnable|PadEnable; /* modified when link is up. */ + writel(MiiSoftReset | np->tx_mode, ioaddr + TxMode); udelay(1000); writel(np->tx_mode, ioaddr + TxMode); np->tx_threshold = 4; writel(np->tx_threshold, ioaddr + TxThreshold); - interrupt_mitigation &= 0x1f; - np->intr_mitigation = interrupt_mitigation; - writel(np->intr_mitigation, ioaddr + IntrTimerCtrl); + writel(np->intr_timer_ctrl, ioaddr + IntrTimerCtrl); netif_start_if(dev); netif_start_queue(dev); @@ -1002,29 +1213,35 @@ static int netdev_open(struct net_device /* Enable GPIO interrupts on link change */ writel(0x0f00ff00, ioaddr + GPIOCtrl); - /* Set the interrupt mask and enable PCI interrupts. */ + /* Set the interrupt mask */ writel(IntrRxDone | IntrRxEmpty | IntrDMAErr | - IntrTxDone | IntrStatsMax | IntrLinkChange | - IntrNormalSummary | IntrAbnormalSummary | + IntrTxDMADone | IntrStatsMax | IntrLinkChange | IntrRxGFPDead | IntrNoTxCsum | IntrTxBadID, ioaddr + IntrEnable); + /* Enable PCI interrupts. 
*/ writel(0x00800000 | readl(ioaddr + PCIDeviceConfig), ioaddr + PCIDeviceConfig); +#ifdef VLAN_SUPPORT + /* Set VLAN type to 802.1q */ + writel(ETH_P_8021Q, ioaddr + VlanType); +#endif /* VLAN_SUPPORT */ + #ifdef HAS_FIRMWARE /* Load Rx/Tx firmware into the frame processors */ for (i = 0; i < FIRMWARE_RX_SIZE * 2; i++) writel(firmware_rx[i], ioaddr + RxGfpMem + i * 4); for (i = 0; i < FIRMWARE_TX_SIZE * 2; i++) writel(firmware_tx[i], ioaddr + TxGfpMem + i * 4); - /* Enable the Rx and Tx units, and the Rx/Tx frame processors. */ - writel(0x003F, ioaddr + GenCtrl); -#else /* not HAS_FIRMWARE */ - /* Enable the Rx and Tx units only. */ - writel(0x000F, ioaddr + GenCtrl); -#endif /* not HAS_FIRMWARE */ +#endif /* HAS_FIRMWARE */ + if (enable_hw_cksum) + /* Enable the Rx and Tx units, and the Rx/Tx frame processors. */ + writel(TxEnable|TxGFPEnable|RxEnable|RxGFPEnable, ioaddr + GenCtrl); + else + /* Enable the Rx and Tx units only. */ + writel(TxEnable|RxEnable, ioaddr + GenCtrl); - if (debug > 2) + if (debug > 1) printk(KERN_DEBUG "%s: Done netdev_open().\n", dev->name); @@ -1036,11 +1253,17 @@ static void check_duplex(struct net_devi { struct netdev_private *np = dev->priv; u16 reg0; + int silly_count = 1000; mdio_write(dev, np->phys[0], MII_ADVERTISE, np->mii_if.advertising); mdio_write(dev, np->phys[0], MII_BMCR, BMCR_RESET); udelay(500); - while (mdio_read(dev, np->phys[0], MII_BMCR) & BMCR_RESET); + while (--silly_count && mdio_read(dev, np->phys[0], MII_BMCR) & BMCR_RESET) + /* do nothing */; + if (!silly_count) { + printk("%s: MII reset failed!\n", dev->name); + return; + } reg0 = mdio_read(dev, np->phys[0], MII_BMCR); @@ -1065,25 +1288,22 @@ static void tx_timeout(struct net_device { struct netdev_private *np = dev->priv; long ioaddr = dev->base_addr; + int old_debug; - printk(KERN_WARNING "%s: Transmit timed out, status %8.8x," - " resetting...\n", dev->name, (int)readl(ioaddr + IntrStatus)); - -#ifndef __alpha__ - { - int i; - printk(KERN_DEBUG " Rx ring 
%p: ", np->rx_ring); - for (i = 0; i < RX_RING_SIZE; i++) - printk(" %8.8x", (unsigned int)le32_to_cpu(np->rx_ring[i].rxaddr)); - printk("\n"KERN_DEBUG" Tx ring %p: ", np->tx_ring); - for (i = 0; i < TX_RING_SIZE; i++) - printk(" %4.4x", le32_to_cpu(np->tx_ring[i].status)); - printk("\n"); - } -#endif + printk(KERN_WARNING "%s: Transmit timed out, status %#8.8x, " + "resetting...\n", dev->name, (int) readl(ioaddr + IntrStatus)); /* Perhaps we should reinitialize the hardware here. */ - /* Stop and restart the chip's Tx processes . */ + + /* + * Stop and restart the interface. + * Cheat and increase the debug level temporarily. + */ + old_debug = debug; + debug = 2; + netdev_close(dev); + netdev_open(dev); + debug = old_debug; /* Trigger an immediate transmit demand. */ @@ -1099,9 +1319,8 @@ static void init_ring(struct net_device struct netdev_private *np = dev->priv; int i; - np->tx_full = 0; - np->cur_rx = np->cur_tx = 0; - np->dirty_rx = np->rx_done = np->dirty_tx = np->tx_done = 0; + np->cur_rx = np->cur_tx = np->reap_tx = 0; + np->dirty_rx = np->dirty_tx = np->rx_done = np->tx_done = 0; np->rx_buf_sz = (dev->mtu <= 1500 ? PKT_BUF_SZ : dev->mtu + 32); @@ -1114,7 +1333,7 @@ static void init_ring(struct net_device np->rx_info[i].mapping = pci_map_single(np->pci_dev, skb->tail, np->rx_buf_sz, PCI_DMA_FROMDEVICE); skb->dev = dev; /* Mark as being used by this device. */ /* Grrr, we cannot offset to correctly align the IP header. */ - np->rx_ring[i].rxaddr = cpu_to_le32(np->rx_info[i].mapping | RxDescValid); + np->rx_ring[i].rxaddr = cpu_to_dma(np->rx_info[i].mapping | RxDescValid); } writew(i - 1, dev->base_addr + RxDescQIdx); np->dirty_rx = (unsigned int)(i - RX_RING_SIZE); @@ -1126,7 +1345,7 @@ static void init_ring(struct net_device np->rx_info[i].mapping = 0; } /* Mark the last entry as wrapping the ring. 
*/ - np->rx_ring[i-1].rxaddr |= cpu_to_le32(RxDescEndRing); + np->rx_ring[RX_RING_SIZE - 1].rxaddr |= cpu_to_dma(RxDescEndRing); /* Clear the completion rings. */ for (i = 0; i < DONE_Q_SIZE; i++) { @@ -1134,18 +1353,9 @@ static void init_ring(struct net_device np->tx_done_q[i].status = 0; } - for (i = 0; i < TX_RING_SIZE; i++) { - np->tx_info[i].skb = NULL; - np->tx_info[i].first_mapping = 0; -#ifdef ZEROCOPY - { - int j; - for (j = 0; j < MAX_STARFIRE_FRAGS; j++) - np->tx_info[i].frag_mapping[j] = 0; - } -#endif /* ZEROCOPY */ - np->tx_ring[i].status = 0; - } + for (i = 0; i < TX_RING_SIZE; i++) + memset(&np->tx_info[i], 0, sizeof(np->tx_info[i])); + return; } @@ -1154,19 +1364,21 @@ static int start_tx(struct sk_buff *skb, { struct netdev_private *np = dev->priv; unsigned int entry; -#ifdef ZEROCOPY + u32 status; int i; -#endif kick_tx_timer(dev, tx_timeout, TX_TIMEOUT); - /* Caution: the write order is important here, set the field - with the "ownership" bits last. */ - - /* Calculate the next Tx descriptor entry. */ - entry = np->cur_tx % TX_RING_SIZE; + /* + * be cautious here, wrapping the queue has weird semantics + * and we may not have enough slots even when it seems we do. 
+ */ + if ((np->cur_tx - np->dirty_tx) + skb_num_frags(skb) * 2 > TX_RING_SIZE) { + netif_stop_queue(dev); + return 1; + } -#if defined(ZEROCOPY) && defined(HAS_FIRMWARE) && defined(HAS_BROKEN_FIRMWARE) +#if defined(ZEROCOPY) && defined(HAS_BROKEN_FIRMWARE) { int has_bad_length = 0; @@ -1183,85 +1395,72 @@ static int start_tx(struct sk_buff *skb, if (has_bad_length) skb_checksum_help(skb); } -#endif /* ZEROCOPY && HAS_FIRMWARE && HAS_BROKEN_FIRMWARE */ - - np->tx_info[entry].skb = skb; - np->tx_info[entry].first_mapping = - pci_map_single(np->pci_dev, skb->data, skb_first_frag_len(skb), PCI_DMA_TODEVICE); +#endif /* ZEROCOPY && HAS_BROKEN_FIRMWARE */ - np->tx_ring[entry].first_addr = cpu_to_le32(np->tx_info[entry].first_mapping); -#ifdef ZEROCOPY - np->tx_ring[entry].first_len = cpu_to_le16(skb_first_frag_len(skb)); - np->tx_ring[entry].total_len = cpu_to_le16(skb->len); - /* Add "| TxDescIntr" to generate Tx-done interrupts. */ - np->tx_ring[entry].status = cpu_to_le32(TxDescID | TxCRCEn); - np->tx_ring[entry].nbufs = cpu_to_le32(skb_shinfo(skb)->nr_frags + 1); -#else /* not ZEROCOPY */ - /* Add "| TxDescIntr" to generate Tx-done interrupts. 
*/ - np->tx_ring[entry].status = cpu_to_le32(skb->len | TxDescID | TxCRCEn | 1 << 16); -#endif /* not ZEROCOPY */ - - if (entry >= TX_RING_SIZE-1) /* Wrap ring */ - np->tx_ring[entry].status |= cpu_to_le32(TxRingWrap | TxDescIntr); - -#ifdef ZEROCOPY - if (skb->ip_summed == CHECKSUM_HW) { - np->tx_ring[entry].status |= cpu_to_le32(TxCalTCP); - np->stats.tx_compressed++; - } -#endif /* ZEROCOPY */ - - if (debug > 5) { -#ifdef ZEROCOPY - printk(KERN_DEBUG "%s: Tx #%d slot %d status %8.8x nbufs %d len %4.4x/%4.4x.\n", - dev->name, np->cur_tx, entry, - le32_to_cpu(np->tx_ring[entry].status), - le32_to_cpu(np->tx_ring[entry].nbufs), - le32_to_cpu(np->tx_ring[entry].first_len), - le32_to_cpu(np->tx_ring[entry].total_len)); -#else /* not ZEROCOPY */ - printk(KERN_DEBUG "%s: Tx #%d slot %d status %8.8x.\n", - dev->name, np->cur_tx, entry, - le32_to_cpu(np->tx_ring[entry].status)); -#endif /* not ZEROCOPY */ - } - -#ifdef ZEROCOPY - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - skb_frag_t *this_frag = &skb_shinfo(skb)->frags[i]; + entry = np->cur_tx % TX_RING_SIZE; + for (i = 0; i < skb_num_frags(skb); i++) { + int wrap_ring = 0; + status = TxDescID; + + if (i == 0) { + np->tx_info[entry].skb = skb; + status |= TxCRCEn; + if (entry >= TX_RING_SIZE - skb_num_frags(skb)) { + status |= TxRingWrap; + wrap_ring = 1; + } + if (np->reap_tx) { + status |= TxDescIntr; + np->reap_tx = 0; + } + if (skb->ip_summed == CHECKSUM_HW) { + status |= TxCalTCP; + np->stats.tx_compressed++; + } + status |= skb_first_frag_len(skb) | (skb_num_frags(skb) << 16); - /* we already have the proper value in entry */ - np->tx_info[entry].frag_mapping[i] = - pci_map_single(np->pci_dev, page_address(this_frag->page) + this_frag->page_offset, this_frag->size, PCI_DMA_TODEVICE); - - np->tx_ring[entry].frag[i].addr = cpu_to_le32(np->tx_info[entry].frag_mapping[i]); - np->tx_ring[entry].frag[i].len = cpu_to_le32(this_frag->size); - if (debug > 5) { - printk(KERN_DEBUG "%s: Tx #%d frag %d len %4.4x.\n", 
- dev->name, np->cur_tx, i, - le32_to_cpu(np->tx_ring[entry].frag[i].len)); - } + np->tx_info[entry].mapping = + pci_map_single(np->pci_dev, skb->data, skb_first_frag_len(skb), PCI_DMA_TODEVICE); + } else { +#ifdef MAX_SKB_FRAGS + skb_frag_t *this_frag = &skb_shinfo(skb)->frags[i - 1]; + status |= this_frag->size; + np->tx_info[entry].mapping = + pci_map_single(np->pci_dev, page_address(this_frag->page) + this_frag->page_offset, this_frag->size, PCI_DMA_TODEVICE); +#endif /* MAX_SKB_FRAGS */ + } + + np->tx_ring[entry].addr = cpu_to_dma(np->tx_info[entry].mapping); + np->tx_ring[entry].status = cpu_to_le32(status); + if (debug > 3) + printk(KERN_DEBUG "%s: Tx #%d/#%d slot %d status %#8.8x.\n", + dev->name, np->cur_tx, np->dirty_tx, + entry, status); + if (wrap_ring) { + np->tx_info[entry].used_slots = TX_RING_SIZE - entry; + np->cur_tx += np->tx_info[entry].used_slots; + entry = 0; + } else { + np->tx_info[entry].used_slots = 1; + np->cur_tx += np->tx_info[entry].used_slots; + entry++; + } + /* scavenge the tx descriptors twice per TX_RING_SIZE */ + if (np->cur_tx % (TX_RING_SIZE / 2) == 0) + np->reap_tx = 1; } -#endif /* ZEROCOPY */ - - np->cur_tx++; - - if (entry >= TX_RING_SIZE-1) /* Wrap ring */ - entry = -1; - entry++; /* Non-x86: explicitly flush descriptor cache lines here. */ - /* Ensure everything is written back above before the transmit is + /* Ensure all descriptors are written back before the transmit is initiated. - Jes */ wmb(); /* Update the producer index. */ - writel(entry * (sizeof(struct starfire_tx_desc) / 8), dev->base_addr + TxProducerIdx); + writel(entry * (sizeof(starfire_tx_desc) / 8), dev->base_addr + TxProducerIdx); - if (np->cur_tx - np->dirty_tx >= TX_RING_SIZE - 1) { - np->tx_full = 1; + /* 4 is arbitrary, but should be ok */ + if ((np->cur_tx - np->dirty_tx) + 4 > TX_RING_SIZE) netif_stop_queue(dev); - } dev->trans_start = jiffies; @@ -1273,20 +1472,13 @@ static int start_tx(struct sk_buff *skb, after the Tx thread. 
*/ static void intr_handler(int irq, void *dev_instance, struct pt_regs *rgs) { - struct net_device *dev = (struct net_device *)dev_instance; + struct net_device *dev = dev_instance; struct netdev_private *np; long ioaddr; int boguscnt = max_interrupt_work; int consumer; int tx_status; -#ifndef final_version /* Can never occur. */ - if (dev == NULL) { - printk (KERN_ERR "Netdev interrupt handler(): IRQ %d for unknown device.\n", irq); - return; - } -#endif - ioaddr = dev->base_addr; np = dev->priv; @@ -1294,83 +1486,69 @@ static void intr_handler(int irq, void * u32 intr_status = readl(ioaddr + IntrClear); if (debug > 4) - printk(KERN_DEBUG "%s: Interrupt status %4.4x.\n", + printk(KERN_DEBUG "%s: Interrupt status %#8.8x.\n", dev->name, intr_status); - if (intr_status == 0) + if (intr_status == 0 || intr_status == (u32) -1) break; - if (intr_status & IntrRxDone) - netdev_rx(dev); + if (intr_status & (IntrRxDone | IntrRxEmpty)) + netdev_rx(dev, ioaddr); /* Scavenge the skbuff list based on the Tx-done queue. There are redundant checks here that may be cleaned up after the driver has proven to be reliable. 
*/ consumer = readl(ioaddr + TxConsumerIdx); - if (debug > 4) + if (debug > 3) printk(KERN_DEBUG "%s: Tx Consumer index is %d.\n", dev->name, consumer); -#if 0 - if (np->tx_done >= 250 || np->tx_done == 0) - printk(KERN_DEBUG "%s: Tx completion entry %d is %8.8x, %d is %8.8x.\n", - dev->name, np->tx_done, - le32_to_cpu(np->tx_done_q[np->tx_done].status), - (np->tx_done+1) & (DONE_Q_SIZE-1), - le32_to_cpu(np->tx_done_q[(np->tx_done+1)&(DONE_Q_SIZE-1)].status)); -#endif while ((tx_status = le32_to_cpu(np->tx_done_q[np->tx_done].status)) != 0) { - if (debug > 4) - printk(KERN_DEBUG "%s: Tx completion entry %d is %8.8x.\n", - dev->name, np->tx_done, tx_status); + if (debug > 3) + printk(KERN_DEBUG "%s: Tx completion #%d entry %d is %#8.8x.\n", + dev->name, np->dirty_tx, np->tx_done, tx_status); if ((tx_status & 0xe0000000) == 0xa0000000) { np->stats.tx_packets++; } else if ((tx_status & 0xe0000000) == 0x80000000) { - struct sk_buff *skb; -#ifdef ZEROCOPY - int i; -#endif /* ZEROCOPY */ - u16 entry = tx_status; /* Implicit truncate */ - entry /= sizeof(struct starfire_tx_desc); - - skb = np->tx_info[entry].skb; + u16 entry = (tx_status & 0x7fff) / sizeof(starfire_tx_desc); + struct sk_buff *skb = np->tx_info[entry].skb; np->tx_info[entry].skb = NULL; pci_unmap_single(np->pci_dev, - np->tx_info[entry].first_mapping, + np->tx_info[entry].mapping, skb_first_frag_len(skb), PCI_DMA_TODEVICE); - np->tx_info[entry].first_mapping = 0; - -#ifdef ZEROCOPY - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - pci_unmap_single(np->pci_dev, - np->tx_info[entry].frag_mapping[i], - skb_shinfo(skb)->frags[i].size, - PCI_DMA_TODEVICE); - np->tx_info[entry].frag_mapping[i] = 0; + np->tx_info[entry].mapping = 0; + np->dirty_tx += np->tx_info[entry].used_slots; + entry = (entry + np->tx_info[entry].used_slots) % TX_RING_SIZE; +#ifdef MAX_SKB_FRAGS + { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + pci_unmap_single(np->pci_dev, + np->tx_info[entry].mapping, + 
skb_shinfo(skb)->frags[i].size, + PCI_DMA_TODEVICE); + np->dirty_tx++; + entry++; + } } -#endif /* ZEROCOPY */ - - /* Scavenge the descriptor. */ +#endif /* MAX_SKB_FRAGS */ dev_kfree_skb_irq(skb); - - np->dirty_tx++; } np->tx_done_q[np->tx_done].status = 0; - np->tx_done = (np->tx_done+1) & (DONE_Q_SIZE-1); + np->tx_done = (np->tx_done + 1) % DONE_Q_SIZE; } writew(np->tx_done, ioaddr + CompletionQConsumerIdx + 2); - if (np->tx_full && np->cur_tx - np->dirty_tx < TX_RING_SIZE - 4) { + if (netif_queue_stopped(dev) && + (np->cur_tx - np->dirty_tx + 4 < TX_RING_SIZE)) { /* The ring is no longer full, wake the queue. */ - np->tx_full = 0; netif_wake_queue(dev); } /* Stats overflow */ - if (intr_status & IntrStatsMax) { + if (intr_status & IntrStatsMax) get_stats(dev); - } /* Media change interrupt. */ if (intr_status & IntrLinkChange) @@ -1381,72 +1559,58 @@ static void intr_handler(int irq, void * netdev_error(dev, intr_status); if (--boguscnt < 0) { - printk(KERN_WARNING "%s: Too much work at interrupt, " - "status=0x%4.4x.\n", - dev->name, intr_status); + if (debug > 1) + printk(KERN_WARNING "%s: Too much work at interrupt, " + "status=%#8.8x.\n", + dev->name, intr_status); break; } } while (1); if (debug > 4) - printk(KERN_DEBUG "%s: exiting interrupt, status=%#4.4x.\n", - dev->name, (int)readl(ioaddr + IntrStatus)); - -#ifndef final_version - /* Code that should never be run! Remove after testing.. */ - { - static int stopit = 10; - if (!netif_running(dev) && --stopit < 0) { - printk(KERN_ERR "%s: Emergency stop, looping startup interrupt.\n", - dev->name); - free_irq(irq, dev); - } - } -#endif + printk(KERN_DEBUG "%s: exiting interrupt, status=%#8.8x.\n", + dev->name, (int) readl(ioaddr + IntrStatus)); } -/* This routine is logically part of the interrupt handler, but separated - for clarity and better register allocation. 
*/ -static int netdev_rx(struct net_device *dev) +/* This routine is logically part of the interrupt/poll handler, but separated + for clarity, code sharing between NAPI/non-NAPI, and better register allocation. */ +static int __netdev_rx(struct net_device *dev, int *quota) { struct netdev_private *np = dev->priv; - int boguscnt = np->dirty_rx + RX_RING_SIZE - np->cur_rx; u32 desc_status; - - if (np->rx_done_q == 0) { - printk(KERN_ERR "%s: rx_done_q is NULL! rx_done is %d. %p.\n", - dev->name, np->rx_done, np->tx_done_q); - return 0; - } + int retcode = 0; /* If EOP is set on the next entry, it's a new packet. Send it up. */ while ((desc_status = le32_to_cpu(np->rx_done_q[np->rx_done].status)) != 0) { struct sk_buff *skb; u16 pkt_len; int entry; + rx_done_desc *desc = &np->rx_done_q[np->rx_done]; if (debug > 4) - printk(KERN_DEBUG " netdev_rx() status of %d was %8.8x.\n", np->rx_done, desc_status); - if (--boguscnt < 0) - break; - if ( ! (desc_status & RxOK)) { + printk(KERN_DEBUG " netdev_rx() status of %d was %#8.8x.\n", np->rx_done, desc_status); + if (!(desc_status & RxOK)) { /* There was a error. */ if (debug > 2) - printk(KERN_DEBUG " netdev_rx() Rx error was %8.8x.\n", desc_status); + printk(KERN_DEBUG " netdev_rx() Rx error was %#8.8x.\n", desc_status); np->stats.rx_errors++; if (desc_status & RxFIFOErr) np->stats.rx_fifo_errors++; goto next_rx; } + if (*quota <= 0) { /* out of rx quota */ + retcode = 1; + goto out; + } + (*quota)--; + pkt_len = desc_status; /* Implicitly Truncate */ entry = (desc_status >> 16) & 0x7ff; -#ifndef final_version if (debug > 4) - printk(KERN_DEBUG " netdev_rx() normal Rx pkt length %d, bogus_cnt %d.\n", pkt_len, boguscnt); -#endif + printk(KERN_DEBUG " netdev_rx() normal Rx pkt length %d, quota %d.\n", pkt_len, *quota); /* Check if the packet is long enough to accept without copying to a minimally-sized skbuff. 
*/ if (pkt_len < rx_copybreak @@ -1456,12 +1620,8 @@ static int netdev_rx(struct net_device * pci_dma_sync_single(np->pci_dev, np->rx_info[entry].mapping, pkt_len, PCI_DMA_FROMDEVICE); -#if HAS_IP_COPYSUM /* Call copy + cksum if available. */ eth_copy_and_sum(skb, np->rx_info[entry].skb->tail, pkt_len, 0); skb_put(skb, pkt_len); -#else - memcpy(skb_put(skb, pkt_len), np->rx_info[entry].skb->tail, pkt_len); -#endif } else { pci_unmap_single(np->pci_dev, np->rx_info[entry].mapping, np->rx_buf_sz, PCI_DMA_FROMDEVICE); skb = np->rx_info[entry].skb; @@ -1473,51 +1633,109 @@ static int netdev_rx(struct net_device * /* You will want this info for the initial debug. */ if (debug > 5) printk(KERN_DEBUG " Rx data %2.2x:%2.2x:%2.2x:%2.2x:%2.2x:" - "%2.2x %2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x %2.2x%2.2x " - "%d.%d.%d.%d.\n", + "%2.2x %2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x %2.2x%2.2x.\n", skb->data[0], skb->data[1], skb->data[2], skb->data[3], skb->data[4], skb->data[5], skb->data[6], skb->data[7], skb->data[8], skb->data[9], skb->data[10], - skb->data[11], skb->data[12], skb->data[13], - skb->data[14], skb->data[15], skb->data[16], - skb->data[17]); + skb->data[11], skb->data[12], skb->data[13]); #endif + skb->protocol = eth_type_trans(skb, dev); -#if defined(full_rx_status) || defined(csum_rx_status) - if (le32_to_cpu(np->rx_done_q[np->rx_done].status2) & 0x01000000) { +#if defined(HAS_FIRMWARE) || defined(VLAN_SUPPORT) + if (debug > 4) + printk(KERN_DEBUG " netdev_rx() status2 of %d was %#4.4x.\n", np->rx_done, le16_to_cpu(desc->status2)); +#endif +#ifdef HAS_FIRMWARE + if (le16_to_cpu(desc->status2) & 0x0100) { skb->ip_summed = CHECKSUM_UNNECESSARY; np->stats.rx_compressed++; } /* * This feature doesn't seem to be working, at least * with the two firmware versions I have. If the GFP sees - * a fragment, it either ignores it completely, or reports + * an IP fragment, it either ignores it completely, or reports * "bad checksum" on it. 
* * Maybe I missed something -- corrections are welcome. * Until then, the printk stays. :-) -Ion */ - else if (le32_to_cpu(np->rx_done_q[np->rx_done].status2) & 0x00400000) { + else if (le16_to_cpu(desc->status2) & 0x0040) { skb->ip_summed = CHECKSUM_HW; - skb->csum = le32_to_cpu(np->rx_done_q[np->rx_done].status2) & 0xffff; - printk(KERN_DEBUG "%s: checksum_hw, status2 = %x\n", dev->name, np->rx_done_q[np->rx_done].status2); + skb->csum = le16_to_cpu(desc->csum); + printk(KERN_DEBUG "%s: checksum_hw, status2 = %#x\n", dev->name, le16_to_cpu(desc->status2)); } -#endif - netif_rx(skb); +#endif /* HAS_FIRMWARE */ +#ifdef VLAN_SUPPORT + if (np->vlgrp && le16_to_cpu(desc->status2) & 0x0200) { + if (debug > 4) + printk(KERN_DEBUG " netdev_rx() vlanid = %d\n", le16_to_cpu(desc->vlanid)); + /* vlan_hwaccel_receive_skb() expects a packet with the VLAN tag stripped out */ + vlan_hwaccel_receive_skb(skb, np->vlgrp, le16_to_cpu(desc->vlanid) & VLAN_VID_MASK); + } else +#endif /* VLAN_SUPPORT */ + netif_receive_skb(skb); dev->last_rx = jiffies; np->stats.rx_packets++; -next_rx: + next_rx: np->cur_rx++; - np->rx_done_q[np->rx_done].status = 0; - np->rx_done = (np->rx_done + 1) & (DONE_Q_SIZE-1); + desc->status = 0; + np->rx_done = (np->rx_done + 1) % DONE_Q_SIZE; } writew(np->rx_done, dev->base_addr + CompletionQConsumerIdx); + out: + refill_rx_ring(dev); + if (debug > 5) + printk(KERN_DEBUG " exiting netdev_rx(): %d, status of %d was %#8.8x.\n", + retcode, np->rx_done, desc_status); + return retcode; +} + + +#ifdef HAVE_NETDEV_POLL +static int netdev_poll(struct net_device *dev, int *budget) +{ + u32 intr_status; + long ioaddr = dev->base_addr; + int retcode = 0, quota = dev->quota; + + do { + writel(IntrRxDone | IntrRxEmpty, ioaddr + IntrClear); + + retcode = __netdev_rx(dev, "a); + *budget -= (dev->quota - quota); + dev->quota = quota; + if (retcode) + goto out; + + intr_status = readl(ioaddr + IntrStatus); + } while (intr_status & (IntrRxDone | IntrRxEmpty)); + + 
netif_rx_complete(dev); + intr_status = readl(ioaddr + IntrEnable); + intr_status |= IntrRxDone | IntrRxEmpty; + writel(intr_status, ioaddr + IntrEnable); + + out: + if (debug > 5) + printk(KERN_DEBUG " exiting netdev_poll(): %d.\n", retcode); + + /* Restart Rx engine if stopped. */ + return retcode; +} +#endif /* HAVE_NETDEV_POLL */ + + +static void refill_rx_ring(struct net_device *dev) +{ + struct netdev_private *np = dev->priv; + struct sk_buff *skb; + int entry = -1; + /* Refill the Rx ring buffers. */ for (; np->cur_rx - np->dirty_rx > 0; np->dirty_rx++) { - struct sk_buff *skb; - int entry = np->dirty_rx % RX_RING_SIZE; + entry = np->dirty_rx % RX_RING_SIZE; if (np->rx_info[entry].skb == NULL) { skb = dev_alloc_skb(np->rx_buf_sz); np->rx_info[entry].skb = skb; @@ -1527,20 +1745,13 @@ next_rx: pci_map_single(np->pci_dev, skb->tail, np->rx_buf_sz, PCI_DMA_FROMDEVICE); skb->dev = dev; /* Mark as being used by this device. */ np->rx_ring[entry].rxaddr = - cpu_to_le32(np->rx_info[entry].mapping | RxDescValid); + cpu_to_dma(np->rx_info[entry].mapping | RxDescValid); } if (entry == RX_RING_SIZE - 1) - np->rx_ring[entry].rxaddr |= cpu_to_le32(RxDescEndRing); - /* We could defer this until later... */ - writew(entry, dev->base_addr + RxDescQIdx); + np->rx_ring[entry].rxaddr |= cpu_to_dma(RxDescEndRing); } - - if (debug > 5) - printk(KERN_DEBUG " exiting netdev_rx() status of %d was %8.8x.\n", - np->rx_done, desc_status); - - /* Restart Rx engine if stopped. */ - return 0; + if (entry >= 0) + writew(entry, dev->base_addr + RxDescQIdx); } @@ -1550,6 +1761,7 @@ static void netdev_media_change(struct n long ioaddr = dev->base_addr; u16 reg0, reg1, reg4, reg5; u32 new_tx_mode; + u32 new_intr_timer_ctrl; /* reset status first */ mdio_read(dev, np->phys[0], MII_BMCR); @@ -1594,15 +1806,23 @@ static void netdev_media_change(struct n np->speed100 ? "100" : "10", np->mii_if.full_duplex ? 
"full" : "half"); - new_tx_mode = np->tx_mode & ~0x2; /* duplex setting */ + new_tx_mode = np->tx_mode & ~FullDuplex; /* duplex setting */ if (np->mii_if.full_duplex) - new_tx_mode |= 2; + new_tx_mode |= FullDuplex; if (np->tx_mode != new_tx_mode) { np->tx_mode = new_tx_mode; - writel(np->tx_mode | 0x8000, ioaddr + TxMode); + writel(np->tx_mode | MiiSoftReset, ioaddr + TxMode); udelay(1000); writel(np->tx_mode, ioaddr + TxMode); } + + new_intr_timer_ctrl = np->intr_timer_ctrl & ~Timer10X; + if (np->speed100) + new_intr_timer_ctrl |= Timer10X; + if (np->intr_timer_ctrl != new_intr_timer_ctrl) { + np->intr_timer_ctrl = new_intr_timer_ctrl; + writel(new_intr_timer_ctrl, ioaddr + IntrTimerCtrl); + } } else { netif_carrier_off(dev); printk(KERN_DEBUG "%s: Link is down\n", dev->name); @@ -1616,9 +1836,12 @@ static void netdev_error(struct net_devi /* Came close to underrunning the Tx FIFO, increase threshold. */ if (intr_status & IntrTxDataLow) { - writel(++np->tx_threshold, dev->base_addr + TxThreshold); - printk(KERN_NOTICE "%s: Increasing Tx FIFO threshold to %d bytes\n", - dev->name, np->tx_threshold * 16); + if (np->tx_threshold <= PKT_BUF_SZ / 16) { + writel(++np->tx_threshold, dev->base_addr + TxThreshold); + printk(KERN_NOTICE "%s: Increasing Tx FIFO threshold to %d bytes\n", + dev->name, np->tx_threshold * 16); + } else + printk(KERN_WARNING "%s: PCI Tx underflow -- adapter is probably malfunctioning\n", dev->name); } if (intr_status & IntrRxGFPDead) { np->stats.rx_fifo_errors++; @@ -1629,7 +1852,7 @@ static void netdev_error(struct net_devi np->stats.tx_errors++; } if ((intr_status & ~(IntrNormalMask | IntrAbnormalSummary | IntrLinkChange | IntrStatsMax | IntrTxDataLow | IntrRxGFPDead | IntrNoTxCsum | IntrPCIPad)) && debug) - printk(KERN_ERR "%s: Something Wicked happened! %4.4x.\n", + printk(KERN_ERR "%s: Something Wicked happened! 
%#8.8x.\n", dev->name, intr_status); } @@ -1664,39 +1887,67 @@ static struct net_device_stats *get_stat /* Chips may use the upper or lower CRC bits, and may reverse and/or invert them. Select the endian-ness that results in minimal calculations. */ - static void set_rx_mode(struct net_device *dev) { long ioaddr = dev->base_addr; - u32 rx_mode; + u32 rx_mode = MinVLANPrio; struct dev_mc_list *mclist; int i; +#ifdef VLAN_SUPPORT + struct netdev_private *np = dev->priv; + + rx_mode |= VlanMode; + if (np->vlgrp) { + int vlan_count = 0; + long filter_addr = ioaddr + HashTable + 8; + for (i = 0; i < VLAN_VID_MASK; i++) { + if (np->vlgrp->vlan_devices[i]) { + if (vlan_count >= 32) + break; + writew(cpu_to_be16(i), filter_addr); + filter_addr += 16; + vlan_count++; + } + } + if (i == VLAN_VID_MASK) { + rx_mode |= PerfectFilterVlan; + while (vlan_count < 32) { + writew(0, filter_addr); + filter_addr += 16; + vlan_count++; + } + } + } +#endif /* VLAN_SUPPORT */ if (dev->flags & IFF_PROMISC) { /* Set promiscuous. */ - rx_mode = AcceptBroadcast|AcceptAllMulticast|AcceptAll|AcceptMyPhys; + rx_mode |= AcceptAll; } else if ((dev->mc_count > multicast_filter_limit) || (dev->flags & IFF_ALLMULTI)) { /* Too many to match, or accept all multicasts. */ - rx_mode = AcceptBroadcast|AcceptAllMulticast|AcceptMyPhys; - } else if (dev->mc_count <= 15) { - /* Use the 16 element perfect filter, skip first entry. */ - long filter_addr = ioaddr + PerfFilterTable + 1 * 16; - for (i = 1, mclist = dev->mc_list; mclist && i <= dev->mc_count; + rx_mode |= AcceptBroadcast|AcceptAllMulticast|PerfectFilter; + } else if (dev->mc_count <= 14) { + /* Use the 16 element perfect filter, skip first two entries. 
*/ + long filter_addr = ioaddr + PerfFilterTable + 2 * 16; + u16 *eaddrs; + for (i = 2, mclist = dev->mc_list; mclist && i < dev->mc_count + 2; i++, mclist = mclist->next) { - u16 *eaddrs = (u16 *)mclist->dmi_addr; + eaddrs = (u16 *)mclist->dmi_addr; writew(cpu_to_be16(eaddrs[2]), filter_addr); filter_addr += 4; writew(cpu_to_be16(eaddrs[1]), filter_addr); filter_addr += 4; writew(cpu_to_be16(eaddrs[0]), filter_addr); filter_addr += 8; } + eaddrs = (u16 *)dev->dev_addr; while (i++ < 16) { - writew(0xffff, filter_addr); filter_addr += 4; - writew(0xffff, filter_addr); filter_addr += 4; - writew(0xffff, filter_addr); filter_addr += 8; + writew(cpu_to_be16(eaddrs[0]), filter_addr); filter_addr += 4; + writew(cpu_to_be16(eaddrs[1]), filter_addr); filter_addr += 4; + writew(cpu_to_be16(eaddrs[2]), filter_addr); filter_addr += 8; } - rx_mode = AcceptBroadcast | AcceptMyPhys; + rx_mode |= AcceptBroadcast|PerfectFilter; } else { /* Must use a multicast hash table. */ long filter_addr; + u16 *eaddrs; u16 mc_filter[32] __attribute__ ((aligned(sizeof(long)))); /* Multicast hash filter */ memset(mc_filter, 0, sizeof(mc_filter)); @@ -1707,17 +1958,19 @@ static void set_rx_mode(struct net_devic *fptr |= cpu_to_le32(1 << (bit_nr & 31)); } - /* Clear the perfect filter list, skip first entry. */ - filter_addr = ioaddr + PerfFilterTable + 1 * 16; - for (i = 1; i < 16; i++) { - writew(0xffff, filter_addr); filter_addr += 4; - writew(0xffff, filter_addr); filter_addr += 4; - writew(0xffff, filter_addr); filter_addr += 8; + /* Clear the perfect filter list, skip first two entries. 
*/ + filter_addr = ioaddr + PerfFilterTable + 2 * 16; + eaddrs = (u16 *)dev->dev_addr; + for (i = 2; i < 16; i++) { + writew(cpu_to_be16(eaddrs[0]), filter_addr); filter_addr += 4; + writew(cpu_to_be16(eaddrs[1]), filter_addr); filter_addr += 4; + writew(cpu_to_be16(eaddrs[2]), filter_addr); filter_addr += 8; } - for (filter_addr = ioaddr + HashTable, i=0; i < 32; filter_addr+= 16, i++) + for (filter_addr = ioaddr + HashTable, i = 0; i < 32; filter_addr+= 16, i++) writew(mc_filter[i], filter_addr); - rx_mode = AcceptBroadcast | AcceptMulticast | AcceptMyPhys; + rx_mode |= AcceptBroadcast|PerfectFilter|HashFilter; } + wmb(); writel(rx_mode, ioaddr + RxFilterMode); } @@ -1763,6 +2016,7 @@ static int netdev_ethtool_ioctl(struct n spin_lock_irq(&np->lock); r = mii_ethtool_sset(&np->mii_if, &ecmd); spin_unlock_irq(&np->lock); + check_duplex(dev); return r; } /* restart autonegotiation */ @@ -1816,7 +2070,7 @@ static int netdev_ioctl(struct net_devic spin_lock_irq(&np->lock); rc = generic_mii_ioctl(&np->mii_if, data, cmd, NULL); spin_unlock_irq(&np->lock); - + if ((cmd == SIOCSMIIREG) && (data->phy_id == np->phys[0])) check_duplex(dev); } @@ -1834,41 +2088,42 @@ static int netdev_close(struct net_devic netif_stop_if(dev); if (debug > 1) { - printk(KERN_DEBUG "%s: Shutting down ethercard, Intr status %4.4x.\n", - dev->name, (int)readl(ioaddr + IntrStatus)); - printk(KERN_DEBUG "%s: Queue pointers were Tx %d / %d, Rx %d / %d.\n", - dev->name, np->cur_tx, np->dirty_tx, np->cur_rx, np->dirty_rx); + printk(KERN_DEBUG "%s: Shutting down ethercard, Intr status %#8.8x.\n", + dev->name, (int) readl(ioaddr + IntrStatus)); + printk(KERN_DEBUG "%s: Queue pointers were Tx %d / %d, Rx %d / %d.\n", + dev->name, np->cur_tx, np->dirty_tx, + np->cur_rx, np->dirty_rx); } /* Disable interrupts by clearing the interrupt mask. */ writel(0, ioaddr + IntrEnable); /* Stop the chip's Tx and Rx processes. 
*/ + writel(0, ioaddr + GenCtrl); + readl(ioaddr + GenCtrl); -#ifdef __i386__ - if (debug > 2) { - printk("\n"KERN_DEBUG" Tx ring at %9.9Lx:\n", - (u64) np->tx_ring_dma); + if (debug > 5) { + printk(KERN_DEBUG" Tx ring at %#llx:\n", + (long long) np->tx_ring_dma); for (i = 0; i < 8 /* TX_RING_SIZE is huge! */; i++) - printk(KERN_DEBUG " #%d desc. %8.8x %8.8x -> %8.8x.\n", + printk(KERN_DEBUG " #%d desc. %#8.8x %#llx -> %#8.8x.\n", i, le32_to_cpu(np->tx_ring[i].status), - le32_to_cpu(np->tx_ring[i].first_addr), + (long long) dma_to_cpu(np->tx_ring[i].addr), le32_to_cpu(np->tx_done_q[i].status)); - printk(KERN_DEBUG " Rx ring at %9.9Lx -> %p:\n", - (u64) np->rx_ring_dma, np->rx_done_q); + printk(KERN_DEBUG " Rx ring at %#llx -> %p:\n", + (long long) np->rx_ring_dma, np->rx_done_q); if (np->rx_done_q) for (i = 0; i < 8 /* RX_RING_SIZE */; i++) { - printk(KERN_DEBUG " #%d desc. %8.8x -> %8.8x\n", - i, le32_to_cpu(np->rx_ring[i].rxaddr), le32_to_cpu(np->rx_done_q[i].status)); + printk(KERN_DEBUG " #%d desc. %#llx -> %#8.8x\n", + i, (long long) dma_to_cpu(np->rx_ring[i].rxaddr), le32_to_cpu(np->rx_done_q[i].status)); } } -#endif /* __i386__ debugging only */ free_irq(dev->irq, dev); /* Free all the skbuffs in the Rx queue. */ for (i = 0; i < RX_RING_SIZE; i++) { - np->rx_ring[i].rxaddr = cpu_to_le32(0xBADF00D0); /* An invalid address. */ + np->rx_ring[i].rxaddr = cpu_to_dma(0xBADF00D0); /* An invalid address. 
*/ if (np->rx_info[i].skb != NULL) { pci_unmap_single(np->pci_dev, np->rx_info[i].mapping, np->rx_buf_sz, PCI_DMA_FROMDEVICE); dev_kfree_skb(np->rx_info[i].skb); @@ -1878,28 +2133,14 @@ static int netdev_close(struct net_devic } for (i = 0; i < TX_RING_SIZE; i++) { struct sk_buff *skb = np->tx_info[i].skb; -#ifdef ZEROCOPY - int j; -#endif /* ZEROCOPY */ if (skb == NULL) continue; pci_unmap_single(np->pci_dev, - np->tx_info[i].first_mapping, + np->tx_info[i].mapping, skb_first_frag_len(skb), PCI_DMA_TODEVICE); - np->tx_info[i].first_mapping = 0; + np->tx_info[i].mapping = 0; dev_kfree_skb(skb); np->tx_info[i].skb = NULL; -#ifdef ZEROCOPY - for (j = 0; j < MAX_STARFIRE_FRAGS; j++) - if (np->tx_info[i].frag_mapping[j]) { - pci_unmap_single(np->pci_dev, - np->tx_info[i].frag_mapping[j], - skb_shinfo(skb)->frags[j].size, - PCI_DMA_TODEVICE); - np->tx_info[i].frag_mapping[j] = 0; - } else - break; -#endif /* ZEROCOPY */ } COMPAT_MOD_DEC_USE_COUNT; @@ -1917,19 +2158,7 @@ static void __devexit starfire_remove_on BUG(); np = dev->priv; - if (np->tx_done_q) - pci_free_consistent(pdev, PAGE_SIZE, - np->tx_done_q, np->tx_done_q_dma); - if (np->rx_done_q) - pci_free_consistent(pdev, - sizeof(struct rx_done_desc) * DONE_Q_SIZE, - np->rx_done_q, np->rx_done_q_dma); - if (np->tx_ring) - pci_free_consistent(pdev, PAGE_SIZE, - np->tx_ring, np->tx_ring_dma); - if (np->rx_ring) - pci_free_consistent(pdev, PAGE_SIZE, - np->rx_ring, np->rx_ring_dma); + pci_free_consistent(pdev, np->queue_mem_size, np->queue_mem, np->queue_mem_dma); unregister_netdev(dev); iounmap((char *)dev->base_addr); @@ -1954,6 +2183,17 @@ static int __init starfire_init (void) #ifdef MODULE printk(version); #endif +#ifndef ADDR_64BITS + /* we can do this test only at run-time... 
sigh */ + if (sizeof(dma_addr_t) == sizeof(u64)) { + printk("This driver has not been ported to this 64-bit architecture yet\n"); + return -ENODEV; + } +#endif /* not ADDR_64BITS */ +#ifndef HAS_FIRMWARE + /* unconditionally disable hw cksums if firmware is not present */ + enable_hw_cksum = 0; +#endif /* not HAS_FIRMWARE */ return pci_module_init (&starfire_driver); } @@ -1970,8 +2210,6 @@ module_exit(starfire_cleanup); /* * Local variables: - * compile-command: "gcc -DMODULE -Wall -Wstrict-prototypes -O2 -c starfire.c" - * simple-compile-command: "gcc -DMODULE -O2 -c starfire.c" * c-basic-offset: 8 * tab-width: 8 * End: diff -urpN -X /home/fletch/.diff.exclude 00-virgin/drivers/serial/8250.c 90-mjb/drivers/serial/8250.c --- 00-virgin/drivers/serial/8250.c Mon Jan 13 21:09:12 2003 +++ 90-mjb/drivers/serial/8250.c Wed Feb 5 22:23:05 2003 @@ -2031,9 +2031,116 @@ void serial8250_get_irq_map(unsigned int } } -static int __init serial8250_init(void) +#ifdef CONFIG_X86_REMOTE_DEBUG +/* + * Takes: + * ttyS - integer specifying which serial port to use for debugging + * baud - baud rate of specified serial port + * Returns: + * port for use by the gdb serial driver + */ +int gdb_serial_setup(int ttyS, int baud, int *port, int *irq) +{ + struct uart_8250_port *up; + unsigned cval; + int bits = 8; + int parity = 'n'; + int cflag = CREAD | HUPCL | CLOCAL; + int quot = 0; + + /* + * Now construct a cflag setting. 
+ */ + switch(baud) { + case 1200: + cflag |= B1200; + break; + case 2400: + cflag |= B2400; + break; + case 4800: + cflag |= B4800; + break; + case 19200: + cflag |= B19200; + break; + case 38400: + cflag |= B38400; + break; + case 57600: + cflag |= B57600; + break; + case 115200: + cflag |= B115200; + break; + case 9600: + default: + cflag |= B9600; + break; + } + switch(bits) { + case 7: + cflag |= CS7; + break; + default: + case 8: + cflag |= CS8; + break; + } + switch(parity) { + case 'o': case 'O': + cflag |= PARODD; + break; + case 'e': case 'E': + cflag |= PARENB; + break; + } + + /* + * Divisor, bytesize and parity + */ + + up = &serial8250_ports[ttyS]; +// ser->flags &= ~ASYNC_BOOT_AUTOCONF; + quot = ( 1843200 / 16 ) / baud; + cval = cflag & (CSIZE | CSTOPB); + cval >>= 4; + if (cflag & PARENB) + cval |= UART_LCR_PARITY; + if (!(cflag & PARODD)) + cval |= UART_LCR_EPAR; + + /* + * Disable UART interrupts, set DTR and RTS high + * and set speed. + */ + cval = 0x3; + serial_outp(up, UART_LCR, cval | UART_LCR_DLAB); /* set DLAB */ + serial_outp(up, UART_DLL, quot & 0xff); /* LS of divisor */ + serial_outp(up, UART_DLM, quot >> 8); /* MS of divisor */ + serial_outp(up, UART_LCR, cval); /* reset DLAB */ + serial_outp(up, UART_IER, UART_IER_RDI); /* turn on interrupts*/ + serial_outp(up, UART_MCR, UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS); + + /* + * If we read 0xff from the LSR, there is no UART here. + */ + if (serial_inp(up, UART_LSR) == 0xff) + return 1; + *port = up->port.iobase; + *irq = up->port.irq; +// serial8250_shutdown(&up->port); + return 0; +} +#endif + +int serial8250_init(void) { int ret, i; + static int didit = 0; + + if (didit++) + return 0; printk(KERN_INFO "Serial: 8250/16550 driver $Revision: 1.90 $ " "IRQ sharing %sabled\n", share_irqs ? 
"en" : "dis"); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/drivers/serial/core.c 90-mjb/drivers/serial/core.c --- 00-virgin/drivers/serial/core.c Mon Jan 13 21:09:12 2003 +++ 90-mjb/drivers/serial/core.c Wed Feb 5 22:23:05 2003 @@ -36,6 +36,10 @@ #include #include /* for serial_state and serial_icounter_struct */ +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + #include #include @@ -1040,6 +1044,17 @@ uart_ioctl(struct tty_struct *tty, struc (unsigned int *)arg); break; +#ifdef CONFIG_X86_REMOTE_DEBUG + case TIOCGDB: + ret = -ENOTTY; + if (capable(CAP_SYS_ADMIN)) { + gdb_ttyS = minor(tty->device) & 0x03F; + gdb_baud = tty_get_baud_rate(tty); + ret = gdb_hook(); + } + break; +#endif + case TIOCMBIS: case TIOCMBIC: case TIOCMSET: @@ -1115,6 +1130,30 @@ uart_ioctl(struct tty_struct *tty, struc } return ret; } + + /* + * ------------------------------------------------------------ + * Serial GDB driver (most in gdbserial.c) + * ------------------------------------------------------------ + */ + +#ifdef CONFIG_X86_REMOTE_DEBUG +#ifdef CONFIG_GDB_CONSOLE +static struct console gdbcons = { + name: "gdb", + write: gdb_console_write, + flags: CON_PRINTBUFFER | CON_ENABLED, + index: -1, +}; +#endif + +#ifdef CONFIG_GDB_CONSOLE +void __init gdb_console_init(void) +{ + register_console(&gdbcons); +} +#endif +#endif /* CONFIG_X86_REMOTE_DEBUG */ static void uart_set_termios(struct tty_struct *tty, struct termios *old_termios) { diff -urpN -X /home/fletch/.diff.exclude 00-virgin/fs/dcache.c 90-mjb/fs/dcache.c --- 00-virgin/fs/dcache.c Sun Dec 1 10:00:00 2002 +++ 90-mjb/fs/dcache.c Wed Feb 5 22:22:56 2003 @@ -24,6 +24,7 @@ #include #include #include +#include #include #define DCACHE_PARANOIA 1 @@ -55,6 +56,15 @@ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; +static void d_callback(void *arg) +{ + struct dentry * dentry = (struct dentry *)arg; + + if (dname_external(dentry)) + kfree(dentry->d_name.name); + kmem_cache_free(dentry_cache, dentry); +} + /* * no 
dcache_lock, please. The caller must decrement dentry_stat.nr_dentry * inside dcache_lock. @@ -63,9 +73,7 @@ static void d_free(struct dentry *dentry { if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); - if (dname_external(dentry)) - kfree(dentry->d_name.name); - kmem_cache_free(dentry_cache, dentry); + call_rcu(&dentry->d_rcu, d_callback, dentry); } /* @@ -126,9 +134,13 @@ repeat: if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) return; - /* dput on a free dentry? */ - if (!list_empty(&dentry->d_lru)) - BUG(); + spin_lock(&dentry->d_lock); + if (atomic_read(&dentry->d_count)) { + spin_unlock(&dentry->d_lock); + spin_unlock(&dcache_lock); + return; + } + /* * AV: ->d_delete() is _NOT_ allowed to block now. */ @@ -139,8 +151,12 @@ repeat: /* Unreachable? Get rid of it */ if (d_unhashed(dentry)) goto kill_it; - list_add(&dentry->d_lru, &dentry_unused); - dentry_stat.nr_unused++; + if (list_empty(&dentry->d_lru)) { + dentry->d_vfs_flags &= ~DCACHE_REFERENCED; + list_add(&dentry->d_lru, &dentry_unused); + dentry_stat.nr_unused++; + } + spin_unlock(&dentry->d_lock); dentry->d_vfs_flags |= DCACHE_REFERENCED; spin_unlock(&dcache_lock); return; @@ -150,7 +166,12 @@ unhash_it: kill_it: { struct dentry *parent; - list_del(&dentry->d_child); + if (!list_empty(&dentry->d_lru)) { + list_del(&dentry->d_lru); + dentry_stat.nr_unused--; + } + list_del(&dentry->d_child); + spin_unlock(&dentry->d_lock); dentry_stat.nr_dentry--; /* For d_free, below */ /* drops the lock, at that point nobody can reach this dentry */ dentry_iput(dentry); @@ -222,6 +243,7 @@ int d_invalidate(struct dentry * dentry) static inline struct dentry * __dget_locked(struct dentry *dentry) { atomic_inc(&dentry->d_count); + dentry->d_vfs_flags |= DCACHE_REFERENCED; if (atomic_read(&dentry->d_count) == 1) { dentry_stat.nr_unused--; list_del_init(&dentry->d_lru); @@ -289,8 +311,8 @@ restart: struct dentry *dentry = list_entry(tmp, struct dentry, d_alias); if 
(!atomic_read(&dentry->d_count)) { __dget_locked(dentry); + __d_drop(dentry); spin_unlock(&dcache_lock); - d_drop(dentry); dput(dentry); goto restart; } @@ -310,6 +332,7 @@ static inline void prune_one_dentry(stru __d_drop(dentry); list_del(&dentry->d_child); + spin_unlock(&dentry->d_lock); dentry_stat.nr_dentry--; /* For d_free, below */ dentry_iput(dentry); parent = dentry->d_parent; @@ -343,18 +366,20 @@ static void prune_dcache(int count) if (tmp == &dentry_unused) break; list_del_init(tmp); + dentry_stat.nr_unused--; dentry = list_entry(tmp, struct dentry, d_lru); + spin_lock(&dentry->d_lock); /* If the dentry was recently referenced, don't free it. */ if (dentry->d_vfs_flags & DCACHE_REFERENCED) { dentry->d_vfs_flags &= ~DCACHE_REFERENCED; - list_add(&dentry->d_lru, &dentry_unused); + if (!atomic_read(&dentry->d_count)) { + list_add(&dentry->d_lru, &dentry_unused); + dentry_stat.nr_unused++; + } + spin_unlock(&dentry->d_lock); continue; } - dentry_stat.nr_unused--; - - /* Unused dentry with a count? 
*/ - BUG_ON(atomic_read(&dentry->d_count)); prune_one_dentry(dentry); } spin_unlock(&dcache_lock); @@ -414,10 +439,13 @@ repeat: dentry = list_entry(tmp, struct dentry, d_lru); if (dentry->d_sb != sb) continue; - if (atomic_read(&dentry->d_count)) - continue; dentry_stat.nr_unused--; list_del_init(tmp); + spin_lock(&dentry->d_lock); + if (atomic_read(&dentry->d_count)) { + spin_unlock(&dentry->d_lock); + continue; + } prune_one_dentry(dentry); goto repeat; } @@ -497,8 +525,8 @@ resume: struct list_head *tmp = next; struct dentry *dentry = list_entry(tmp, struct dentry, d_child); next = tmp->next; + list_del_init(&dentry->d_lru); if (!atomic_read(&dentry->d_count)) { - list_del(&dentry->d_lru); list_add(&dentry->d_lru, dentry_unused.prev); found++; } @@ -561,8 +589,8 @@ void shrink_dcache_anon(struct list_head spin_lock(&dcache_lock); list_for_each(lp, head) { struct dentry *this = list_entry(lp, struct dentry, d_hash); + list_del(&this->d_lru); if (!atomic_read(&this->d_count)) { - list_del(&this->d_lru); list_add_tail(&this->d_lru, &dentry_unused); found++; } @@ -648,7 +676,8 @@ struct dentry * d_alloc(struct dentry * str[name->len] = 0; atomic_set(&dentry->d_count, 1); - dentry->d_vfs_flags = 0; + dentry->d_vfs_flags = DCACHE_UNHASHED; + dentry->d_lock = SPIN_LOCK_UNLOCKED; dentry->d_flags = 0; dentry->d_inode = NULL; dentry->d_parent = NULL; @@ -785,12 +814,15 @@ struct dentry * d_alloc_anon(struct inod res = tmp; tmp = NULL; if (res) { + spin_lock(&res->d_lock); res->d_sb = inode->i_sb; res->d_parent = res; res->d_inode = inode; res->d_flags |= DCACHE_DISCONNECTED; + res->d_vfs_flags &= ~DCACHE_UNHASHED; list_add(&res->d_alias, &inode->i_dentry); list_add(&res->d_hash, &inode->i_sb->s_anon); + spin_unlock(&res->d_lock); } inode = NULL; /* don't drop reference */ } @@ -859,30 +891,16 @@ struct dentry *d_splice_alias(struct ino struct dentry * d_lookup(struct dentry * parent, struct qstr * name) { - struct dentry * dentry; - spin_lock(&dcache_lock); - dentry = 
__d_lookup(parent,name); - if (dentry) - __dget_locked(dentry); - spin_unlock(&dcache_lock); - return dentry; -} - -struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) -{ - unsigned int len = name->len; unsigned int hash = name->hash; const unsigned char *str = name->name; struct list_head *head = d_hash(parent,hash); struct list_head *tmp; + struct dentry *found = NULL; - tmp = head->next; - for (;;) { + rcu_read_lock(); + __list_for_each_rcu(tmp, head) { struct dentry * dentry = list_entry(tmp, struct dentry, d_hash); - if (tmp == head) - break; - tmp = tmp->next; if (dentry->d_name.hash != hash) continue; if (dentry->d_parent != parent) @@ -896,9 +914,14 @@ struct dentry * __d_lookup(struct dentry if (memcmp(dentry->d_name.name, str, len)) continue; } - return dentry; - } - return NULL; + spin_lock(&dentry->d_lock); + if (!d_unhashed(dentry)) + found = dget(dentry); + spin_unlock(&dentry->d_lock); + break; + } + rcu_read_unlock(); + return found; } /** @@ -937,7 +960,7 @@ int d_validate(struct dentry *dentry, st lhp = base = d_hash(dparent, dentry->d_name.hash); while ((lhp = lhp->next) != base) { if (dentry == list_entry(lhp, struct dentry, d_hash)) { - __dget_locked(dentry); + dget(dentry); spin_unlock(&dcache_lock); return 1; } @@ -974,17 +997,18 @@ void d_delete(struct dentry * dentry) * Are we the only user? */ spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); if (atomic_read(&dentry->d_count) == 1) { + spin_unlock(&dentry->d_lock); dentry_iput(dentry); return; } - spin_unlock(&dcache_lock); - /* - * If not, just drop the dentry and let dput - * pick up the tab.. 
- */ - d_drop(dentry); + if (!d_unhashed(dentry)) + __d_drop(dentry); + + spin_unlock(&dentry->d_lock); + spin_unlock(&dcache_lock); } /** @@ -997,9 +1021,10 @@ void d_delete(struct dentry * dentry) void d_rehash(struct dentry * entry) { struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); - if (!d_unhashed(entry)) BUG(); spin_lock(&dcache_lock); - list_add(&entry->d_hash, list); + if (!list_empty(&entry->d_hash) && !d_unhashed(entry)) BUG(); + entry->d_vfs_flags &= ~DCACHE_UNHASHED; + list_add_rcu(&entry->d_hash, list); spin_unlock(&dcache_lock); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/fs/namei.c 90-mjb/fs/namei.c --- 00-virgin/fs/namei.c Mon Jan 13 21:09:27 2003 +++ 90-mjb/fs/namei.c Wed Feb 5 22:22:56 2003 @@ -286,27 +286,6 @@ static struct dentry * cached_lookup(str return dentry; } -/*for fastwalking*/ -static inline void unlock_nd(struct nameidata *nd) -{ - struct vfsmount *mnt = nd->old_mnt; - struct dentry *dentry = nd->old_dentry; - mntget(nd->mnt); - dget_locked(nd->dentry); - nd->old_mnt = NULL; - nd->old_dentry = NULL; - spin_unlock(&dcache_lock); - dput(dentry); - mntput(mnt); -} - -static inline void lock_nd(struct nameidata *nd) -{ - spin_lock(&dcache_lock); - nd->old_mnt = nd->mnt; - nd->old_dentry = nd->dentry; -} - /* * Short-cut version of permission(), for calling by * path_walk(), when dcache lock is held. 
Combines parts @@ -451,11 +430,18 @@ static int follow_mount(struct vfsmount { int res = 0; while (d_mountpoint(*dentry)) { - struct vfsmount *mounted = lookup_mnt(*mnt, *dentry); - if (!mounted) + struct vfsmount *mounted; + spin_lock(&dcache_lock); + mounted = lookup_mnt(*mnt, *dentry); + if (!mounted) { + spin_unlock(&dcache_lock); break; - *mnt = mounted; - *dentry = mounted->mnt_root; + } + *mnt = mntget(mounted); + spin_unlock(&dcache_lock); + dput(*dentry); + mntput(mounted->mnt_parent); + *dentry = dget(mounted->mnt_root); res = 1; } return res; @@ -488,17 +474,32 @@ static inline void follow_dotdot(struct { while(1) { struct vfsmount *parent; + struct dentry *old = *dentry; + + read_lock(¤t->fs->lock); if (*dentry == current->fs->root && - *mnt == current->fs->rootmnt) + *mnt == current->fs->rootmnt) { + read_unlock(¤t->fs->lock); break; + } + read_unlock(¤t->fs->lock); + spin_lock(&dcache_lock); if (*dentry != (*mnt)->mnt_root) { - *dentry = (*dentry)->d_parent; + *dentry = dget((*dentry)->d_parent); + spin_unlock(&dcache_lock); + dput(old); break; } - parent=(*mnt)->mnt_parent; - if (parent == *mnt) + parent = (*mnt)->mnt_parent; + if (parent == *mnt) { + spin_unlock(&dcache_lock); break; - *dentry=(*mnt)->mnt_mountpoint; + } + mntget(parent); + *dentry = dget((*mnt)->mnt_mountpoint); + spin_unlock(&dcache_lock); + dput(old); + mntput(*mnt); *mnt = parent; } follow_mount(mnt, dentry); @@ -515,14 +516,13 @@ struct path { * It _is_ time-critical. 
*/ static int do_lookup(struct nameidata *nd, struct qstr *name, - struct path *path, struct path *cached_path, - int flags) + struct path *path, int flags) { struct vfsmount *mnt = nd->mnt; - struct dentry *dentry = __d_lookup(nd->dentry, name); + struct dentry *dentry = d_lookup(nd->dentry, name); if (!dentry) - goto dcache_miss; + goto need_lookup; if (dentry->d_op && dentry->d_op->d_revalidate) goto need_revalidate; done: @@ -530,36 +530,21 @@ done: path->dentry = dentry; return 0; -dcache_miss: - unlock_nd(nd); - need_lookup: dentry = real_lookup(nd->dentry, name, LOOKUP_CONTINUE); if (IS_ERR(dentry)) goto fail; - mntget(mnt); -relock: - dput(cached_path->dentry); - mntput(cached_path->mnt); - cached_path->mnt = mnt; - cached_path->dentry = dentry; - lock_nd(nd); goto done; need_revalidate: - mntget(mnt); - dget_locked(dentry); - unlock_nd(nd); if (dentry->d_op->d_revalidate(dentry, flags)) - goto relock; + goto done; if (d_invalidate(dentry)) - goto relock; + goto done; dput(dentry); - mntput(mnt); goto need_lookup; fail: - lock_nd(nd); return PTR_ERR(dentry); } @@ -573,7 +558,7 @@ fail: */ int link_path_walk(const char * name, struct nameidata *nd) { - struct path next, pinned = {NULL, NULL}; + struct path next; struct inode *inode; int err; unsigned int lookup_flags = nd->flags; @@ -594,10 +579,8 @@ int link_path_walk(const char * name, st unsigned int c; err = exec_permission_lite(inode); - if (err == -EAGAIN) { - unlock_nd(nd); + if (err == -EAGAIN) { err = permission(inode, MAY_EXEC); - lock_nd(nd); } if (err) break; @@ -648,7 +631,7 @@ int link_path_walk(const char * name, st break; } /* This does the actual lookups.. */ - err = do_lookup(nd, &this, &next, &pinned, LOOKUP_CONTINUE); + err = do_lookup(nd, &this, &next, LOOKUP_CONTINUE); if (err) break; /* Check mountpoints.. 
*/ @@ -657,21 +640,16 @@ int link_path_walk(const char * name, st err = -ENOENT; inode = next.dentry->d_inode; if (!inode) - break; + goto out_dput; err = -ENOTDIR; if (!inode->i_op) - break; + goto out_dput; if (inode->i_op->follow_link) { - mntget(next.mnt); - dget_locked(next.dentry); - unlock_nd(nd); err = do_follow_link(next.dentry, nd); dput(next.dentry); - mntput(next.mnt); if (err) goto return_err; - lock_nd(nd); err = -ENOENT; inode = nd->dentry->d_inode; if (!inode) @@ -680,6 +658,7 @@ int link_path_walk(const char * name, st if (!inode->i_op) break; } else { + dput(nd->dentry); nd->mnt = next.mnt; nd->dentry = next.dentry; } @@ -711,24 +690,20 @@ last_component: if (err < 0) break; } - err = do_lookup(nd, &this, &next, &pinned, 0); + err = do_lookup(nd, &this, &next, 0); if (err) break; follow_mount(&next.mnt, &next.dentry); inode = next.dentry->d_inode; if ((lookup_flags & LOOKUP_FOLLOW) && inode && inode->i_op && inode->i_op->follow_link) { - mntget(next.mnt); - dget_locked(next.dentry); - unlock_nd(nd); err = do_follow_link(next.dentry, nd); dput(next.dentry); - mntput(next.mnt); if (err) goto return_err; inode = nd->dentry->d_inode; - lock_nd(nd); } else { + dput(nd->dentry); nd->mnt = next.mnt; nd->dentry = next.dentry; } @@ -751,23 +726,19 @@ lookup_parent: else if (this.len == 2 && this.name[1] == '.') nd->last_type = LAST_DOTDOT; return_base: - unlock_nd(nd); - dput(pinned.dentry); - mntput(pinned.mnt); return 0; +out_dput: + dput(next.dentry); + break; } - unlock_nd(nd); path_release(nd); return_err: - dput(pinned.dentry); - mntput(pinned.mnt); return err; } int path_walk(const char * name, struct nameidata *nd) { current->total_link_count = 0; - lock_nd(nd); return link_path_walk(name, nd); } @@ -855,28 +826,24 @@ int path_lookup(const char *name, unsign { nd->last_type = LAST_ROOT; /* if there are only slashes... 
*/ nd->flags = flags; + + read_lock(¤t->fs->lock); if (*name=='/') { - read_lock(¤t->fs->lock); if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) { nd->mnt = mntget(current->fs->altrootmnt); nd->dentry = dget(current->fs->altroot); read_unlock(¤t->fs->lock); if (__emul_lookup_dentry(name,nd)) return 0; - } else { - read_unlock(¤t->fs->lock); } - spin_lock(&dcache_lock); - nd->mnt = current->fs->rootmnt; - nd->dentry = current->fs->root; + nd->mnt = mntget(current->fs->rootmnt); + nd->dentry = dget(current->fs->root); } else{ - spin_lock(&dcache_lock); - nd->mnt = current->fs->pwdmnt; - nd->dentry = current->fs->pwd; + nd->mnt = mntget(current->fs->pwdmnt); + nd->dentry = dget(current->fs->pwd); } - nd->old_mnt = NULL; - nd->old_dentry = NULL; + read_unlock(¤t->fs->lock); current->total_link_count = 0; return link_path_walk(name, nd); } @@ -2117,7 +2084,6 @@ __vfs_follow_link(struct nameidata *nd, /* weird __emul_prefix() stuff did it */ goto out; } - lock_nd(nd); res = link_path_walk(link, nd); out: if (current->link_count || res || nd->last_type!=LAST_NORM) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/fs/namespace.c 90-mjb/fs/namespace.c --- 00-virgin/fs/namespace.c Fri Dec 13 23:18:10 2002 +++ 90-mjb/fs/namespace.c Wed Feb 5 22:22:56 2003 @@ -892,12 +892,10 @@ void set_fs_root(struct fs_struct *fs, s struct dentry *old_root; struct vfsmount *old_rootmnt; write_lock(&fs->lock); - spin_lock(&dcache_lock); old_root = fs->root; old_rootmnt = fs->rootmnt; fs->rootmnt = mntget(mnt); fs->root = dget(dentry); - spin_unlock(&dcache_lock); write_unlock(&fs->lock); if (old_root) { dput(old_root); @@ -916,12 +914,10 @@ void set_fs_pwd(struct fs_struct *fs, st struct vfsmount *old_pwdmnt; write_lock(&fs->lock); - spin_lock(&dcache_lock); old_pwd = fs->pwd; old_pwdmnt = fs->pwdmnt; fs->pwdmnt = mntget(mnt); fs->pwd = dget(dentry); - spin_unlock(&dcache_lock); write_unlock(&fs->lock); if (old_pwd) { diff -urpN -X /home/fletch/.diff.exclude 
00-virgin/fs/proc/proc_misc.c 90-mjb/fs/proc/proc_misc.c --- 00-virgin/fs/proc/proc_misc.c Fri Jan 17 09:18:30 2003 +++ 90-mjb/fs/proc/proc_misc.c Wed Feb 5 22:23:08 2003 @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -130,6 +131,40 @@ static int uptime_read_proc(char *page, return proc_calc_metrics(page, start, off, count, eof, len); } +struct vmalloc_info { + unsigned long used; + unsigned long largest_chunk; +}; + +static struct vmalloc_info get_vmalloc_info(void) +{ + unsigned long prev_end = VMALLOC_START; + struct vm_struct* vma; + struct vmalloc_info vmi; + vmi.used = 0; + + read_lock(&vmlist_lock); + + if(!vmlist) + vmi.largest_chunk = (VMALLOC_END-VMALLOC_START); + else + vmi.largest_chunk = 0; + + for (vma = vmlist; vma; vma = vma->next) { + unsigned long free_area_size = + (unsigned long)vma->addr - prev_end; + vmi.used += vma->size; + if (vmi.largest_chunk < free_area_size ) + vmi.largest_chunk = free_area_size; + prev_end = vma->size + (unsigned long)vma->addr; + } + if(VMALLOC_END-prev_end > vmi.largest_chunk) + vmi.largest_chunk = VMALLOC_END-prev_end; + + read_unlock(&vmlist_lock); + return vmi; +} + extern atomic_t vm_committed_space; static int meminfo_read_proc(char *page, char **start, off_t off, @@ -141,6 +176,8 @@ static int meminfo_read_proc(char *page, unsigned long inactive; unsigned long active; unsigned long free; + unsigned long vmtot; + struct vmalloc_info vmi; get_page_state(&ps); get_zone_counts(&active, &inactive, &free); @@ -153,6 +190,11 @@ static int meminfo_read_proc(char *page, si_swapinfo(&i); committed = atomic_read(&vm_committed_space); + vmtot = (VMALLOC_END-VMALLOC_START)>>10; + vmi = get_vmalloc_info(); + vmi.used >>= 10; + vmi.largest_chunk >>= 10; + /* * Tagged format, for easy grepping and expansion. 
*/ @@ -176,7 +218,10 @@ static int meminfo_read_proc(char *page, "Slab: %8lu kB\n" "Committed_AS: %8u kB\n" "PageTables: %8lu kB\n" - "ReverseMaps: %8lu\n", + "ReverseMaps: %8lu\n" + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n", K(i.totalram), K(i.freeram), K(i.bufferram), @@ -196,7 +241,10 @@ static int meminfo_read_proc(char *page, K(ps.nr_slab), K(committed), K(ps.nr_page_table_pages), - ps.nr_reverse_maps + ps.nr_reverse_maps, + vmtot, + vmi.used, + vmi.largest_chunk ); len += hugetlb_report_meminfo(page + len); @@ -254,6 +302,9 @@ static struct file_operations proc_vmsta .release = seq_release, }; +extern int schedstats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data); + #ifdef CONFIG_PROC_HARDWARE static int hardware_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -298,6 +349,71 @@ static struct file_operations proc_modul }; #endif +#ifdef CONFIG_NUMA +#define K(x) ((x) << (PAGE_SHIFT - 10)) +static int show_meminfo_numa (struct seq_file *m, void *v) +{ + int *d = v; + int nid = *d; + struct sysinfo i; + si_meminfo_node(&i, nid); + seq_printf(m, "\n" + "Node %d MemTotal: %8lu kB\n" + "Node %d MemFree: %8lu kB\n" + "Node %d MemUsed: %8lu kB\n" + "Node %d HighTotal: %8lu kB\n" + "Node %d HighFree: %8lu kB\n" + "Node %d LowTotal: %8lu kB\n" + "Node %d LowFree: %8lu kB\n", + nid, K(i.totalram), + nid, K(i.freeram), + nid, K(i.totalram-i.freeram), + nid, K(i.totalhigh), + nid, K(i.freehigh), + nid, K(i.totalram-i.totalhigh), + nid, K(i.freeram-i.freehigh)); + + return 0; +} +#undef K + +extern struct seq_operations meminfo_numa_op; +static int meminfo_numa_open(struct inode *inode, struct file *file) +{ + return seq_open(file,&meminfo_numa_op); +} + +static struct file_operations proc_meminfo_numa_operations = { + open: meminfo_numa_open, + read: seq_read, + llseek: seq_lseek, + release: seq_release, +}; + +static void *meminfo_numa_start(struct seq_file *m, 
loff_t *pos) +{ + return *pos < numnodes ? pos : NULL; +} + +static void *meminfo_numa_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return meminfo_numa_start(m, pos); +} + +static void meminfo_numa_stop(struct seq_file *m, void *v) +{ +} + +struct seq_operations meminfo_numa_op = { + .start = meminfo_numa_start, + .next = meminfo_numa_next, + .stop = meminfo_numa_stop, + .show = show_meminfo_numa, +}; + +#endif + extern struct seq_operations slabinfo_op; extern ssize_t slabinfo_write(struct file *, const char *, size_t, loff_t *); static int slabinfo_open(struct inode *inode, struct file *file) @@ -573,6 +689,7 @@ void __init proc_misc_init(void) {"locks", locks_read_proc}, {"iomem", memory_read_proc}, {"execdomains", execdomains_read_proc}, + {"schedstat", schedstats_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) @@ -594,6 +711,9 @@ void __init proc_misc_init(void) create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations); #ifdef CONFIG_MODULES create_seq_entry("modules", 0, &proc_modules_operations); +#endif +#ifdef CONFIG_NUMA + create_seq_entry("meminfo.numa",0,&proc_meminfo_numa_operations); #endif proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/bug.h 90-mjb/include/asm-i386/bug.h --- 00-virgin/include/asm-i386/bug.h Tue Jan 14 10:06:18 2003 +++ 90-mjb/include/asm-i386/bug.h Wed Feb 5 22:23:05 2003 @@ -10,6 +10,11 @@ * undefined" opcode for parsing in the trap handler. 
*/ +#ifdef CONFIG_X86_REMOTE_DEBUG +#define BUG() do { \ + asm ("int $0x3"); \ +} while (0) +#else #if 1 /* Set to zero for a slightly smaller kernel */ #define BUG() \ __asm__ __volatile__( "ud2\n" \ @@ -18,6 +23,7 @@ : : "i" (__LINE__), "i" (__FILE__)) #else #define BUG() __asm__ __volatile__("ud2\n") +#endif #endif #define PAGE_BUG(page) do { \ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/early_printk.h 90-mjb/include/asm-i386/early_printk.h --- 00-virgin/include/asm-i386/early_printk.h Wed Dec 31 16:00:00 1969 +++ 90-mjb/include/asm-i386/early_printk.h Wed Feb 5 22:22:58 2003 @@ -0,0 +1,8 @@ +#ifndef __EARLY_PRINTK_H_I386_ +#define __EARLY_PRINTK_H_i386_ + +#define VGABASE 0xB8000 +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/fixmap.h 90-mjb/include/asm-i386/fixmap.h --- 00-virgin/include/asm-i386/fixmap.h Mon Dec 23 23:01:56 2002 +++ 90-mjb/include/asm-i386/fixmap.h Thu Feb 6 19:49:42 2003 @@ -60,7 +60,7 @@ enum fixed_addresses { #ifdef CONFIG_X86_F00F_BUG FIX_F00F_IDT, /* Virtual mapping for IDT */ #endif -#ifdef CONFIG_X86_CYCLONE +#ifdef CONFIG_X86_SUMMIT FIX_CYCLONE_TIMER, /*cyclone timer register*/ #endif #ifdef CONFIG_HIGHMEM diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/ioctls.h 90-mjb/include/asm-i386/ioctls.h --- 00-virgin/include/asm-i386/ioctls.h Sun Nov 17 20:29:22 2002 +++ 90-mjb/include/asm-i386/ioctls.h Wed Feb 5 22:23:05 2003 @@ -68,6 +68,7 @@ #define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ #define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ #define FIOQSIZE 0x5460 +#define TIOCGDB 0x547F /* enable GDB stub mode on this tty */ /* Used for packet mode */ #define TIOCPKT_DATA 0 diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/mach-bigsmp/mach_apic.h 90-mjb/include/asm-i386/mach-bigsmp/mach_apic.h --- 00-virgin/include/asm-i386/mach-bigsmp/mach_apic.h Fri Jan 17 
09:18:31 2003 +++ 90-mjb/include/asm-i386/mach-bigsmp/mach_apic.h Wed Feb 5 22:23:03 2003 @@ -87,7 +87,8 @@ static inline int cpu_to_logical_apicid( return (int)cpu_2_logical_apicid[cpu]; } -static inline int mpc_apic_id(struct mpc_config_processor *m, int quad) +static inline int mpc_apic_id(struct mpc_config_processor *m, + struct mpc_config_translation *translation_record) { printk("Processor #%d %ld:%ld APIC version %d\n", m->mpc_apicid, diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/mach-default/mach_apic.h 90-mjb/include/asm-i386/mach-default/mach_apic.h --- 00-virgin/include/asm-i386/mach-default/mach_apic.h Fri Jan 17 09:18:31 2003 +++ 90-mjb/include/asm-i386/mach-default/mach_apic.h Wed Feb 5 22:23:03 2003 @@ -79,7 +79,8 @@ static inline unsigned long apicid_to_cp return (1ul << phys_apicid); } -static inline int mpc_apic_id(struct mpc_config_processor *m, int quad) +static inline int mpc_apic_id(struct mpc_config_processor *m, + struct mpc_config_translation *translation_record) { printk("Processor #%d %ld:%ld APIC version %d\n", m->mpc_apicid, diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/mach-numaq/mach_apic.h 90-mjb/include/asm-i386/mach-numaq/mach_apic.h --- 00-virgin/include/asm-i386/mach-numaq/mach_apic.h Fri Jan 17 09:18:31 2003 +++ 90-mjb/include/asm-i386/mach-numaq/mach_apic.h Wed Feb 5 22:23:03 2003 @@ -73,8 +73,10 @@ static inline unsigned long apicid_to_cp return ( (logical_apicid&0xf) << (4*apicid_to_node(logical_apicid)) ); } -static inline int mpc_apic_id(struct mpc_config_processor *m, int quad) +static inline int mpc_apic_id(struct mpc_config_processor *m, + struct mpc_config_translation *translation_record) { + int quad = translation_record->trans_quad; int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid); printk("Processor #%d %ld:%ld APIC version %d (quad %d, apic %d)\n", diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/mach-summit/mach_apic.h 
90-mjb/include/asm-i386/mach-summit/mach_apic.h --- 00-virgin/include/asm-i386/mach-summit/mach_apic.h Fri Jan 17 09:18:31 2003 +++ 90-mjb/include/asm-i386/mach-summit/mach_apic.h Thu Feb 6 19:49:41 2003 @@ -3,7 +3,7 @@ extern int x86_summit; -#define esr_disable (1) +#define esr_disable (x86_summit ? 1 : 0) #define no_balance_irq (0) #define XAPIC_DEST_CPUS_MASK 0x0Fu @@ -15,14 +15,14 @@ extern int x86_summit; #define APIC_DFR_VALUE (x86_summit ? APIC_DFR_CLUSTER : APIC_DFR_FLAT) #define TARGET_CPUS (x86_summit ? XAPIC_DEST_CPUS_MASK : cpu_online_map) -#define INT_DELIVERY_MODE dest_Fixed +#define INT_DELIVERY_MODE (x86_summit ? dest_Fixed : dest_LowestPrio) #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ #define APIC_BROADCAST_ID (x86_summit ? 0xFF : 0x0F) -#define check_apicid_used(bitmap, apicid) (0) +#define check_apicid_used(bitmap, apicid) (x86_summit ? 0 : (bitmap & (1 << apicid))) /* we don't use the phys_cpu_present_map to indicate apicid presence */ -#define check_apicid_present(bit) (1) +#define check_apicid_present(bit) (x86_summit ? 
1 : (phys_cpu_present_map & (1 << bit))) extern u8 bios_cpu_apicid[]; @@ -90,7 +90,8 @@ static inline unsigned long apicid_to_cp return (1ul << apicid); } -static inline int mpc_apic_id(struct mpc_config_processor *m, int quad) +static inline int mpc_apic_id(struct mpc_config_processor *m, + struct mpc_config_translation *translation_record) { printk("Processor #%d %ld:%ld APIC version %d\n", m->mpc_apicid, @@ -106,7 +107,10 @@ static inline void setup_portio_remap(vo static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) { - return (1); + if (x86_summit) + return (1); + else + return test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map); } #endif /* __ASM_MACH_APIC_H */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/mach-summit/mach_mpparse.h 90-mjb/include/asm-i386/mach-summit/mach_mpparse.h --- 00-virgin/include/asm-i386/mach-summit/mach_mpparse.h Fri Jan 17 09:18:31 2003 +++ 90-mjb/include/asm-i386/mach-summit/mach_mpparse.h Thu Feb 6 19:49:42 2003 @@ -1,6 +1,8 @@ #ifndef __ASM_MACH_MPPARSE_H #define __ASM_MACH_MPPARSE_H +extern int use_cyclone; + static inline void mpc_oem_bus_info(struct mpc_config_bus *m, char *name, struct mpc_config_translation *translation) { @@ -17,14 +19,18 @@ static inline void mps_oem_check(struct { if (!strncmp(oem, "IBM ENSW", 8) && (!strncmp(productid, "VIGIL SMP", 9) - || !strncmp(productid, "RUTHLESS SMP", 12))) + || !strncmp(productid, "RUTHLESS SMP", 12))){ x86_summit = 1; + use_cyclone = 1; /*enable cyclone-timer*/ + } } /* Hook from generic ACPI tables.c */ static inline void acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "SERVIGIL", 8)) + if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "SERVIGIL", 8)){ x86_summit = 1; + use_cyclone = 1; /*enable cyclone-timer*/ + } } #endif /* __ASM_MACH_MPPARSE_H */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/mmzone.h 
90-mjb/include/asm-i386/mmzone.h --- 00-virgin/include/asm-i386/mmzone.h Sun Nov 17 20:29:46 2002 +++ 90-mjb/include/asm-i386/mmzone.h Thu Feb 6 19:49:39 2003 @@ -12,6 +12,8 @@ #ifdef CONFIG_X86_NUMAQ #include +#elif CONFIG_X86_SUMMIT +#include #else #define pfn_to_nid(pfn) (0) #endif /* CONFIG_X86_NUMAQ */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/msr.h 90-mjb/include/asm-i386/msr.h --- 00-virgin/include/asm-i386/msr.h Mon Dec 23 23:01:56 2002 +++ 90-mjb/include/asm-i386/msr.h Thu Feb 6 19:49:47 2003 @@ -93,6 +93,90 @@ #define MSR_IA32_MC0_ADDR 0x402 #define MSR_IA32_MC0_MISC 0x403 +/* Pentium IV performance counter MSRs */ +#define MSR_P4_BPU_PERFCTR0 0x300 +#define MSR_P4_BPU_PERFCTR1 0x301 +#define MSR_P4_BPU_PERFCTR2 0x302 +#define MSR_P4_BPU_PERFCTR3 0x303 +#define MSR_P4_MS_PERFCTR0 0x304 +#define MSR_P4_MS_PERFCTR1 0x305 +#define MSR_P4_MS_PERFCTR2 0x306 +#define MSR_P4_MS_PERFCTR3 0x307 +#define MSR_P4_FLAME_PERFCTR0 0x308 +#define MSR_P4_FLAME_PERFCTR1 0x309 +#define MSR_P4_FLAME_PERFCTR2 0x30a +#define MSR_P4_FLAME_PERFCTR3 0x30b +#define MSR_P4_IQ_PERFCTR0 0x30c +#define MSR_P4_IQ_PERFCTR1 0x30d +#define MSR_P4_IQ_PERFCTR2 0x30e +#define MSR_P4_IQ_PERFCTR3 0x30f +#define MSR_P4_IQ_PERFCTR4 0x310 +#define MSR_P4_IQ_PERFCTR5 0x311 +#define MSR_P4_BPU_CCCR0 0x360 +#define MSR_P4_BPU_CCCR1 0x361 +#define MSR_P4_BPU_CCCR2 0x362 +#define MSR_P4_BPU_CCCR3 0x363 +#define MSR_P4_MS_CCCR0 0x364 +#define MSR_P4_MS_CCCR1 0x365 +#define MSR_P4_MS_CCCR2 0x366 +#define MSR_P4_MS_CCCR3 0x367 +#define MSR_P4_FLAME_CCCR0 0x368 +#define MSR_P4_FLAME_CCCR1 0x369 +#define MSR_P4_FLAME_CCCR2 0x36a +#define MSR_P4_FLAME_CCCR3 0x36b +#define MSR_P4_IQ_CCCR0 0x36c +#define MSR_P4_IQ_CCCR1 0x36d +#define MSR_P4_IQ_CCCR2 0x36e +#define MSR_P4_IQ_CCCR3 0x36f +#define MSR_P4_IQ_CCCR4 0x370 +#define MSR_P4_IQ_CCCR5 0x371 +#define MSR_P4_ALF_ESCR0 0x3ca +#define MSR_P4_ALF_ESCR1 0x3cb +#define MSR_P4_BPU_ESCR0 0x3b2 +#define MSR_P4_BPU_ESCR1 0x3b3 +#define 
MSR_P4_BSU_ESCR0 0x3a0 +#define MSR_P4_BSU_ESCR1 0x3a1 +#define MSR_P4_CRU_ESCR0 0x3b8 +#define MSR_P4_CRU_ESCR1 0x3b9 +#define MSR_P4_CRU_ESCR2 0x3cc +#define MSR_P4_CRU_ESCR3 0x3cd +#define MSR_P4_CRU_ESCR4 0x3e0 +#define MSR_P4_CRU_ESCR5 0x3e1 +#define MSR_P4_DAC_ESCR0 0x3a8 +#define MSR_P4_DAC_ESCR1 0x3a9 +#define MSR_P4_FIRM_ESCR0 0x3a4 +#define MSR_P4_FIRM_ESCR1 0x3a5 +#define MSR_P4_FLAME_ESCR0 0x3a6 +#define MSR_P4_FLAME_ESCR1 0x3a7 +#define MSR_P4_FSB_ESCR0 0x3a2 +#define MSR_P4_FSB_ESCR1 0x3a3 +#define MSR_P4_IQ_ESCR0 0x3ba +#define MSR_P4_IQ_ESCR1 0x3bb +#define MSR_P4_IS_ESCR0 0x3b4 +#define MSR_P4_IS_ESCR1 0x3b5 +#define MSR_P4_ITLB_ESCR0 0x3b6 +#define MSR_P4_ITLB_ESCR1 0x3b7 +#define MSR_P4_IX_ESCR0 0x3c8 +#define MSR_P4_IX_ESCR1 0x3c9 +#define MSR_P4_MOB_ESCR0 0x3aa +#define MSR_P4_MOB_ESCR1 0x3ab +#define MSR_P4_MS_ESCR0 0x3c0 +#define MSR_P4_MS_ESCR1 0x3c1 +#define MSR_P4_PMH_ESCR0 0x3ac +#define MSR_P4_PMH_ESCR1 0x3ad +#define MSR_P4_RAT_ESCR0 0x3bc +#define MSR_P4_RAT_ESCR1 0x3bd +#define MSR_P4_SAAT_ESCR0 0x3ae +#define MSR_P4_SAAT_ESCR1 0x3af +#define MSR_P4_SSU_ESCR0 0x3be +#define MSR_P4_SSU_ESCR1 0x3bf /* guess: not defined in manual */ +#define MSR_P4_TBPU_ESCR0 0x3c2 +#define MSR_P4_TBPU_ESCR1 0x3c3 +#define MSR_P4_TC_ESCR0 0x3c4 +#define MSR_P4_TC_ESCR1 0x3c5 +#define MSR_P4_U2L_ESCR0 0x3b0 +#define MSR_P4_U2L_ESCR1 0x3b1 + /* AMD Defined MSRs */ #define MSR_K6_EFER 0xC0000080 #define MSR_K6_STAR 0xC0000081 diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/numaq.h 90-mjb/include/asm-i386/numaq.h --- 00-virgin/include/asm-i386/numaq.h Thu Jan 9 19:16:11 2003 +++ 90-mjb/include/asm-i386/numaq.h Thu Feb 6 19:49:50 2003 @@ -36,10 +36,11 @@ #define MAX_ELEMENTS 256 #define PAGES_PER_ELEMENT (16777216/256) +extern int physnode_map[]; +#define pfn_to_nid(pfn) ({ physnode_map[(pfn) / PAGES_PER_ELEMENT]; }) #define pfn_to_pgdat(pfn) NODE_DATA(pfn_to_nid(pfn)) #define PHYSADDR_TO_NID(pa) pfn_to_nid(pa >> PAGE_SHIFT) #define 
MAX_NUMNODES 8 -extern int pfn_to_nid(unsigned long); extern void get_memcfg_numaq(void); #define get_memcfg_numa() get_memcfg_numaq() @@ -168,6 +169,10 @@ struct sys_cfg_data { struct eachquadmem eq[MAX_NUMNODES]; /* indexed by quad id */ }; +static inline unsigned long get_zholes_size(int nid) +{ + return 0; +} #endif /* CONFIG_X86_NUMAQ */ #endif /* NUMAQ_H */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/page.h 90-mjb/include/asm-i386/page.h --- 00-virgin/include/asm-i386/page.h Tue Jan 14 10:06:18 2003 +++ 90-mjb/include/asm-i386/page.h Wed Feb 5 22:23:00 2003 @@ -89,7 +89,16 @@ typedef struct { unsigned long pgprot; } * and CONFIG_HIGHMEM64G options in the kernel configuration. */ -#define __PAGE_OFFSET (0xC0000000) +#include +#ifdef CONFIG_05GB +#define __PAGE_OFFSET (0xE0000000) +#elif defined(CONFIG_1GB) +#define __PAGE_OFFSET (0xC0000000) +#elif defined(CONFIG_2GB) +#define __PAGE_OFFSET (0x80000000) +#elif defined(CONFIG_3GB) +#define __PAGE_OFFSET (0x40000000) +#endif /* * This much address space is reserved for vmalloc() and iomap() diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/param.h 90-mjb/include/asm-i386/param.h --- 00-virgin/include/asm-i386/param.h Sun Nov 17 20:29:26 2002 +++ 90-mjb/include/asm-i386/param.h Wed Feb 5 22:22:59 2003 @@ -2,10 +2,18 @@ #define _ASMi386_PARAM_H #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ -# define USER_HZ 100 /* .. some user interfaces are in "ticks" */ -# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ +#include + +#ifdef CONFIG_1000HZ +# define HZ 1000 /* Internal kernel timer frequency */ +#else +# define HZ 100 #endif + +#define USER_HZ 100 /* .. 
some user interfaces are in "ticks" */ +#define CLOCKS_PER_SEC (USER_HZ) /* like times() */ + +#endif /* __KERNEL__ */ #ifndef HZ #define HZ 100 diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/pgalloc.h 90-mjb/include/asm-i386/pgalloc.h --- 00-virgin/include/asm-i386/pgalloc.h Sun Nov 17 20:29:21 2002 +++ 90-mjb/include/asm-i386/pgalloc.h Thu Feb 6 19:49:49 2003 @@ -20,11 +20,11 @@ static inline void pmd_populate(struct m * Allocate and free page tables. */ -extern pgd_t *pgd_alloc(struct mm_struct *); -extern void pgd_free(pgd_t *pgd); +pgd_t *pgd_alloc(struct mm_struct *); +void pgd_free(pgd_t *pgd); -extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); -extern struct page *pte_alloc_one(struct mm_struct *, unsigned long); +pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); +struct page *pte_alloc_one(struct mm_struct *, unsigned long); static inline void pte_free_kernel(pte_t *pte) { diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/pgtable-3level.h 90-mjb/include/asm-i386/pgtable-3level.h --- 00-virgin/include/asm-i386/pgtable-3level.h Sun Nov 17 20:29:52 2002 +++ 90-mjb/include/asm-i386/pgtable-3level.h Thu Feb 6 19:49:49 2003 @@ -106,6 +106,4 @@ static inline pmd_t pfn_pmd(unsigned lon return __pmd(((unsigned long long)page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); } -extern struct kmem_cache_s *pae_pgd_cachep; - #endif /* _I386_PGTABLE_3LEVEL_H */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/pgtable.h 90-mjb/include/asm-i386/pgtable.h --- 00-virgin/include/asm-i386/pgtable.h Sun Dec 1 10:00:23 2002 +++ 90-mjb/include/asm-i386/pgtable.h Thu Feb 6 19:49:49 2003 @@ -41,21 +41,12 @@ extern unsigned long empty_zero_page[102 #ifndef __ASSEMBLY__ #if CONFIG_X86_PAE # include - -/* - * Need to initialise the X86 PAE caches - */ -extern void pgtable_cache_init(void); - #else # include +#endif -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } 
while (0) +void pgtable_cache_init(void); -#endif #endif #define __beep() asm("movb $0x3,%al; outb %al,$0x61") diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/processor.h 90-mjb/include/asm-i386/processor.h --- 00-virgin/include/asm-i386/processor.h Thu Jan 2 22:05:15 2003 +++ 90-mjb/include/asm-i386/processor.h Wed Feb 5 22:23:05 2003 @@ -279,7 +279,11 @@ extern unsigned int mca_pentium_flag; /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ +#ifdef CONFIG_05GB +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 16)) +#else #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) +#endif /* * Size of io_bitmap in longwords: 32 is ports 0-0x3ff. @@ -393,6 +397,9 @@ struct thread_struct { unsigned int saved_fs, saved_gs; /* IO permissions */ unsigned long *ts_io_bitmap; +#ifdef CONFIG_X86_REMOTE_DEBUG + struct pt_regs *kgdbregs; +#endif }; #define INIT_THREAD { \ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-i386/srat.h 90-mjb/include/asm-i386/srat.h --- 00-virgin/include/asm-i386/srat.h Wed Dec 31 16:00:00 1969 +++ 90-mjb/include/asm-i386/srat.h Thu Feb 6 19:49:39 2003 @@ -0,0 +1,68 @@ +/* + * Code taken from 64 bit discontigmem support. + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to + */ + +#ifndef SRAT_DISCTG_H +#define SRAT_DISCTG_H + +extern int pa_to_nid(u64); +extern int pfn_to_nid(unsigned long); +extern void get_memcfg_from_srat(void); +extern unsigned long get_zholes_size(int); + +#define PHYSADDR_TO_NID(pa) pa_to_nid(pa) +#define pfn_to_pgdat(pfn) NODE_DATA(pfn_to_nid(pfn)) +#define get_memcfg_numa() get_memcfg_from_srat() + +#define MAX_NUMNODES 8 +#define MAX_CLUMPS_PER_NODE 4 +#define MAXCLUMPS (MAX_CLUMPS_PER_NODE * MAX_NUMNODES) + +/* + * cpu -> pxm_domain structure + */ +struct node_cpuid_s{ + u8 phys_id; /* phys apic ID (no EID for IA32) */ + u8 pxm; // proximity domain of cpu + u8 nid; +}; + +extern struct node_cpuid_s node_cpuid[]; + +#define _cpu_to_node(cpu) (node_cpuid[cpu].nid) + +/* + * memory -> pxm_domain structure + */ +struct node_memory_chunk_s { + u64 start_paddr; + u64 end_paddr; + u64 size; + u8 pxm; // proximity domain of node + u8 nid; // which cnode contains this chunk? 
+ u8 bank; // which mem bank on this node +}; +extern struct node_memory_chunk_s node_memory_chunk[]; + +#endif /* SRAT_DISCTG_H */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/asm-x86_64/early_printk.h 90-mjb/include/asm-x86_64/early_printk.h --- 00-virgin/include/asm-x86_64/early_printk.h Wed Dec 31 16:00:00 1969 +++ 90-mjb/include/asm-x86_64/early_printk.h Wed Feb 5 22:22:58 2003 @@ -0,0 +1,8 @@ +#ifndef __EARLY_PRINTK_H_X86_64_ +#define __EARLY_PRINTK_H_X86_64_ + +#define VGABASE 0xffffffff800b8000UL +#define SERIAL_BASES { 0x3f8, 0x2f8 } +#define SERIAL_BASES_LEN 2 + +#endif diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/acpi.h 90-mjb/include/linux/acpi.h --- 00-virgin/include/linux/acpi.h Mon Jan 13 21:09:14 2003 +++ 90-mjb/include/linux/acpi.h Thu Feb 6 19:49:39 2003 @@ -82,7 +82,7 @@ typedef struct { struct acpi_table_rsdt { struct acpi_table_header header; - u32 entry[1]; + u32 entry[8]; } __attribute__ ((packed)); /* Extended System Description Table (XSDT) */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/dcache.h 90-mjb/include/linux/dcache.h --- 00-virgin/include/linux/dcache.h Tue Jan 14 10:06:18 2003 +++ 90-mjb/include/linux/dcache.h Wed Feb 5 22:22:56 2003 @@ -7,6 +7,7 @@ #include #include #include +#include #include struct vfsmount; @@ -72,11 +73,13 @@ struct dcookie_struct; struct dentry { atomic_t d_count; + unsigned long d_vfs_flags; /* moved here to be on same cacheline */ + spinlock_t d_lock; /* per dentry lock */ unsigned int d_flags; struct inode * d_inode; /* Where the name belongs to - NULL is negative */ struct dentry * d_parent; /* parent directory */ struct list_head d_hash; /* lookup hash list */ - struct list_head d_lru; /* d_count = 0 LRU list */ + struct list_head d_lru; /* LRU list */ struct list_head d_child; /* child of parent list */ struct list_head d_subdirs; /* our children */ struct list_head d_alias; /* inode alias list */ @@ -85,8 +88,8 @@ struct dentry { unsigned long
d_time; /* used by d_revalidate */ struct dentry_operations *d_op; struct super_block * d_sb; /* The root of the dentry tree */ - unsigned long d_vfs_flags; void * d_fsdata; /* fs-specific data */ + struct rcu_head d_rcu; struct dcookie_struct * d_cookie; /* cookie, if any */ unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */ } ____cacheline_aligned; @@ -139,6 +142,7 @@ d_iput: no no yes */ #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ +#define DCACHE_UNHASHED 0x0010 extern spinlock_t dcache_lock; extern rwlock_t dparent_lock; @@ -162,7 +166,8 @@ extern rwlock_t dparent_lock; static __inline__ void __d_drop(struct dentry * dentry) { - list_del_init(&dentry->d_hash); + dentry->d_vfs_flags |= DCACHE_UNHASHED; + list_del_rcu(&dentry->d_hash); } static __inline__ void d_drop(struct dentry * dentry) @@ -254,9 +259,8 @@ extern char * d_path(struct dentry *, st static __inline__ struct dentry * dget(struct dentry *dentry) { if (dentry) { - if (!atomic_read(&dentry->d_count)) - BUG(); atomic_inc(&dentry->d_count); + dentry->d_vfs_flags |= DCACHE_REFERENCED; } return dentry; } @@ -272,7 +276,7 @@ extern struct dentry * dget_locked(struc static __inline__ int d_unhashed(struct dentry *dentry) { - return list_empty(&dentry->d_hash); + return (dentry->d_vfs_flags & DCACHE_UNHASHED); } extern void dput(struct dentry *); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/early_printk.h 90-mjb/include/linux/early_printk.h --- 00-virgin/include/linux/early_printk.h Wed Dec 31 16:00:00 1969 +++ 90-mjb/include/linux/early_printk.h Wed Feb 5 22:22:58 2003 @@ -0,0 +1,47 @@ +#ifndef __EARLY_PRINTK_H_ +#define __EARLY_PRINTK_H_ + +#ifdef CONFIG_EARLY_PRINTK +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +/* Simple serial port output */ + +#define DEFAULT_BAUD 57600 +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ 
+#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + + +void early_printk(const char *fmt, ...); +int __init setup_early_printk(char *opt); + +#else + +#define early_printk(...) do {} while(0) +#define setup_early_printk(X) do {} while(0) + +#endif + +#endif diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/frlock.h 90-mjb/include/linux/frlock.h --- 00-virgin/include/linux/frlock.h Wed Dec 31 16:00:00 1969 +++ 90-mjb/include/linux/frlock.h Thu Feb 6 19:49:44 2003 @@ -0,0 +1,100 @@ +#ifndef __LINUX_FRLOCK_H +#define __LINUX_FRLOCK_H + +/* + * Fast read-write spinlocks. + * + * Fast reader/writer locks without starving writers. This type of + * lock is for data where the reader wants a consistent set of information + * and is willing to retry if the information changes. Readers never + * block but they may have to retry if a writer is in + * progress. Writers do not wait for readers. + * + * Generalization on sequence variables used for gettimeofday on x86-64 + * by Andrea Arcangeli + * + * This is not as cache friendly as brlock. Also, this will not work + * for data that contains pointers, because any writer could + * invalidate a pointer that a reader was following. + * + * + * Expected reader usage: + * do { + * seq = fr_read_begin(); + * ... + * } while (seq != fr_read_end()); + * + * On non-SMP the spin locks disappear but the writer still needs + * to increment the sequence variables because an interrupt routine could + * change the state of the data.
+ */ + +#include +#include + +typedef struct { + spinlock_t lock; + unsigned pre_sequence; + unsigned post_sequence; +} frlock_t; + +#define FR_LOCK_UNLOCKED { SPIN_LOCK_UNLOCKED, 0, 0 } +#define frlock_init(x) do { *(x) = FR_LOCK_UNLOCKED; } while (0) + +static inline void fr_write_lock(frlock_t *rw) +{ + spin_lock(&rw->lock); + rw->pre_sequence++; + wmb(); +} + +static inline void fr_write_unlock(frlock_t *rw) +{ + wmb(); + rw->post_sequence++; + spin_unlock(&rw->lock); +} + +static inline int fr_write_trylock(frlock_t *rw) +{ + int ret = spin_trylock(&rw->lock); + + if (ret) { + ++rw->pre_sequence; + wmb(); + } + return ret; +} + +static inline unsigned fr_read_begin(frlock_t *rw) +{ + unsigned ret = rw->post_sequence; + rmb(); + return ret; + +} + +static inline unsigned fr_read_end(frlock_t *rw) +{ + rmb(); + return rw->pre_sequence; +} + +/* + * Possible sw/hw IRQ protected versions of the interfaces. + */ +#define fr_write_lock_irqsave(lock, flags) \ + do { local_irq_save(flags); fr_write_lock(lock); } while (0) +#define fr_write_lock_irq(lock) \ + do { local_irq_disable(); fr_write_lock(lock); } while (0) +#define fr_write_lock_bh(lock) \ + do { local_bh_disable(); fr_write_lock(lock); } while (0) + +#define fr_write_unlock_irqrestore(lock, flags) \ + do { fr_write_unlock(lock); local_irq_restore(flags); } while(0) +#define fr_write_unlock_irq(lock) \ + do { fr_write_unlock(lock); local_irq_enable(); } while(0) +#define fr_write_unlock_bh(lock) \ + do { fr_write_unlock(lock); local_bh_enable(); } while(0) + +#endif /* __LINUX_FRLOCK_H */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/gdb.h 90-mjb/include/linux/gdb.h --- 00-virgin/include/linux/gdb.h Wed Dec 31 16:00:00 1969 +++ 90-mjb/include/linux/gdb.h Wed Feb 5 22:23:05 2003 @@ -0,0 +1,67 @@ +#ifndef _GDB_H_ +#define _GDB_H_ + +/* + * Copyright (C) 2001 Amit S. 
Kale + */ + +/* gdb locks */ +#define KGDB_MAX_NO_CPUS NR_CPUS + +extern int gdb_enter; /* 1 = enter debugger on boot */ +extern int gdb_ttyS; +extern int gdb_baud; +extern int gdb_initialized; + +extern int gdb_hook(void); +extern void breakpoint(void); + +typedef int gdb_debug_hook(int trapno, + int signo, + int err_code, + struct pt_regs *regs); +extern gdb_debug_hook *linux_debug_hook; + +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +extern spinlock_t kgdb_nmispinlock; +#else +extern unsigned kgdb_spinlock; +extern unsigned kgdb_nmispinlock; +#endif + +extern volatile int kgdb_memerr_expected; + +struct console; +void gdb_console_write(struct console *co, const char *s, + unsigned count); +void gdb_console_init(void); + +extern volatile int procindebug[KGDB_MAX_NO_CPUS]; + +#define KGDB_ASSERT(message, condition) do { \ + if (!(condition)) { \ + printk("kgdb assertion failed: %s\n", message); \ + asm ("int $0x3"); \ + } \ +} while (0) + +#ifdef CONFIG_KERNEL_ASSERTS +#define KERNEL_ASSERT(message, condition) KGDB_ASSERT(message, condition) +#else +#define KERNEL_ASSERT(message, condition) +#endif + +#define KA_VALID_ERRNO(errno) ((errno) > 0 && (errno) <= EMEDIUMTYPE) + +#define KA_VALID_PTR_ERR(ptr) KA_VALID_ERRNO(-PTR_ERR(ptr)) + +#define KA_VALID_KPTR(ptr) (!(ptr) || \ + ((void *)(ptr) >= (void *)PAGE_OFFSET && \ + (void *)(ptr) < ERR_PTR(-EMEDIUMTYPE))) + +#define KA_VALID_PTRORERR(errptr) (KA_VALID_KPTR(errptr) || KA_VALID_PTR_ERR(errptr)) + +#define KA_HELD_GKL() (current->lock_depth >= 0) + +#endif /* _GDB_H_ */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/namei.h 90-mjb/include/linux/namei.h --- 00-virgin/include/linux/namei.h Sun Nov 17 20:29:30 2002 +++ 90-mjb/include/linux/namei.h Wed Feb 5 22:22:56 2003 @@ -11,8 +11,6 @@ struct nameidata { struct qstr last; unsigned int flags; int last_type; - struct dentry *old_dentry; - struct vfsmount *old_mnt; }; /* diff -urpN -X /home/fletch/.diff.exclude 
00-virgin/include/linux/oprofile.h 90-mjb/include/linux/oprofile.h --- 00-virgin/include/linux/oprofile.h Mon Dec 23 23:01:57 2002 +++ 90-mjb/include/linux/oprofile.h Thu Feb 6 19:49:47 2003 @@ -21,12 +21,22 @@ struct super_block; struct dentry; struct file_operations; +/* This is duplicated from user-space so + * must be kept in sync :( + */ enum oprofile_cpu { OPROFILE_CPU_PPRO, OPROFILE_CPU_PII, OPROFILE_CPU_PIII, OPROFILE_CPU_ATHLON, - OPROFILE_CPU_TIMER + OPROFILE_CPU_TIMER, + OPROFILE_UNUSED1, /* 2.4's RTC mode */ + OPROFILE_CPU_P4, + OPROFILE_CPU_IA64, + OPROFILE_CPU_IA64_1, + OPROFILE_CPU_IA64_2, + OPROFILE_CPU_HAMMER, + OPROFILE_CPU_P4_HT2 }; /* Operations structure to be filled in */ diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/sched.h 90-mjb/include/linux/sched.h --- 00-virgin/include/linux/sched.h Fri Jan 17 09:18:32 2003 +++ 90-mjb/include/linux/sched.h Wed Feb 5 22:23:05 2003 @@ -166,7 +166,9 @@ extern unsigned long cache_decay_ticks; #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); -asmlinkage void schedule(void); +asmlinkage void do_schedule(void); +asmlinkage void kern_schedule(void); +asmlinkage void kern_do_schedule(struct pt_regs); struct namespace; @@ -648,6 +650,12 @@ static inline int thread_group_empty(tas (thread_group_leader(p) && !thread_group_empty(p)) extern void unhash_process(struct task_struct *p); + +#ifdef CONFIG_KGDB_THREAD +#define schedule() kern_schedule() +#else +#define schedule() do_schedule() +#endif /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). 
Nests inside tasklist_lock */ static inline void task_lock(struct task_struct *p) diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/sysctl.h 90-mjb/include/linux/sysctl.h --- 00-virgin/include/linux/sysctl.h Mon Dec 23 23:01:57 2002 +++ 90-mjb/include/linux/sysctl.h Wed Feb 5 22:23:08 2003 @@ -66,7 +66,8 @@ enum CTL_DEV=7, /* Devices */ CTL_BUS=8, /* Busses */ CTL_ABI=9, /* Binary emulation */ - CTL_CPU=10 /* CPU stuff (speed scaling, etc) */ + CTL_CPU=10, /* CPU stuff (speed scaling, etc) */ + CTL_SCHED=11, /* scheduler tunables */ }; /* CTL_BUS names: */ @@ -157,6 +158,20 @@ enum VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */ }; +/* Tunable scheduler parameters in /proc/sys/sched/ */ +enum { + SCHED_MIN_TIMESLICE=1, /* minimum process timeslice */ + SCHED_MAX_TIMESLICE=2, /* maximum process timeslice */ + SCHED_CHILD_PENALTY=3, /* penalty on fork to child */ + SCHED_PARENT_PENALTY=4, /* penalty on fork to parent */ + SCHED_EXIT_WEIGHT=5, /* penalty to parent of CPU hog child */ + SCHED_PRIO_BONUS_RATIO=6, /* percent of max prio given as bonus */ + SCHED_INTERACTIVE_DELTA=7, /* delta used to scale interactivity */ + SCHED_MAX_SLEEP_AVG=8, /* maximum sleep avg attainable */ + SCHED_STARVATION_LIMIT=9, /* no re-active if expired is starved */ + SCHED_IDLE_NODE_REBALANCE_RATIO=10, /* how often to global balance */ + SCHED_BUSY_NODE_REBALANCE_RATIO=11, /* how often to global balance */ +}; /* CTL_NET names: */ enum diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/time.h 90-mjb/include/linux/time.h --- 00-virgin/include/linux/time.h Fri Dec 13 23:18:14 2002 +++ 90-mjb/include/linux/time.h Thu Feb 6 19:49:44 2003 @@ -25,6 +25,7 @@ struct timezone { #ifdef __KERNEL__ #include +#include /* * Change timeval to jiffies, trying to avoid the @@ -120,7 +121,7 @@ mktime (unsigned int year, unsigned int } extern struct timespec xtime; -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; static inline unsigned long 
get_seconds(void) { diff -urpN -X /home/fletch/.diff.exclude 00-virgin/include/linux/timex.h 90-mjb/include/linux/timex.h --- 00-virgin/include/linux/timex.h Sun Nov 17 20:29:21 2002 +++ 90-mjb/include/linux/timex.h Wed Feb 5 22:22:59 2003 @@ -76,7 +76,7 @@ #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 #else -# error You lose. +# error Please use a HZ value which is between 12 and 1536 #endif /* diff -urpN -X /home/fletch/.diff.exclude 00-virgin/init/main.c 90-mjb/init/main.c --- 00-virgin/init/main.c Fri Jan 17 09:18:32 2003 +++ 90-mjb/init/main.c Wed Feb 5 22:23:05 2003 @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,10 @@ #include #endif +#ifdef CONFIG_X86_REMOTE_DEBUG +#include +#endif + /* * Versions of gcc older than that listed below may actually compile * and link okay, but the end product can have subtle run time bugs. @@ -374,6 +379,7 @@ asmlinkage void __init start_kernel(void */ lock_kernel(); printk(linux_banner); + setup_early_printk(&command_line); setup_arch(&command_line); setup_per_cpu_areas(); @@ -444,6 +450,12 @@ asmlinkage void __init start_kernel(void * make syscalls (and thus be locked). 
*/ init_idle(current, smp_processor_id()); + +#ifdef CONFIG_X86_REMOTE_DEBUG + if (gdb_enter) { + gdb_hook(); /* right at boot time */ + } +#endif /* Do the rest non-__init'ed, we're now alive */ rest_init(); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/kernel/Makefile 90-mjb/kernel/Makefile --- 00-virgin/kernel/Makefile Thu Jan 9 19:16:15 2003 +++ 90-mjb/kernel/Makefile Wed Feb 5 22:22:58 2003 @@ -22,6 +22,7 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o obj-$(CONFIG_COMPAT) += compat.o +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff -urpN -X /home/fletch/.diff.exclude 00-virgin/kernel/early_printk.c 90-mjb/kernel/early_printk.c --- 00-virgin/kernel/early_printk.c Wed Dec 31 16:00:00 1969 +++ 90-mjb/kernel/early_printk.c Wed Feb 5 22:22:58 2003 @@ -0,0 +1,209 @@ +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +static int current_ypos = 1, current_xpos = 0; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= MAX_YPOS) { + /* scroll 1 line up */ + for(k = 1, j = 0; k < MAX_YPOS; k++, j++) { + for(i = 0; i < MAX_XPOS; i++) { + writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), + VGABASE + 2*(MAX_XPOS*j + i)); + } + } + for(i = 0; i < MAX_XPOS; i++) { + writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); + } + current_ypos = MAX_YPOS-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(MAX_XPOS*current_ypos + current_xpos++)); + if (current_xpos >= MAX_XPOS) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + 
.flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ + +int early_serial_base; /* ttyS0 */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + rep_nop(); + outb(ch, early_serial_base + TXR); + return timeout ? 0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + early_serial_putc(*s); + if (*s == '\n') + early_serial_putc('\r'); + s++; + } +} + +static __init void early_serial_init(char *opt) +{ + unsigned char c; + unsigned divisor, baud = DEFAULT_BAUD; + static int bases[] = SERIAL_BASES; + char *s, *e; + + early_serial_base = bases[0]; + + if (*opt == ',') + ++opt; + + s = strsep(&opt, ","); + if (s != NULL) { + unsigned port; + if (!strncmp(s,"0x",2)) + early_serial_base = simple_strtoul(s, &e, 16); + else { + if (!strncmp(s,"ttyS",4)) + s+=4; + port = simple_strtoul(s, &e, 10); + if (port > (SERIAL_BASES_LEN-1) || s == e) + port = 0; + early_serial_base = bases[port]; + } + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + s = strsep(&opt, ","); + if (s != NULL) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Direct interface for emergencies */ +struct console *early_console = &early_vga_console; +static
int early_console_initialized = 0; + +void early_printk(const char *fmt, ...) +{ + char buf[512]; + int n; + va_list ap; + va_start(ap,fmt); + n = vsnprintf(buf,512,fmt,ap); + early_console->write(early_console,buf,n); + va_end(ap); +} + +static int keep_early; + +int __init setup_early_printk(char *opt) +{ + char *space, *s; + char buf[256]; + + s = strstr(opt, "earlyprintk="); + if (s == NULL) + return -1; + opt = s+12; + + if (early_console_initialized) + return -1; + + strncpy(buf,opt,256); + buf[255] = 0; + space = strchr(buf, ' '); + if (space) + *space = 0; + + if (strstr(buf,"keep")) + keep_early = 1; + + if (!strncmp(buf, "serial", 6)) { + early_serial_init(buf + 6); + early_console = &early_serial_console; + } else if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf); + early_console = &early_serial_console; + } else if (!strncmp(buf, "vga", 3)) { + early_console = &early_vga_console; + } else { + early_console = NULL; + return -1; + } + early_console_initialized = 1; + register_console(early_console); + early_printk( "early printk console registered\n" ); + return 0; +} + +void __init disable_early_printk(void) +{ + if (!early_console_initialized || !early_console) + return; + if (!keep_early) { + printk("disabling early console...\n"); + unregister_console(early_console); + early_console_initialized = 0; + } else { + printk("keeping early console.\n"); + } +} + +/* syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. 
*/ +__setup("earlyprintk=", setup_early_printk); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/kernel/ksyms.c 90-mjb/kernel/ksyms.c --- 00-virgin/kernel/ksyms.c Tue Jan 14 10:06:19 2003 +++ 90-mjb/kernel/ksyms.c Wed Feb 5 22:23:05 2003 @@ -469,7 +469,10 @@ EXPORT_SYMBOL(sleep_on); EXPORT_SYMBOL(sleep_on_timeout); EXPORT_SYMBOL(interruptible_sleep_on); EXPORT_SYMBOL(interruptible_sleep_on_timeout); -EXPORT_SYMBOL(schedule); +EXPORT_SYMBOL(do_schedule); +#ifdef CONFIG_KGDB_THREAD +EXPORT_SYMBOL(kern_schedule); +#endif #ifdef CONFIG_PREEMPT EXPORT_SYMBOL(preempt_schedule); #endif diff -urpN -X /home/fletch/.diff.exclude 00-virgin/kernel/sched.c 90-mjb/kernel/sched.c --- 00-virgin/kernel/sched.c Fri Jan 17 09:18:32 2003 +++ 90-mjb/kernel/sched.c Wed Feb 5 22:23:08 2003 @@ -33,6 +33,12 @@ #include #include +#ifdef CONFIG_NUMA +#define __cpu_to_node_mask(cpu) __node_to_cpu_mask(__cpu_to_node(cpu)) +#else +#define __cpu_to_node_mask(cpu) (cpu_online_map) +#endif + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -57,16 +63,30 @@ * Minimum timeslice is 10 msecs, default timeslice is 150 msecs, * maximum timeslice is 300 msecs. Timeslices get refilled after * they expire. 
+ * + * They are configurable via /proc/sys/sched */ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (300 * HZ / 1000) -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (2*HZ) -#define STARVATION_LIMIT (2*HZ) + +int min_timeslice = (10 * HZ) / 1000; +int max_timeslice = (300 * HZ) / 1000; +int child_penalty = 95; +int parent_penalty = 100; +int exit_weight = 3; +int prio_bonus_ratio = 25; +int interactive_delta = 2; +int max_sleep_avg = 2 * HZ; +int starvation_limit = 2 * HZ; + +#define MIN_TIMESLICE (min_timeslice) +#define MAX_TIMESLICE (max_timeslice) +#define CHILD_PENALTY (child_penalty) +#define PARENT_PENALTY (parent_penalty) +#define EXIT_WEIGHT (exit_weight) +#define PRIO_BONUS_RATIO (prio_bonus_ratio) +#define INTERACTIVE_DELTA (interactive_delta) +#define MAX_SLEEP_AVG (max_sleep_avg) +#define STARVATION_LIMIT (starvation_limit) + #define NODE_THRESHOLD 125 /* @@ -153,10 +173,9 @@ struct runqueue { nr_uninterruptible; task_t *curr, *idle; prio_array_t *active, *expired, arrays[2]; - int prev_nr_running[NR_CPUS]; + int prev_cpu_load[NR_CPUS]; #ifdef CONFIG_NUMA atomic_t *node_nr_running; - unsigned int nr_balanced; int prev_node_load[MAX_NUMNODES]; #endif task_t *migration_thread; @@ -224,6 +243,83 @@ __init void node_nr_running_init(void) #endif /* CONFIG_NUMA */ + +struct schedstat { + /* sys_sched_yield stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule stats */ + unsigned long sched_noswitch; + unsigned long sched_switch; + unsigned long sched_cnt; + + /* load_balance stats */ + unsigned long lb_imbalance; + unsigned long lb_idle; + unsigned long lb_resched; + unsigned long lb_cnt; + unsigned long lb_nobusy; +} ____cacheline_aligned; + +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that 
tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 1 + +struct schedstat schedstats[NR_CPUS]; + +/* + * This could conceivably exceed a page's worth of output on machines with + * large number of cpus, where large == about 4096/100 or 40ish. Start + * worrying when we pass 32, probably. Then this has to stop being a + * "simple" entry in proc/proc_misc.c and needs to be an actual seq_file. + */ +int schedstats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct schedstat sums; + int i, len; + + memset(&sums, 0, sizeof(sums)); + len = sprintf(page, "version %d\n", SCHEDSTAT_VERSION); + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) continue; + sums.yld_exp_empty += schedstats[i].yld_exp_empty; + sums.yld_act_empty += schedstats[i].yld_act_empty; + sums.yld_both_empty += schedstats[i].yld_both_empty; + sums.yld_cnt += schedstats[i].yld_cnt; + sums.sched_noswitch += schedstats[i].sched_noswitch; + sums.sched_switch += schedstats[i].sched_switch; + sums.sched_cnt += schedstats[i].sched_cnt; + sums.lb_idle += schedstats[i].lb_idle; + sums.lb_resched += schedstats[i].lb_resched; + sums.lb_cnt += schedstats[i].lb_cnt; + sums.lb_imbalance += schedstats[i].lb_imbalance; + sums.lb_nobusy += schedstats[i].lb_nobusy; + len += sprintf(page + len, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + i, schedstats[i].yld_both_empty, + schedstats[i].yld_act_empty, schedstats[i].yld_exp_empty, + schedstats[i].yld_cnt, schedstats[i].sched_noswitch, + schedstats[i].sched_switch, schedstats[i].sched_cnt, + schedstats[i].lb_idle, schedstats[i].lb_resched, + schedstats[i].lb_cnt, schedstats[i].lb_imbalance, + schedstats[i].lb_nobusy); + } + len += sprintf(page + len, + "totals %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + sums.yld_both_empty, sums.yld_act_empty, sums.yld_exp_empty, + sums.yld_cnt, sums.sched_noswitch, sums.sched_switch, + sums.sched_cnt, sums.lb_idle, sums.lb_resched, sums.lb_cnt,
sums.lb_imbalance, sums.lb_nobusy); + + return len; +} + /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without @@ -583,7 +679,6 @@ static inline task_t * context_switch(ta return prev; } - /* * nr_running, nr_uninterruptible and nr_context_switches: * @@ -765,31 +860,11 @@ static int find_busiest_node(int this_no return node; } -static inline unsigned long cpus_to_balance(int this_cpu, runqueue_t *this_rq) -{ - int this_node = __cpu_to_node(this_cpu); - /* - * Avoid rebalancing between nodes too often. - * We rebalance globally once every NODE_BALANCE_RATE load balances. - */ - if (++(this_rq->nr_balanced) == NODE_BALANCE_RATE) { - int node = find_busiest_node(this_node); - this_rq->nr_balanced = 0; - if (node >= 0) - return (__node_to_cpu_mask(node) | (1UL << this_cpu)); - } - return __node_to_cpu_mask(this_node); -} - -#else /* !CONFIG_NUMA */ - -static inline unsigned long cpus_to_balance(int this_cpu, runqueue_t *this_rq) -{ - return cpu_online_map; -} - #endif /* CONFIG_NUMA */ +int idle_node_rebalance_ratio = 10; +int busy_node_rebalance_ratio = 100; + #if CONFIG_SMP /* @@ -807,10 +882,10 @@ static inline unsigned int double_lock_b spin_lock(&busiest->lock); spin_lock(&this_rq->lock); /* Need to recalculate nr_running */ - if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu])) nr_running = this_rq->nr_running; else - nr_running = this_rq->prev_nr_running[this_cpu]; + nr_running = this_rq->prev_cpu_load[this_cpu]; } else spin_lock(&busiest->lock); } @@ -847,10 +922,10 @@ static inline runqueue_t *find_busiest_q * that case we are less picky about moving a task across CPUs and * take what can be taken. 
*/ - if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu])) nr_running = this_rq->nr_running; else - nr_running = this_rq->prev_nr_running[this_cpu]; + nr_running = this_rq->prev_cpu_load[this_cpu]; busiest = NULL; max_load = 1; @@ -859,11 +934,11 @@ static inline runqueue_t *find_busiest_q continue; rq_src = cpu_rq(i); - if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i])) + if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i])) load = rq_src->nr_running; else - load = this_rq->prev_nr_running[i]; - this_rq->prev_nr_running[i] = rq_src->nr_running; + load = this_rq->prev_cpu_load[i]; + this_rq->prev_cpu_load[i] = rq_src->nr_running; if ((load > max_load) && (rq_src != this_rq)) { busiest = rq_src; @@ -922,7 +997,7 @@ static inline void pull_task(runqueue_t * We call this with the current runqueue locked, * irqs disabled. */ -static void load_balance(runqueue_t *this_rq, int idle) +static void load_balance(runqueue_t *this_rq, int idle, unsigned long cpumask) { int imbalance, idx, this_cpu = smp_processor_id(); runqueue_t *busiest; @@ -930,11 +1005,16 @@ static void load_balance(runqueue_t *thi struct list_head *head, *curr; task_t *tmp; - busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, - cpus_to_balance(this_cpu, this_rq)); - if (!busiest) + schedstats[this_cpu].lb_cnt++; + if (idle) + schedstats[this_cpu].lb_idle++; + busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask); + if (!busiest) { + schedstats[this_cpu].lb_nobusy++; goto out; + } + schedstats[this_cpu].lb_imbalance += imbalance; /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to @@ -1006,21 +1086,76 @@ out: * frequency and balancing agressivity depends on whether the CPU is * idle or not. * - * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. 
(or on + * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on * systems with HZ=100, every 10 msecs.) + * + * On NUMA, do a node-rebalance every 400 msecs. */ -#define BUSY_REBALANCE_TICK (HZ/4 ?: 1) #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) +#define BUSY_REBALANCE_TICK (HZ/5 ?: 1) + +#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * idle_node_rebalance_ratio) +#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * busy_node_rebalance_ratio) -static inline void idle_tick(runqueue_t *rq) +#if CONFIG_NUMA +static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) { - if (jiffies % IDLE_REBALANCE_TICK) - return; - spin_lock(&rq->lock); - load_balance(rq, 1); - spin_unlock(&rq->lock); + int node = find_busiest_node(__cpu_to_node(this_cpu)); + unsigned long cpumask, this_cpumask = 1UL << this_cpu; + + if (node >= 0) { + cpumask = __node_to_cpu_mask(node) | this_cpumask; + spin_lock(&this_rq->lock); + load_balance(this_rq, idle, cpumask); + spin_unlock(&this_rq->lock); + } } +#endif +static void rebalance_tick(runqueue_t *this_rq, int idle) +{ +#if CONFIG_NUMA + int this_cpu = smp_processor_id(); +#endif + unsigned long j = jiffies; + + /* + * First do inter-node rebalancing, then intra-node rebalancing, + * if both events happen in the same tick. The inter-node + * rebalancing does not necessarily have to create a perfect + * balance within the node, since we load-balance the most loaded + * node with the current CPU. (ie. other CPUs in the local node + * are not balanced.) 
+ */ + if (idle) { +#if CONFIG_NUMA + if (!(j % IDLE_NODE_REBALANCE_TICK)) + balance_node(this_rq, idle, this_cpu); +#endif + if (!(j % IDLE_REBALANCE_TICK)) { + spin_lock(&this_rq->lock); + load_balance(this_rq, 0, __cpu_to_node_mask(this_cpu)); + spin_unlock(&this_rq->lock); + } + return; + } +#if CONFIG_NUMA + if (!(j % BUSY_NODE_REBALANCE_TICK)) + balance_node(this_rq, idle, this_cpu); +#endif + if (!(j % BUSY_REBALANCE_TICK)) { + spin_lock(&this_rq->lock); + load_balance(this_rq, idle, __cpu_to_node_mask(this_cpu)); + spin_unlock(&this_rq->lock); + } +} +#else +/* + * on UP we do not need to balance between CPUs: + */ +static inline void rebalance_tick(runqueue_t *this_rq, int idle) +{ +} #endif DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } }; @@ -1063,9 +1198,7 @@ void scheduler_tick(int user_ticks, int kstat_cpu(cpu).cpustat.iowait += sys_ticks; else kstat_cpu(cpu).cpustat.idle += sys_ticks; -#if CONFIG_SMP - idle_tick(rq); -#endif + rebalance_tick(rq, 1); return; } if (TASK_NICE(p) > 0) @@ -1121,11 +1254,8 @@ void scheduler_tick(int user_ticks, int enqueue_task(p, rq->active); } out: -#if CONFIG_SMP - if (!(jiffies % BUSY_REBALANCE_TICK)) - load_balance(rq, 0); -#endif spin_unlock(&rq->lock); + rebalance_tick(rq, 0); } void scheduling_functions_start_here(void) { } @@ -1133,19 +1263,20 @@ void scheduling_functions_start_here(voi /* * schedule() is the main scheduler function. */ -asmlinkage void schedule(void) +asmlinkage void do_schedule(void) { task_t *prev, *next; runqueue_t *rq; prio_array_t *array; struct list_head *queue; - int idx; + int idx, mycpu = smp_processor_id(); /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. 
*/ + schedstats[mycpu].sched_cnt++; if (likely(current->state != TASK_ZOMBIE)) { if (unlikely(in_atomic())) { printk(KERN_ERR "bad: scheduling while atomic!\n"); @@ -1184,7 +1315,8 @@ need_resched: pick_next_task: if (unlikely(!rq->nr_running)) { #if CONFIG_SMP - load_balance(rq, 1); + schedstats[mycpu].lb_resched++; + load_balance(rq, 1, __cpu_to_node_mask(smp_processor_id())); if (rq->nr_running) goto pick_next_task; #endif @@ -1198,11 +1330,13 @@ pick_next_task: /* * Switch the active and expired arrays. */ + schedstats[mycpu].sched_switch++; rq->active = rq->expired; rq->expired = array; array = rq->active; rq->expired_timestamp = 0; } + schedstats[mycpu].sched_noswitch++; idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; @@ -1367,6 +1501,22 @@ void complete_all(struct completion *x) spin_unlock_irqrestore(&x->wait.lock, flags); } +asmlinkage void user_schedule(void) +{ +#ifdef CONFIG_KGDB_THREAD + current->thread.kgdbregs = NULL; +#endif + do_schedule(); +} + +#ifdef CONFIG_KGDB_THREAD +asmlinkage void kern_do_schedule(struct pt_regs regs) +{ + current->thread.kgdbregs = &regs; + do_schedule(); +} +#endif + void wait_for_completion(struct completion *x) { might_sleep(); @@ -1859,6 +2009,7 @@ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); prio_array_t *array = current->array; + int mycpu = smp_processor_id(); /* * We implement yielding by moving the task into the expired @@ -1867,7 +2018,15 @@ asmlinkage long sys_sched_yield(void) * (special rule: RT tasks will just roundrobin in the active * array.) 
*/ + schedstats[mycpu].yld_cnt++; if (likely(!rt_task(current))) { + if (current->array->nr_active == 1) { + schedstats[mycpu].yld_act_empty++; + if (!rq->expired->nr_active) + schedstats[mycpu].yld_both_empty++; + } else if (!rq->expired->nr_active) { + schedstats[mycpu].yld_exp_empty++; + } dequeue_task(current, array); enqueue_task(current, rq->expired); } else { diff -urpN -X /home/fletch/.diff.exclude 00-virgin/kernel/sysctl.c 90-mjb/kernel/sysctl.c --- 00-virgin/kernel/sysctl.c Mon Dec 16 21:50:51 2002 +++ 90-mjb/kernel/sysctl.c Wed Feb 5 22:23:08 2003 @@ -55,6 +55,17 @@ extern char core_pattern[]; extern int cad_pid; extern int pid_max; extern int sysctl_lower_zone_protection; +extern int min_timeslice; +extern int max_timeslice; +extern int child_penalty; +extern int parent_penalty; +extern int exit_weight; +extern int prio_bonus_ratio; +extern int interactive_delta; +extern int max_sleep_avg; +extern int starvation_limit; +extern int idle_node_rebalance_ratio; +extern int busy_node_rebalance_ratio; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -112,6 +123,7 @@ static struct ctl_table_header root_tabl static ctl_table kern_table[]; static ctl_table vm_table[]; +static ctl_table sched_table[]; #ifdef CONFIG_NET extern ctl_table net_table[]; #endif @@ -156,6 +168,7 @@ static ctl_table root_table[] = { {CTL_FS, "fs", NULL, 0, 0555, fs_table}, {CTL_DEBUG, "debug", NULL, 0, 0555, debug_table}, {CTL_DEV, "dev", NULL, 0, 0555, dev_table}, + {CTL_SCHED, "sched", NULL, 0, 0555, sched_table}, {0} }; @@ -358,7 +371,46 @@ static ctl_table debug_table[] = { static ctl_table dev_table[] = { {0} -}; +}; + +static ctl_table sched_table[] = { + {SCHED_MAX_TIMESLICE, "max_timeslice", &max_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_MIN_TIMESLICE, "min_timeslice", &min_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, 
NULL, &one, NULL}, + {SCHED_CHILD_PENALTY, "child_penalty", &child_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PARENT_PENALTY, "parent_penalty", &parent_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_EXIT_WEIGHT, "exit_weight", &exit_weight, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PRIO_BONUS_RATIO, "prio_bonus_ratio", &prio_bonus_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_INTERACTIVE_DELTA, "interactive_delta", &interactive_delta, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_MAX_SLEEP_AVG, "max_sleep_avg", &max_sleep_avg, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_STARVATION_LIMIT, "starvation_limit", &starvation_limit, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_IDLE_NODE_REBALANCE_RATIO, "idle_node_rebalance_ratio", + &idle_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_BUSY_NODE_REBALANCE_RATIO, "busy_node_rebalance_ratio", + &busy_node_rebalance_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {0} +}; extern void init_irq_proc (void); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/kernel/time.c 90-mjb/kernel/time.c --- 00-virgin/kernel/time.c Sun Nov 17 20:29:28 2002 +++ 90-mjb/kernel/time.c Thu Feb 6 19:49:44 2003 @@ -27,7 +27,6 @@ #include #include #include - #include /* @@ -38,7 +37,7 @@ struct timezone sys_tz; /* The xtime_lock is not only serializing the xtime read/writes but it's also serializing all accesses to the global NTP variables now. 
*/ -extern rwlock_t xtime_lock; +extern frlock_t xtime_lock; extern unsigned long last_time_offset; #if !defined(__alpha__) && !defined(__ia64__) @@ -80,7 +79,7 @@ asmlinkage long sys_stime(int * tptr) return -EPERM; if (get_user(value, tptr)) return -EFAULT; - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); xtime.tv_sec = value; xtime.tv_nsec = 0; last_time_offset = 0; @@ -88,7 +87,7 @@ asmlinkage long sys_stime(int * tptr) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); return 0; } @@ -96,13 +95,13 @@ asmlinkage long sys_stime(int * tptr) asmlinkage long sys_gettimeofday(struct timeval *tv, struct timezone *tz) { - if (tv) { + if (likely(tv != NULL)) { struct timeval ktv; do_gettimeofday(&ktv); if (copy_to_user(tv, &ktv, sizeof(ktv))) return -EFAULT; } - if (tz) { + if (unlikely(tz != NULL)) { if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) return -EFAULT; } @@ -127,10 +126,10 @@ asmlinkage long sys_gettimeofday(struct */ inline static void warp_clock(void) { - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); xtime.tv_sec += sys_tz.tz_minuteswest * 60; last_time_offset = 0; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); } /* @@ -235,7 +234,7 @@ int do_adjtimex(struct timex *txc) txc->tick > 1100000/USER_HZ) return -EINVAL; - write_lock_irq(&xtime_lock); + fr_write_lock_irq(&xtime_lock); result = time_state; /* mostly `TIME_OK' */ /* Save for later - semantics of adjtime is to return old value */ @@ -386,7 +385,7 @@ leave: if ((time_status & (STA_UNSYNC|ST txc->errcnt = pps_errcnt; txc->stbcnt = pps_stbcnt; last_time_offset = 0; - write_unlock_irq(&xtime_lock); + fr_write_unlock_irq(&xtime_lock); do_gettimeofday(&txc->time); return(result); } @@ -409,9 +408,13 @@ asmlinkage long sys_adjtimex(struct time struct timespec current_kernel_time(void) { struct timespec now; - unsigned long flags; - 
read_lock_irqsave(&xtime_lock,flags); - now = xtime; - read_unlock_irqrestore(&xtime_lock,flags); + unsigned long seq; + + do { + seq = fr_read_begin(&xtime_lock); + + now = xtime; + } while (seq != fr_read_end(&xtime_lock)); + return now; } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/kernel/timer.c 90-mjb/kernel/timer.c --- 00-virgin/kernel/timer.c Mon Dec 16 21:50:51 2002 +++ 90-mjb/kernel/timer.c Thu Feb 6 19:49:44 2003 @@ -758,7 +758,7 @@ unsigned long wall_jiffies; * This read-write spinlock protects us from races in SMP while * playing with xtime and avenrun. */ -rwlock_t xtime_lock __cacheline_aligned_in_smp = RW_LOCK_UNLOCKED; +frlock_t xtime_lock __cacheline_aligned_in_smp = FR_LOCK_UNLOCKED; unsigned long last_time_offset; /* @@ -798,8 +798,7 @@ static inline void update_times(void) } /* - * The 64-bit jiffies value is not atomic - you MUST NOT read it - * without holding read_lock_irq(&xtime_lock). + * The 64-bit jiffies value is not atomic * jiffies is defined in the linker script... 
*/ @@ -1087,18 +1086,21 @@ asmlinkage long sys_sysinfo(struct sysin struct sysinfo val; unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; + unsigned long seq; memset((char *)&val, 0, sizeof(struct sysinfo)); - read_lock_irq(&xtime_lock); - val.uptime = jiffies / HZ; + do { + seq = fr_read_begin(&xtime_lock); - val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + val.uptime = jiffies / HZ; - val.procs = nr_threads; - read_unlock_irq(&xtime_lock); + val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); + val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); + val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + + val.procs = nr_threads; + } while (seq != fr_read_end(&xtime_lock)); si_meminfo(&val); si_swapinfo(&val); diff -urpN -X /home/fletch/.diff.exclude 00-virgin/mm/memory.c 90-mjb/mm/memory.c --- 00-virgin/mm/memory.c Mon Jan 13 21:09:28 2003 +++ 90-mjb/mm/memory.c Wed Feb 5 22:23:00 2003 @@ -101,8 +101,7 @@ static inline void free_one_pmd(struct m static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) { - int j; - pmd_t * pmd; + pmd_t * pmd, * md, * emd; if (pgd_none(*dir)) return; @@ -113,8 +112,21 @@ static inline void free_one_pgd(struct m } pmd = pmd_offset(dir, 0); pgd_clear(dir); - for (j = 0; j < PTRS_PER_PMD ; j++) - free_one_pmd(tlb, pmd+j); + /* + * Beware if changing the loop below. It once used int j, + * for (j = 0; j < PTRS_PER_PMD; j++) + * free_one_pmd(pmd+j); + * but some older i386 compilers (e.g. egcs-2.91.66, gcc-2.95.3) + * terminated the loop with a _signed_ address comparison + * using "jle", when configured for HIGHMEM64GB (X86_PAE). + * If also configured for 3GB of kernel virtual address space, + * if page at physical 0x3ffff000 virtual 0x7ffff000 is used as + * a pmd, when that mm exits the loop goes on to free "entries" + * found at 0x80000000 onwards. 
The loop below compiles instead + * to be terminated by unsigned address comparison using "jb". + */ + for (md = pmd, emd = pmd + PTRS_PER_PMD; md < emd; md++) + free_one_pmd(tlb,md); pmd_free_tlb(tlb, pmd); } diff -urpN -X /home/fletch/.diff.exclude 00-virgin/net/ipv4/tcp_output.c 90-mjb/net/ipv4/tcp_output.c --- 00-virgin/net/ipv4/tcp_output.c Sun Nov 17 20:29:50 2002 +++ 90-mjb/net/ipv4/tcp_output.c Thu Feb 6 19:49:48 2003 @@ -786,13 +786,13 @@ static void tcp_retrans_try_collapse(str /* Ok. We will be able to collapse the packet. */ __skb_unlink(next_skb, next_skb->list); + memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); + if (next_skb->ip_summed == CHECKSUM_HW) skb->ip_summed = CHECKSUM_HW; - if (skb->ip_summed != CHECKSUM_HW) { - memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); + if (skb->ip_summed != CHECKSUM_HW) skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); - } /* Update sequence range on original skb. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;