diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/arm/mm/mm-armv.c xx/arch/arm/mm/mm-armv.c
--- 2.6.5-rc2/arch/arm/mm/mm-armv.c	2004-03-11 08:27:21.000000000 +0100
+++ xx/arch/arm/mm/mm-armv.c	2004-03-21 15:21:42.000000000 +0100
@@ -19,7 +19,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/i386/Kconfig xx/arch/i386/Kconfig
--- 2.6.5-rc2/arch/i386/Kconfig	2004-03-21 15:09:23.000000000 +0100
+++ xx/arch/i386/Kconfig	2004-03-21 15:21:40.000000000 +0100
@@ -1282,14 +1282,195 @@ config DEBUG_SPINLOCK_SLEEP
 	  If you say Y here, various routines which may sleep will become very
 	  noisy if they are called with a spinlock held.
 
+config KGDB
+	bool "Include kgdb kernel debugger"
+	depends on DEBUG_KERNEL
+	help
+	  If you say Y here, the system will be compiled with the debug
+	  option (-g) and a debugging stub will be included in the
+	  kernel.  This stub communicates with gdb on another (host)
+	  computer via a serial port.  The host computer should have
+	  access to the kernel binary file (vmlinux) and a serial port
+	  that is connected to the target machine.  Gdb can be made to
+	  configure the serial port, or you can use stty and setserial
+	  to do this.  See the 'target' command in gdb.  This option
+	  also builds in the ability to request a breakpoint early in
+	  the boot process.  To request the breakpoint just include
+	  'kgdb' as a boot option when booting the target machine.  The
+	  system will then break as soon as it looks at the boot
+	  options.  This option also installs a breakpoint in panic and
+	  sends any kernel faults to the debugger.  For more information
+	  see the Documentation/i386/kgdb/kgdb.txt file.
+
+choice
+	depends on KGDB
+	prompt "Debug serial port BAUD"
+	default KGDB_115200BAUD
+	help
+	  Gdb and the kernel stub need to agree on the baud rate to be
+	  used.  Some systems (x86 family at this writing) allow this to
+	  be configured.
+
+config KGDB_9600BAUD
+	bool "9600"
+
+config KGDB_19200BAUD
+	bool "19200"
+
+config KGDB_38400BAUD
+	bool "38400"
+
+config KGDB_57600BAUD
+	bool "57600"
+
+config KGDB_115200BAUD
+	bool "115200"
+endchoice
+
+config KGDB_PORT
+	hex "hex I/O port address of the debug serial port"
+	depends on KGDB
+	default 3f8
+	help
+	  Some systems (x86 family at this writing) allow the port
+	  address to be configured.  The number entered is assumed to be
+	  hex, don't put 0x in front of it.  The standard addresses are:
+	  COM1 at 3f8 (irq 4) and COM2 at 2f8 (irq 3).  Setserial
+	  /dev/ttySx will tell you what you have.  It is good to test
+	  the serial connection with a live system before trying to
+	  debug.
+
+config KGDB_IRQ
+	int "IRQ of the debug serial port"
+	depends on KGDB
+	default 4
+	help
+	  This is the irq for the debug port.  If everything is working
+	  correctly and the kernel has interrupts enabled, a control-C
+	  sent to the port should cause a break into the kernel debug
+	  stub.
+
+config DEBUG_INFO
+	bool
+	depends on KGDB
+	default y
+
+config KGDB_MORE
+	bool "Add any additional compile options"
+	depends on KGDB
+	default n
+	help
+	  Saying yes here turns on the ability to enter additional
+	  compile options.
+
+config KGDB_OPTIONS
+	depends on KGDB_MORE
+	string "Additional compile arguments"
+	default "-O1"
+	help
+	  This option allows you to enter additional compile options for
+	  the whole kernel compile.  Each platform will have a default
+	  that seems right for it.  For example on PPC "-ggdb -O1", and
+	  for i386 "-O1".  Note that by configuring KGDB "-g" is already
+	  turned on.  In addition, on i386 platforms
+	  "-fomit-frame-pointer" is deleted from the standard compile
+	  options.
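+
+# For reference, a minimal host-side gdb session for the KGDB option
+# above might look like the following (a sketch: the tty device, the
+# baud rate, and the use of gdb's "set remotebaud" command are
+# assumptions for illustration; see Documentation/i386/kgdb/kgdb.txt
+# for the full recipe):
+#
+#	host$ gdb vmlinux
+#	(gdb) set remotebaud 115200
+#	(gdb) target remote /dev/ttyS0
+#
+# The target must already be waiting in the stub, e.g. booted with the
+# 'kgdb' boot option or stopped at a breakpoint.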
+
+config NO_KGDB_CPUS
+	int "Number of CPUs"
+	depends on KGDB && SMP
+	default NR_CPUS
+	help
+	  This option sets the number of cpus for kgdb ONLY.  It is used
+	  to prune some internal structures so they look "nice" when
+	  displayed with gdb.  This is to overcome possibly larger
+	  numbers that may have been entered above.  Enter the real
+	  number to get nice clean kgdb_info displays.
+
+config KGDB_TS
+	bool "Enable kgdb time stamp macros?"
+	depends on KGDB
+	default n
+	help
+	  Kgdb event macros allow you to instrument your code with calls
+	  to the kgdb event recording function.  The event log may be
+	  examined with gdb at a break point.  Turning on this
+	  capability also allows you to choose how many events to
+	  keep.  Kgdb always keeps the latest events.
+
+choice
+	depends on KGDB_TS
+	prompt "Max number of time stamps to save?"
+	default KGDB_TS_128
+
+config KGDB_TS_64
+	bool "64"
+
+config KGDB_TS_128
+	bool "128"
+
+config KGDB_TS_256
+	bool "256"
+
+config KGDB_TS_512
+	bool "512"
+
+config KGDB_TS_1024
+	bool "1024"
+
+endchoice
+
+config STACK_OVERFLOW_TEST
+	bool "Turn on kernel stack overflow testing?"
+	depends on KGDB
+	default n
+	help
+	  This option enables code in the front-line interrupt handlers
+	  to check for kernel stack overflow on interrupts and system
+	  calls.  This is part of the kgdb code on x86 systems.
+
+config KGDB_CONSOLE
+	bool "Enable serial console through the kgdb port"
+	depends on KGDB
+	default n
+	help
+	  This option enables the command line "console=kgdb" option.
+	  When the system is booted with this option on the command
+	  line, all kernel printk output is sent to gdb (as well as to
+	  other consoles).  For this to work gdb must be connected.  For
+	  this reason, this command line option will generate a
+	  breakpoint if gdb has not yet connected.  After the gdb
+	  continue command is given, all pent-up console output will be
+	  printed by gdb on the host machine.  Neither this option nor
+	  KGDB requires the serial driver to be configured.
+
+config KGDB_SYSRQ
+	bool "Turn on SysRq 'G' command to do a break?"
+	depends on KGDB
+	default y
+	help
+	  This option adds a SysRq command, SysRq G, which generates a
+	  breakpoint into the KGDB stub.  This will work if the keyboard
+	  is alive and can interrupt the system.  Because of constraints
+	  on when the serial port interrupt can be enabled, this code
+	  may allow you to interrupt the system before the serial port
+	  control-C is available.  Just say yes here.
+
 config FRAME_POINTER
 	bool "Compile the kernel with frame pointers"
+	default KGDB
 	help
 	  If you say Y here the resulting kernel image will be slightly larger
 	  and slower, but it will give very useful debugging information.
 	  If you don't debug the kernel, you can say N, but we may not be able
 	  to solve problems without frame pointers.
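+
+# For reference, the STACK_OVERFLOW_TEST hook added to entry.S below
+# (testl $7680,%esp) behaves roughly like this C sketch, assuming the
+# 8 KB kernel stacks of this kernel, aligned on an 8 KB boundary so
+# that the low 13 bits of %esp are the offset into the stack:
+#
+#	if ((esp & 7680) == 0)		/* 7680 == 0x1e00: bits 9..12 */
+#		stack_overflow();	/* under 512 bytes remaining */
+#
+# The stack grows down, so a small offset means it is nearly exhausted.
+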
+config MAGIC_SYSRQ + bool + depends on KGDB_SYSRQ + default y + config X86_FIND_SMP_CONFIG bool depends on X86_LOCAL_APIC || X86_VOYAGER diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/i386/kernel/entry.S xx/arch/i386/kernel/entry.S --- 2.6.5-rc2/arch/i386/kernel/entry.S 2004-03-11 08:27:22.000000000 +0100 +++ xx/arch/i386/kernel/entry.S 2004-03-21 15:21:40.000000000 +0100 @@ -48,6 +48,18 @@ #include #include #include "irq_vectors.h" + /* We do not recover from a stack overflow, but at least + * we know it happened and should be able to track it down. + */ +#ifdef CONFIG_STACK_OVERFLOW_TEST +#define STACK_OVERFLOW_TEST \ + testl $7680,%esp; \ + jnz 10f; \ + call stack_overflow; \ +10: +#else +#define STACK_OVERFLOW_TEST +#endif #define nr_syscalls ((syscall_table_size)/4) @@ -100,7 +112,8 @@ TSS_ESP0_OFFSET = (4 - 0x200) pushl %ebx; \ movl $(__USER_DS), %edx; \ movl %edx, %ds; \ - movl %edx, %es; + movl %edx, %es; \ + STACK_OVERFLOW_TEST #define RESTORE_INT_REGS \ popl %ebx; \ @@ -300,6 +313,19 @@ syscall_exit: testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work restore_all: +#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS + movl EFLAGS(%esp), %eax # mix EFLAGS and CS + movb CS(%esp), %al + testl $(VM_MASK | 3), %eax + jz resume_kernelX # returning to kernel or vm86-space + + cmpl $0,TI_PRE_COUNT(%ebx) # non-zero preempt_count ? + jz resume_kernelX + + int $3 + +resume_kernelX: +#endif RESTORE_ALL # perform work that needs to be done immediately before resumption diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/i386/kernel/kgdb_stub.c xx/arch/i386/kernel/kgdb_stub.c --- 2.6.5-rc2/arch/i386/kernel/kgdb_stub.c 1970-01-01 01:00:00.000000000 +0100 +++ xx/arch/i386/kernel/kgdb_stub.c 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,2334 @@ +/* + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +/* + * Copyright (c) 2000 VERITAS Software Corporation. + * + */ +/**************************************************************************** + * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ + * + * Module name: remcom.c $ + * Revision: 1.34 $ + * Date: 91/03/09 12:29:49 $ + * Contributor: Lake Stevens Instrument Division$ + * + * Description: low level support for gdb debugger. $ + * + * Considerations: only works on target hardware $ + * + * Written by: Glenn Engel $ + * Updated by: David Grothe + * ModuleState: Experimental $ + * + * NOTES: See Below $ + * + * Modified for 386 by Jim Kingdon, Cygnus Support. + * Compatibility with 2.1.xx kernel by David Grothe + * + * Changes to allow auto initilization. All that is needed is that it + * be linked with the kernel and a break point (int 3) be executed. + * The header file defines BREAKPOINT to allow one to do + * this. It should also be possible, once the interrupt system is up, to + * call putDebugChar("+"). Once this is done, the remote debugger should + * get our attention by sending a ^C in a packet. 
George Anzinger
+ *
+ *  Integrated into 2.2.5 kernel by Tigran Aivazian
+ *  Added thread support, support for multiple processors,
+ *  support for ia-32(x86) hardware debugging.
+ *  Amit S. Kale ( akale@veritas.com )
+ *
+ *
+ *  To enable debugger support, two things need to happen.  One, a
+ *  call to set_debug_traps() is necessary in order to allow any breakpoints
+ *  or error conditions to be properly intercepted and reported to gdb.
+ *  Two, a breakpoint needs to be generated to begin communication.  This
+ *  is most easily accomplished by a call to breakpoint().  Breakpoint()
+ *  simulates a breakpoint by executing an int 3.
+ *
+ *************
+ *
+ *    The following gdb commands are supported:
+ *
+ * command          function                               Return value
+ *
+ *    g             return the value of the CPU registers  hex data or ENN
+ *    G             set the value of the CPU registers     OK or ENN
+ *
+ *    mAA..AA,LLLL  Read LLLL bytes at address AA..AA      hex data or ENN
+ *    MAA..AA,LLLL: Write LLLL bytes at address AA.AA      OK or ENN
+ *
+ *    c             Resume at current address              SNN   ( signal NN)
+ *    cAA..AA       Continue at address AA..AA             SNN
+ *
+ *    s             Step one instruction                   SNN
+ *    sAA..AA       Step one instruction from AA..AA       SNN
+ *
+ *    k             kill
+ *
+ *    ?             What was the last sigval ?             SNN   (signal NN)
+ *
+ * All commands and responses are sent with a packet which includes a
+ * checksum.  A packet consists of
+ *
+ * $<packet info>#<checksum>.
+ *
+ * where
+ * <packet info> :: <characters representing the command or response>
+ * <checksum>    :: <two hex digits computed as modulo 256 sum of <packet info>>
+ *
+ * When a packet is received, it is first acknowledged with either '+' or '-'.
+ * '+' indicates a successful transfer.  '-' indicates a failed transfer.
+ *
+ * Example:
+ *
+ * Host:                  Reply:
+ * $m0,10#2a               +$00010203040506070809101112131415#42
+ *
+ ****************************************************************************/
+#define KGDB_VERSION "<20030915.1651.33>"
+#include
+#include
+#include		/* for strcpy */
+#include
+#include
+#include
+#include
+#include		/* for linux pt_regs struct */
+#include
+#include
+#include
+#include
+#include
+#include
+
+/************************************************************************
+ *
+ * external low-level support routines
+ */
+typedef void (*Function) (void);	/* pointer to a function */
+
+/* Thread reference */
+typedef unsigned char threadref[8];
+
+extern void putDebugChar(int);	/* write a single character      */
+extern int getDebugChar(void);	/* read and return a single char */
+
+/************************************************************************/
+/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/
+/* at least NUMREGBYTES*2 are needed for register packets */
+/* Longer buffer is needed to list all threads */
+#define BUFMAX 400
+
+char *kgdb_version = KGDB_VERSION;
+
+/* debug > 0 prints ill-formed commands in valid packets & checksum errors */
+int debug_regs = 0;		/* set to non-zero to print registers */
+
+/* filled in by an external module */
+char *gdb_module_offsets;
+
+static const char hexchars[] = "0123456789abcdef";
+
+/* Number of bytes of registers. */
+#define NUMREGBYTES 64
+/*
+ * Note that this register image is in a different order than
+ * the register image that Linux produces at interrupt time.
+ *
+ * Linux's register image is defined by struct pt_regs in ptrace.h.
+ * Just why GDB uses a different order is a historical mystery.
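+ *
+ * For example: gdb expects EAX in slot 0 and ESP in slot 4 of the
+ * image below, while struct pt_regs starts with ebx and keeps esp/xss
+ * at its very end (and then only valid for traps from user mode).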
+ */ +enum regnames { _EAX, /* 0 */ + _ECX, /* 1 */ + _EDX, /* 2 */ + _EBX, /* 3 */ + _ESP, /* 4 */ + _EBP, /* 5 */ + _ESI, /* 6 */ + _EDI, /* 7 */ + _PC /* 8 also known as eip */ , + _PS /* 9 also known as eflags */ , + _CS, /* 10 */ + _SS, /* 11 */ + _DS, /* 12 */ + _ES, /* 13 */ + _FS, /* 14 */ + _GS /* 15 */ +}; + +/*************************** ASSEMBLY CODE MACROS *************************/ +/* + * Put the error code here just in case the user cares. + * Likewise, the vector number here (since GDB only gets the signal + * number through the usual means, and that's not very specific). + * The called_from is the return address so he can tell how we entered kgdb. + * This will allow him to seperate out the various possible entries. + */ +#define REMOTE_DEBUG 0 /* set != to turn on printing (also available in info) */ + +#define PID_MAX PID_MAX_DEFAULT + +#ifdef CONFIG_SMP +void smp_send_nmi_allbutself(void); +#define IF_SMP(x) x +#undef MAX_NO_CPUS +#ifndef CONFIG_NO_KGDB_CPUS +#define CONFIG_NO_KGDB_CPUS 2 +#endif +#if CONFIG_NO_KGDB_CPUS > NR_CPUS +#define MAX_NO_CPUS NR_CPUS +#else +#define MAX_NO_CPUS CONFIG_NO_KGDB_CPUS +#endif +#define hold_init hold_on_sstep: 1, +#define MAX_CPU_MASK (unsigned long)((1LL << MAX_NO_CPUS) - 1LL) +#define NUM_CPUS num_online_cpus() +#else +#define IF_SMP(x) +#define hold_init +#undef MAX_NO_CPUS +#define MAX_NO_CPUS 1 +#define NUM_CPUS 1 +#endif +#define NOCPU (struct task_struct *)0xbad1fbad +/* *INDENT-OFF* */ +struct kgdb_info { + int used_malloc; + void *called_from; + long long entry_tsc; + int errcode; + int vector; + int print_debug_info; +#ifdef CONFIG_SMP + int hold_on_sstep; + struct { + volatile struct task_struct *task; + int pid; + int hold; + struct pt_regs *regs; + } cpus_waiting[MAX_NO_CPUS]; +#endif +} kgdb_info = {hold_init print_debug_info:REMOTE_DEBUG, vector:-1}; + +/* *INDENT-ON* */ + +#define used_m kgdb_info.used_malloc +/* + * This is little area we set aside to contain the stack we + * need to build to allow gdb to call functions. We use one + * per cpu to avoid locking issues. We will do all this work + * with interrupts off so that should take care of the protection + * issues. + */ +#define LOOKASIDE_SIZE 200 /* should be more than enough */ +#define MALLOC_MAX 200 /* Max malloc size */ +struct { + unsigned int esp; + int array[LOOKASIDE_SIZE]; +} fn_call_lookaside[MAX_NO_CPUS]; + +static int trap_cpu; +static unsigned int OLD_esp; + +#define END_OF_LOOKASIDE &fn_call_lookaside[trap_cpu].array[LOOKASIDE_SIZE] +#define IF_BIT 0x200 +#define TF_BIT 0x100 + +#define MALLOC_ROUND 8-1 + +static char malloc_array[MALLOC_MAX]; +IF_SMP(static void to_gdb(const char *mess)); +void * +malloc(int size) +{ + + if (size <= (MALLOC_MAX - used_m)) { + int old_used = used_m; + used_m += ((size + MALLOC_ROUND) & (~MALLOC_ROUND)); + return &malloc_array[old_used]; + } else { + return NULL; + } +} + +/* + * Gdb calls functions by pushing agruments, including a return address + * on the stack and the adjusting EIP to point to the function. The + * whole assumption in GDB is that we are on a different stack than the + * one the "user" i.e. code that hit the break point, is on. This, of + * course is not true in the kernel. Thus various dodges are needed to + * do the call without directly messing with EIP (which we can not change + * as it is just a location and not a register. To adjust it would then + * require that we move every thing below EIP up or down as needed. 
This + * will not work as we may well have stack relative pointer on the stack + * (such as the pointer to regs, for example). + + * So here is what we do: + * We detect gdb attempting to store into the stack area and instead, store + * into the fn_call_lookaside.array at the same relative location as if it + * were the area ESP pointed at. We also trap ESP modifications + * and uses these to adjust fn_call_lookaside.esp. On entry + * fn_call_lookaside.esp will be set to point at the last entry in + * fn_call_lookaside.array. This allows us to check if it has changed, and + * if so, on exit, we add the registers we will use to do the move and a + * trap/ interrupt return exit sequence. We then adjust the eflags in the + * regs array (remember we now have a copy in the fn_call_lookaside.array) to + * kill the interrupt bit, AND we change EIP to point at our set up stub. + * As part of the register set up we preset the registers to point at the + * begining and end of the fn_call_lookaside.array, so all the stub needs to + * do is move words from the array to the stack until ESP= the desired value + * then do the rti. This will then transfer to the desired function with + * all the correct registers. Nifty huh? + */ +extern asmlinkage void fn_call_stub(void); +extern asmlinkage void fn_rtn_stub(void); +/* *INDENT-OFF* */ +__asm__("fn_rtn_stub:\n\t" + "movl %eax,%esp\n\t" + "fn_call_stub:\n\t" + "1:\n\t" + "addl $-4,%ebx\n\t" + "movl (%ebx), %eax\n\t" + "pushl %eax\n\t" + "cmpl %esp,%ecx\n\t" + "jne 1b\n\t" + "popl %eax\n\t" + "popl %ebx\n\t" + "popl %ecx\n\t" + "iret \n\t"); +/* *INDENT-ON* */ +#define gdb_i386vector kgdb_info.vector +#define gdb_i386errcode kgdb_info.errcode +#define waiting_cpus kgdb_info.cpus_waiting +#define remote_debug kgdb_info.print_debug_info +#define hold_cpu(cpu) kgdb_info.cpus_waiting[cpu].hold +/* gdb locks */ + +#ifdef CONFIG_SMP +static int in_kgdb_called; +static spinlock_t waitlocks[MAX_NO_CPUS] = + {[0 ... MAX_NO_CPUS - 1] = SPIN_LOCK_UNLOCKED }; +/* + * The following array has the thread pointer of each of the "other" + * cpus. We make it global so it can be seen by gdb. + */ +volatile int in_kgdb_entry_log[MAX_NO_CPUS]; +volatile struct pt_regs *in_kgdb_here_log[MAX_NO_CPUS]; +/* +static spinlock_t continuelocks[MAX_NO_CPUS]; +*/ +spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +/* waiters on our spinlock plus us */ +static atomic_t spinlock_waiters = ATOMIC_INIT(1); +static int spinlock_count = 0; +static int spinlock_cpu = 0; +/* + * Note we use nested spin locks to account for the case where a break + * point is encountered when calling a function by user direction from + * kgdb. Also there is the memory exception recursion to account for. + * Well, yes, but this lets other cpus thru too. Lets add a + * cpu id to the lock. + */ +#define KGDB_SPIN_LOCK(x) if( spinlock_count == 0 || \ + spinlock_cpu != smp_processor_id()){\ + atomic_inc(&spinlock_waiters); \ + while (! 
spin_trylock(x)) {\
+				in_kgdb(&regs);\
+			}\
+			atomic_dec(&spinlock_waiters); \
+			spinlock_count = 1; \
+			spinlock_cpu = smp_processor_id(); \
+		}else{ \
+			spinlock_count++; \
+		}
+#define KGDB_SPIN_UNLOCK(x) if( --spinlock_count == 0) spin_unlock(x)
+#else
+unsigned kgdb_spinlock = 0;
+#define KGDB_SPIN_LOCK(x) --*x
+#define KGDB_SPIN_UNLOCK(x) ++*x
+#endif
+
+int
+hex(char ch)
+{
+	if ((ch >= 'a') && (ch <= 'f'))
+		return (ch - 'a' + 10);
+	if ((ch >= '0') && (ch <= '9'))
+		return (ch - '0');
+	if ((ch >= 'A') && (ch <= 'F'))
+		return (ch - 'A' + 10);
+	return (-1);
+}
+
+/* scan for the sequence $<data>#<checksum> */
+void
+getpacket(char *buffer)
+{
+	unsigned char checksum;
+	unsigned char xmitcsum;
+	int i;
+	int count;
+	char ch;
+
+	do {
+		/* wait around for the start character, ignore all other characters */
+		while ((ch = (getDebugChar() & 0x7f)) != '$') ;
+		checksum = 0;
+		xmitcsum = -1;
+
+		count = 0;
+
+		/* now, read until a # or end of buffer is found */
+		while (count < BUFMAX) {
+			ch = getDebugChar() & 0x7f;
+			if (ch == '#')
+				break;
+			checksum = checksum + ch;
+			buffer[count] = ch;
+			count = count + 1;
+		}
+		buffer[count] = 0;
+
+		if (ch == '#') {
+			xmitcsum = hex(getDebugChar() & 0x7f) << 4;
+			xmitcsum += hex(getDebugChar() & 0x7f);
+			if ((remote_debug) && (checksum != xmitcsum)) {
+				printk
+				    ("bad checksum.  My count = 0x%x, sent=0x%x. buf=%s\n",
+				     checksum, xmitcsum, buffer);
+			}
+
+			if (checksum != xmitcsum)
+				putDebugChar('-');	/* failed checksum */
+			else {
+				putDebugChar('+');	/* successful transfer */
+				/* if a sequence char is present, reply the sequence ID */
+				if (buffer[2] == ':') {
+					putDebugChar(buffer[0]);
+					putDebugChar(buffer[1]);
+					/* remove sequence chars from buffer */
+					count = strlen(buffer);
+					for (i = 3; i <= count; i++)
+						buffer[i - 3] = buffer[i];
+				}
+			}
+		}
+	} while (checksum != xmitcsum);
+
+	if (remote_debug)
+		printk("R:%s\n", buffer);
+}
+
+/* send the packet in buffer. */
+
+void
+putpacket(char *buffer)
+{
+	unsigned char checksum;
+	int count;
+	char ch;
+
+	/*  $<packet info>#<checksum>.
*/
+	do {
+		if (remote_debug)
+			printk("T:%s\n", buffer);
+		putDebugChar('$');
+		checksum = 0;
+		count = 0;
+
+		while ((ch = buffer[count])) {
+			putDebugChar(ch);
+			checksum += ch;
+			count += 1;
+		}
+
+		putDebugChar('#');
+		putDebugChar(hexchars[checksum >> 4]);
+		putDebugChar(hexchars[checksum % 16]);
+
+	} while ((getDebugChar() & 0x7f) != '+');
+
+}
+
+static char remcomInBuffer[BUFMAX];
+static char remcomOutBuffer[BUFMAX];
+static short error;
+
+void
+debug_error(char *format, char *parm)
+{
+	if (remote_debug)
+		printk(format, parm);
+}
+
+static void
+print_regs(struct pt_regs *regs)
+{
+	printk("EAX=%08lx ", regs->eax);
+	printk("EBX=%08lx ", regs->ebx);
+	printk("ECX=%08lx ", regs->ecx);
+	printk("EDX=%08lx ", regs->edx);
+	printk("\n");
+	printk("ESI=%08lx ", regs->esi);
+	printk("EDI=%08lx ", regs->edi);
+	printk("EBP=%08lx ", regs->ebp);
+	printk("ESP=%08lx ", (long) &regs->esp);
+	printk("\n");
+	printk(" DS=%08x ", regs->xds);
+	printk(" ES=%08x ", regs->xes);
+	printk(" SS=%08x ", __KERNEL_DS);
+	printk(" FL=%08lx ", regs->eflags);
+	printk("\n");
+	printk(" CS=%08x ", regs->xcs);
+	printk(" IP=%08lx ", regs->eip);
+#if 0
+	printk(" FS=%08x ", regs->fs);
+	printk(" GS=%08x ", regs->gs);
+#endif
+	printk("\n");
+
+}				/* print_regs */
+
+#define NEW_esp fn_call_lookaside[trap_cpu].esp
+
+static void
+regs_to_gdb_regs(int *gdb_regs, struct pt_regs *regs)
+{
+	gdb_regs[_EAX] = regs->eax;
+	gdb_regs[_EBX] = regs->ebx;
+	gdb_regs[_ECX] = regs->ecx;
+	gdb_regs[_EDX] = regs->edx;
+	gdb_regs[_ESI] = regs->esi;
+	gdb_regs[_EDI] = regs->edi;
+	gdb_regs[_EBP] = regs->ebp;
+	gdb_regs[_DS] = regs->xds;
+	gdb_regs[_ES] = regs->xes;
+	gdb_regs[_PS] = regs->eflags;
+	gdb_regs[_CS] = regs->xcs;
+	gdb_regs[_PC] = regs->eip;
+	/* Note, as we are debugging the kernel, we will always
+	 * trap in kernel code, which means no privilege change,
+	 * and so the pt_regs structure is not completely valid.  In a
+	 * no-privilege-change trap, only EFLAGS, CS and EIP are put on
+	 * the stack; SS and ESP are not stacked.  This means that the
+	 * last 2 elements of pt_regs are not valid (they would normally
+	 * refer to the user stack).  Also, using regs+1 is no good
+	 * because you end up with a value that is 2 longs (8) too high.
+	 * This used to cause stepping over functions to fail, so my fix
+	 * is to use the address of regs->esp, which should point at the
+	 * end of the stack frame.  Note I have ignored completely
+	 * exceptions that cause an error code to be stacked, such as
+	 * double fault.  Stuart Hughes, Zentropix.
+	 * original code: gdb_regs[_ESP] = (int) (regs + 1) ;
+
+	 * this is now done on entry and moved to OLD_esp (as well as
+	 * NEW_esp).
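+
+	 * To illustrate the layout this relies on (assuming a trap from
+	 * kernel mode with no error code): the hardware pushes only
+	 * EFLAGS, CS and EIP, so
+	 *
+	 *	kernel_esp = (unsigned long) &regs->esp;
+	 *
+	 * is the interrupted stack top, while (regs + 1) would overshoot
+	 * it by the 8 bytes reserved for the never-pushed ESP/SS pair.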
+ */ + gdb_regs[_ESP] = NEW_esp; + gdb_regs[_SS] = __KERNEL_DS; + gdb_regs[_FS] = 0xFFFF; + gdb_regs[_GS] = 0xFFFF; +} /* regs_to_gdb_regs */ + +static void +gdb_regs_to_regs(int *gdb_regs, struct pt_regs *regs) +{ + regs->eax = gdb_regs[_EAX]; + regs->ebx = gdb_regs[_EBX]; + regs->ecx = gdb_regs[_ECX]; + regs->edx = gdb_regs[_EDX]; + regs->esi = gdb_regs[_ESI]; + regs->edi = gdb_regs[_EDI]; + regs->ebp = gdb_regs[_EBP]; + regs->xds = gdb_regs[_DS]; + regs->xes = gdb_regs[_ES]; + regs->eflags = gdb_regs[_PS]; + regs->xcs = gdb_regs[_CS]; + regs->eip = gdb_regs[_PC]; + NEW_esp = gdb_regs[_ESP]; /* keep the value */ +#if 0 /* can't change these */ + regs->esp = gdb_regs[_ESP]; + regs->xss = gdb_regs[_SS]; + regs->fs = gdb_regs[_FS]; + regs->gs = gdb_regs[_GS]; +#endif + +} /* gdb_regs_to_regs */ +extern void scheduling_functions_start_here(void); +extern void scheduling_functions_end_here(void); +#define first_sched ((unsigned long) scheduling_functions_start_here) +#define last_sched ((unsigned long) scheduling_functions_end_here) + +int thread_list = 0; + +void +get_gdb_regs(struct task_struct *p, struct pt_regs *regs, int *gdb_regs) +{ + unsigned long stack_page; + int count = 0; + IF_SMP(int i); + if (!p || p == current) { + regs_to_gdb_regs(gdb_regs, regs); + return; + } +#ifdef CONFIG_SMP + for (i = 0; i < MAX_NO_CPUS; i++) { + if (p == kgdb_info.cpus_waiting[i].task) { + regs_to_gdb_regs(gdb_regs, + kgdb_info.cpus_waiting[i].regs); + gdb_regs[_ESP] = + (int) &kgdb_info.cpus_waiting[i].regs->esp; + + return; + } + } +#endif + memset(gdb_regs, 0, NUMREGBYTES); + gdb_regs[_ESP] = p->thread.esp; + gdb_regs[_PC] = p->thread.eip; + gdb_regs[_EBP] = *(int *) gdb_regs[_ESP]; + gdb_regs[_EDI] = *(int *) (gdb_regs[_ESP] + 4); + gdb_regs[_ESI] = *(int *) (gdb_regs[_ESP] + 8); + +/* + * This code is to give a more informative notion of where a process + * is waiting. It is used only when the user asks for a thread info + * list. If he then switches to the thread, s/he will find the task + * is in schedule, but a back trace should show the same info we come + * up with. This code was shamelessly purloined from process.c. It was + * then enhanced to provide more registers than simply the program + * counter. + */ + + if (!thread_list) { + return; + } + + if (p->state == TASK_RUNNING) + return; + stack_page = (unsigned long) p->thread_info; + if (gdb_regs[_ESP] < stack_page || gdb_regs[_ESP] > + THREAD_SIZE - sizeof(long) + stack_page) + return; + /* include/asm-i386/system.h:switch_to() pushes ebp last. */ + do { + if (gdb_regs[_EBP] < stack_page || + gdb_regs[_EBP] > THREAD_SIZE - 2*sizeof(long) + stack_page) + return; + gdb_regs[_PC] = *(unsigned long *) (gdb_regs[_EBP] + 4); + gdb_regs[_ESP] = gdb_regs[_EBP] + 8; + gdb_regs[_EBP] = *(unsigned long *) gdb_regs[_EBP]; + if (gdb_regs[_PC] < first_sched || gdb_regs[_PC] >= last_sched) + return; + } while (count++ < 16); + return; +} + +/* Indicate to caller of mem2hex or hex2mem that there has been an + error. 
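+
+   For illustration, mem2hex and hex2mem below drive these flags as
+   follows (a sketch of the guarded access, not additional code):
+
+	mem_err_expected = 1;		arm the fault handler
+	mem_err = 0;
+	ch = get_char(addr);		may fault; the trap sets mem_err
+	if (mem_err) ...		truncate and report E03 to gdb
+	mem_err_expected = 0;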
*/ +static volatile int mem_err = 0; +static volatile int mem_err_expected = 0; +static volatile int mem_err_cnt = 0; +static int garbage_loc = -1; + +int +get_char(char *addr) +{ + return *addr; +} + +void +set_char(char *addr, int val, int may_fault) +{ + /* + * This code traps references to the area mapped to the kernel + * stack as given by the regs and, instead, stores to the + * fn_call_lookaside[cpu].array + */ + if (may_fault && + (unsigned int) addr < OLD_esp && + ((unsigned int) addr > (OLD_esp - (unsigned int) LOOKASIDE_SIZE))) { + addr = (char *) END_OF_LOOKASIDE - ((char *) OLD_esp - addr); + } + *addr = val; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* return a pointer to the last char put in buf (null) */ +/* If MAY_FAULT is non-zero, then we should set mem_err in response to + a fault; if zero treat a fault like any other fault in the stub. */ +char * +mem2hex(char *mem, char *buf, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + /* printk("%lx = ", mem) ; */ + + ch = get_char(mem++); + + /* printk("%02x\n", ch & 0xFF) ; */ + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault fetching from addr %lx\n", + (long) (mem - 1)); + *buf = 0; /* truncate buffer */ + return (buf); + } + *buf++ = hexchars[ch >> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (may_fault) + mem_err_expected = 0; + return (buf); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return a pointer to the character AFTER the last byte written */ +/* NOTE: We use the may fault flag to also indicate if the write is to + * the registers (0) or "other" memory (!=0) + */ +char * +hex2mem(char *buf, char *mem, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + ch = hex(*buf++) << 4; + ch = ch + hex(*buf++); + set_char(mem++, ch, may_fault); + + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault storing to addr %lx\n", + (long) (mem - 1)); + return (mem); + } + } + if (may_fault) + mem_err_expected = 0; + return (mem); +} + +/**********************************************/ +/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ +/* RETURN NUMBER OF CHARS PROCESSED */ +/**********************************************/ +int +hexToInt(char **ptr, int *intValue) +{ + int numChars = 0; + int hexValue; + + *intValue = 0; + + while (**ptr) { + hexValue = hex(**ptr); + if (hexValue >= 0) { + *intValue = (*intValue << 4) | hexValue; + numChars++; + } else + break; + + (*ptr)++; + } + + return (numChars); +} + +#define stubhex(h) hex(h) +#ifdef old_thread_list + +static int +stub_unpack_int(char *buff, int fieldlength) +{ + int nibble; + int retval = 0; + + while (fieldlength) { + nibble = stubhex(*buff++); + retval |= nibble; + fieldlength--; + if (fieldlength) + retval = retval << 4; + } + return retval; +} +#endif +static char * +pack_hex_byte(char *pkt, int byte) +{ + *pkt++ = hexchars[(byte >> 4) & 0xf]; + *pkt++ = hexchars[(byte & 0xf)]; + return pkt; +} + +#define BUF_THREAD_ID_SIZE 16 + +static char * +pack_threadid(char *pkt, threadref * id) +{ + char *limit; + unsigned char *altid; + + altid = (unsigned char *) id; + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *altid++); + return pkt; +} + +#ifdef old_thread_list +static char * +unpack_byte(char *buf, int 
*value) +{ + *value = stub_unpack_int(buf, 2); + return buf + 2; +} + +static char * +unpack_threadid(char *inbuf, threadref * id) +{ + char *altref; + char *limit = inbuf + BUF_THREAD_ID_SIZE; + int x, y; + + altref = (char *) id; + + while (inbuf < limit) { + x = stubhex(*inbuf++); + y = stubhex(*inbuf++); + *altref++ = (x << 4) | y; + } + return inbuf; +} +#endif +void +int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; + } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} +int +int_to_hex_v(unsigned char * id, int value) +{ + unsigned char *start = id; + int shift; + int ch; + + for (shift = 28; shift >= 0; shift -= 4) { + if ((ch = (value >> shift) & 0xf) || (id != start)) { + *id = hexchars[ch]; + id++; + } + } + if (id == start) + *id++ = '0'; + return id - start; +} +#ifdef old_thread_list + +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + unsigned char *scan; + + scan = (char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} +#endif +static int +cmp_str(char *s1, char *s2, int count) +{ + while (count--) { + if (*s1++ != *s2++) + return 0; + } + return 1; +} + +#if 1 /* this is a hold over from 2.4 where O(1) was "sometimes" */ +extern struct task_struct *kgdb_get_idle(int cpu); +#define idle_task(cpu) kgdb_get_idle(cpu) +#else +#define idle_task(cpu) init_tasks[cpu] +#endif + +extern int kgdb_pid_init_done; + +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + if (pid >= PID_MAX && pid <= (PID_MAX + MAX_NO_CPUS)) { + + return idle_task(pid - PID_MAX); + } else { + /* + * find_task_by_pid is relatively safe all the time + * Other pid functions require lock downs which imply + * that we may be interrupting them (as we get here + * in the middle of most any lock down). + * Still we don't want to call until the table exists! 
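+		 *
+		 * For example, the thread-id mapping used here: gdb's
+		 * thread 42 is looked up via find_task_by_pid(42),
+		 * while ids of PID_MAX and above name the per-cpu idle
+		 * tasks, so getthread(PID_MAX + 1) returns the idle
+		 * task of cpu 1.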
+ */ + if (kgdb_pid_init_done){ + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } + } + } + return NULL; +} +/* *INDENT-OFF* */ +struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned addr; +} breakinfo[4] = { {enabled:0}, + {enabled:0}, + {enabled:0}, + {enabled:0}}; +/* *INDENT-ON* */ +unsigned hw_breakpoint_status; +void +correct_hw_break(void) +{ + int breakno; + int correctit; + int breakbit; + unsigned dr7; + + asm volatile ("movl %%db7, %0\n":"=r" (dr7) + :); + /* *INDENT-OFF* */ + do { + unsigned addr0, addr1, addr2, addr3; + asm volatile ("movl %%db0, %0\n" + "movl %%db1, %1\n" + "movl %%db2, %2\n" + "movl %%db3, %3\n" + :"=r" (addr0), "=r"(addr1), + "=r"(addr2), "=r"(addr3) + :); + } while (0); + /* *INDENT-ON* */ + correctit = 0; + for (breakno = 0; breakno < 3; breakno++) { + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + dr7 |= (((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << 16) << + (breakno << 2); + switch (breakno) { + case 0: + asm volatile ("movl %0, %%dr0\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 1: + asm volatile ("movl %0, %%dr1\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 2: + asm volatile ("movl %0, %%dr2\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 3: + asm volatile ("movl %0, %%dr3\n"::"r" + (breakinfo[breakno].addr)); + break; + } + } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + } + } + if (correctit) { + asm volatile ("movl %0, %%db7\n"::"r" (dr7)); + } +} + +int +remove_hw_break(unsigned breakno) +{ + if (!breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 0; + return 0; +} + +int +set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr) +{ + if (breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 1; + breakinfo[breakno].type = type; + breakinfo[breakno].len = len; + breakinfo[breakno].addr = addr; + return 0; +} + +#ifdef CONFIG_SMP +static int in_kgdb_console = 0; + +int +in_kgdb(struct pt_regs *regs) +{ + unsigned flags; + int cpu = smp_processor_id(); + in_kgdb_called = 1; + if (!spin_is_locked(&kgdb_spinlock)) { + if (in_kgdb_here_log[cpu] || /* we are holding this cpu */ + in_kgdb_console) { /* or we are doing slow i/o */ + return 1; + } + return 0; + } + + /* As I see it the only reason not to let all cpus spin on + * the same spin_lock is to allow selected ones to proceed. + * This would be a good thing, so we leave it this way. + * Maybe someday.... Done ! + + * in_kgdb() is called from an NMI so we don't pretend + * to have any resources, like printk() for example. + */ + + kgdb_local_irq_save(flags); /* only local here, to avoid hanging */ + /* + * log arival of this cpu + * The NMI keeps on ticking. Protect against recurring more + * than once, and ignor the cpu that has the kgdb lock + */ + in_kgdb_entry_log[cpu]++; + in_kgdb_here_log[cpu] = regs; + if (cpu == spinlock_cpu || waiting_cpus[cpu].task) { + goto exit_in_kgdb; + } + /* + * For protection of the initilization of the spin locks by kgdb + * it locks the kgdb spinlock before it gets the wait locks set + * up. We wait here for the wait lock to be taken. If the + * kgdb lock goes away first?? 
Well, it could be a slow exit + * sequence where the wait lock is removed prior to the kgdb lock + * so if kgdb gets unlocked, we just exit. + */ + while (spin_is_locked(&kgdb_spinlock) && + !spin_is_locked(waitlocks + cpu)) ; + if (!spin_is_locked(&kgdb_spinlock)) { + goto exit_in_kgdb; + } + waiting_cpus[cpu].task = current; + waiting_cpus[cpu].pid = (current->pid) ? : (PID_MAX + cpu); + waiting_cpus[cpu].regs = regs; + + spin_unlock_wait(waitlocks + cpu); + /* + * log departure of this cpu + */ + waiting_cpus[cpu].task = 0; + waiting_cpus[cpu].pid = 0; + waiting_cpus[cpu].regs = 0; + correct_hw_break(); + exit_in_kgdb: + in_kgdb_here_log[cpu] = 0; + kgdb_local_irq_restore(flags); + return 1; + /* + spin_unlock(continuelocks + smp_processor_id()); + */ +} + +void +smp__in_kgdb(struct pt_regs regs) +{ + ack_APIC_irq(); + in_kgdb(®s); +} +#else +int +in_kgdb(struct pt_regs *regs) +{ + return (kgdb_spinlock); +} +#endif + +void +printexceptioninfo(int exceptionNo, int errorcode, char *buffer) +{ + unsigned dr6; + int i; + switch (exceptionNo) { + case 1: /* debug exception */ + break; + case 3: /* breakpoint */ + sprintf(buffer, "Software breakpoint"); + return; + default: + sprintf(buffer, "Details not available"); + return; + } + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (dr6 & 0x4000) { + sprintf(buffer, "Single step"); + return; + } + for (i = 0; i < 4; ++i) { + if (dr6 & (1 << i)) { + sprintf(buffer, "Hardware breakpoint %d", i); + return; + } + } + sprintf(buffer, "Unknown trap"); + return; +} + +/* + * This function does all command procesing for interfacing to gdb. + * + * NOTE: The INT nn instruction leaves the state of the interrupt + * enable flag UNCHANGED. That means that when this routine + * is entered via a breakpoint (INT 3) instruction from code + * that has interrupts enabled, then interrupts will STILL BE + * enabled when this routine is entered. The first thing that + * we do here is disable interrupts so as to prevent recursive + * entries and bothersome serial interrupts while we are + * trying to run the serial port in polled mode. + * + * For kernel version 2.1.xx the kgdb_cli() actually gets a spin lock so + * it is always necessary to do a restore_flags before returning + * so as to let go of that lock. + */ +int +kgdb_handle_exception(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs) +{ + struct task_struct *usethread = NULL; + struct task_struct *thread_list_start = 0, *thread = NULL; + int addr, length; + int breakno, breaktype; + char *ptr; + int newPC; + threadref thref; + int threadid; + int thread_min = PID_MAX + MAX_NO_CPUS; +#ifdef old_thread_list + int maxthreads; +#endif + int nothreads; + unsigned long flags; + int gdb_regs[NUMREGBYTES / 4]; + int dr6; + IF_SMP(int entry_state = 0); /* 0, ok, 1, no nmi, 2 sync failed */ +#define NO_NMI 1 +#define NO_SYNC 2 +#define regs (*linux_regs) +#define NUMREGS NUMREGBYTES/4 + /* + * If the entry is not from the kernel then return to the Linux + * trap handler and let it process the interrupt normally. + */ + if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->xcs)) { + printk("ignoring non-kernel exception\n"); + print_regs(®s); + return (0); + } + + kgdb_local_irq_save(flags); + + /* Get kgdb spinlock */ + + KGDB_SPIN_LOCK(&kgdb_spinlock); + rdtscll(kgdb_info.entry_tsc); + /* + * We depend on this spinlock and the NMI watch dog to control the + * other cpus. 
They will arrive at "in_kgdb()" as a result of the + * NMI and will wait there for the following spin locks to be + * released. + */ +#ifdef CONFIG_SMP + +#if 0 + if (cpu_callout_map & ~MAX_CPU_MASK) { + printk("kgdb : too many cpus, possibly not mapped" + " in contiguous space, change MAX_NO_CPUS" + " in kgdb_stub and make new kernel.\n" + " cpu_callout_map is %lx\n", cpu_callout_map); + goto exit_just_unlock; + } +#endif + if (spinlock_count == 1) { + int time, end_time, dum; + int i; + int cpu_logged_in[MAX_NO_CPUS] = {[0 ... MAX_NO_CPUS - 1] = (0) + }; + if (remote_debug) { + printk("kgdb : cpu %d entry, syncing others\n", + smp_processor_id()); + } + for (i = 0; i < MAX_NO_CPUS; i++) { + /* + * Use trylock as we may already hold the lock if + * we are holding the cpu. Net result is all + * locked. + */ + spin_trylock(&waitlocks[i]); + } + for (i = 0; i < MAX_NO_CPUS; i++) + cpu_logged_in[i] = 0; + /* + * Wait for their arrival. We know the watch dog is active if + * in_kgdb() has ever been called, as it is always called on a + * watchdog tick. + */ + rdtsc(dum, time); + end_time = time + 2; /* Note: we use the High order bits! */ + i = 1; + if (num_online_cpus() > 1) { + int me_in_kgdb = in_kgdb_entry_log[smp_processor_id()]; + smp_send_nmi_allbutself(); + while (i < num_online_cpus() && time != end_time) { + int j; + for (j = 0; j < MAX_NO_CPUS; j++) { + if (waiting_cpus[j].task && + !cpu_logged_in[j]) { + i++; + cpu_logged_in[j] = 1; + if (remote_debug) { + printk + ("kgdb : cpu %d arrived at kgdb\n", + j); + } + break; + } else if (!waiting_cpus[j].task && + !cpu_online(j)) { + waiting_cpus[j].task = NOCPU; + cpu_logged_in[j] = 1; + waiting_cpus[j].hold = 1; + break; + } + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + + int wait = 100000; + while (wait--) ; + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + printk + ("kgdb : cpu %d stall" + " in in_kgdb\n", + j); + i++; + cpu_logged_in[j] = 1; + waiting_cpus[j].task = + (struct task_struct + *) 1; + } + } + } + + if (in_kgdb_entry_log[smp_processor_id()] > + (me_in_kgdb + 10)) { + break; + } + + rdtsc(dum, time); + } + if (i < num_online_cpus()) { + printk + ("kgdb : time out, proceeding without sync\n"); +#if 0 + printk("kgdb : Waiting_cpus: 0 = %d, 1 = %d\n", + waiting_cpus[0].task != 0, + waiting_cpus[1].task != 0); + printk("kgdb : Cpu_logged in: 0 = %d, 1 = %d\n", + cpu_logged_in[0], cpu_logged_in[1]); + printk + ("kgdb : in_kgdb_here_log in: 0 = %d, 1 = %d\n", + in_kgdb_here_log[0] != 0, + in_kgdb_here_log[1] != 0); +#endif + entry_state = NO_SYNC; + } else { +#if 0 + int ent = + in_kgdb_entry_log[smp_processor_id()] - + me_in_kgdb; + printk("kgdb : sync after %d entries\n", ent); +#endif + } + } else { + if (remote_debug) { + printk + ("kgdb : %d cpus, but watchdog not active\n" + "proceeding without locking down other cpus\n", + num_online_cpus()); + entry_state = NO_NMI; + } + } + } +#endif + + if (remote_debug) { + unsigned long *lp = (unsigned long *) &linux_regs; + + printk("handle_exception(exceptionVector=%d, " + "signo=%d, err_code=%d, linux_regs=%p)\n", + exceptionVector, signo, err_code, linux_regs); + if (debug_regs) { + print_regs(®s); + printk("Stk: %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[0], lp[1], lp[2], lp[3], + lp[4], lp[5], lp[6], lp[7]); + printk(" %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[8], lp[9], lp[10], lp[11], + lp[12], lp[13], lp[14], lp[15]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[16], lp[17], lp[18], lp[19], + lp[20], lp[21], 
lp[22], lp[23]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[24], lp[25], lp[26], lp[27], + lp[28], lp[29], lp[30], lp[31]); + } + } + + /* Disable hardware debugging while we are in kgdb */ + /* Get the debug register status register */ +/* *INDENT-OFF* */ + __asm__("movl %0,%%db7" + : /* no output */ + :"r"(0)); + + asm volatile ("movl %%db6, %0\n" + :"=r" (hw_breakpoint_status) + :); + +/* *INDENT-ON* */ + switch (exceptionVector) { + case 0: /* divide error */ + case 1: /* debug exception */ + case 2: /* NMI */ + case 3: /* breakpoint */ + case 4: /* overflow */ + case 5: /* bounds check */ + case 6: /* invalid opcode */ + case 7: /* device not available */ + case 8: /* double fault (errcode) */ + case 10: /* invalid TSS (errcode) */ + case 12: /* stack fault (errcode) */ + case 16: /* floating point error */ + case 17: /* alignment check (errcode) */ + default: /* any undocumented */ + break; + case 11: /* segment not present (errcode) */ + case 13: /* general protection (errcode) */ + case 14: /* page fault (special errcode) */ + case 19: /* cache flush denied */ + if (mem_err_expected) { + /* + * This fault occured because of the + * get_char or set_char routines. These + * two routines use either eax of edx to + * indirectly reference the location in + * memory that they are working with. + * For a page fault, when we return the + * instruction will be retried, so we + * have to make sure that these + * registers point to valid memory. + */ + mem_err = 1; /* set mem error flag */ + mem_err_expected = 0; + mem_err_cnt++; /* helps in debugging */ + /* make valid address */ + regs.eax = (long) &garbage_loc; + /* make valid address */ + regs.edx = (long) &garbage_loc; + if (remote_debug) + printk("Return after memory error: " + "mem_err_cnt=%d\n", mem_err_cnt); + if (debug_regs) + print_regs(®s); + goto exit_kgdb; + } + break; + } + if (remote_debug) + printk("kgdb : entered kgdb on cpu %d\n", smp_processor_id()); + + gdb_i386vector = exceptionVector; + gdb_i386errcode = err_code; + kgdb_info.called_from = __builtin_return_address(0); +#ifdef CONFIG_SMP + /* + * OK, we can now communicate, lets tell gdb about the sync. + * but only if we had a problem. + */ + switch (entry_state) { + case NO_NMI: + to_gdb("NMI not active, other cpus not stopped\n"); + break; + case NO_SYNC: + to_gdb("Some cpus not stopped, see 'kgdb_info' for details\n"); + default:; + } + +#endif +/* + * Set up the gdb function call area. + */ + trap_cpu = smp_processor_id(); + OLD_esp = NEW_esp = (int) (&linux_regs->esp); + + IF_SMP(once_again:) + /* reply to host that an exception has occurred */ + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + + putpacket(remcomOutBuffer); + + while (1 == 1) { + error = 0; + remcomOutBuffer[0] = 0; + getpacket(remcomInBuffer); + switch (remcomInBuffer[0]) { + case '?': + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + break; + case 'd': + remote_debug = !(remote_debug); /* toggle debug flag */ + printk("Remote debug %s\n", + remote_debug ? 
"on" : "off"); + break; + case 'g': /* return the value of the CPU registers */ + get_gdb_regs(usethread, ®s, gdb_regs); + mem2hex((char *) gdb_regs, + remcomOutBuffer, NUMREGBYTES, 0); + break; + case 'G': /* set the value of the CPU registers - return OK */ + hex2mem(&remcomInBuffer[1], + (char *) gdb_regs, NUMREGBYTES, 0); + if (!usethread || usethread == current) { + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "E00"); + } + break; + + case 'P':{ /* set the value of a single CPU register - + return OK */ + /* + * For some reason, gdb wants to talk about psudo + * registers (greater than 15). These may have + * meaning for ptrace, but for us it is safe to + * ignor them. We do this by dumping them into + * _GS which we also ignor, but do have memory for. + */ + int regno; + + ptr = &remcomInBuffer[1]; + regs_to_gdb_regs(gdb_regs, ®s); + if ((!usethread || usethread == current) && + hexToInt(&ptr, ®no) && + *ptr++ == '=' && (regno >= 0)) { + regno = + (regno >= NUMREGS ? _GS : regno); + hex2mem(ptr, (char *) &gdb_regs[regno], + 4, 0); + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + break; + } + strcpy(remcomOutBuffer, "E01"); + break; + } + + /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + case 'm': + /* TRY TO READ %x,%x. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr) && + (*(ptr++) == ',') && (hexToInt(&ptr, &length))) { + ptr = 0; + /* + * hex doubles the byte count + */ + if (length > (BUFMAX / 2)) + length = BUFMAX / 2; + mem2hex((char *) addr, + remcomOutBuffer, length, 1); + if (mem_err) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } + } + + if (ptr) { + strcpy(remcomOutBuffer, "E01"); + debug_error + ("malformed read memory command: %s\n", + remcomInBuffer); + } + break; + + /* MAA..AA,LLLL: + Write LLLL bytes at address AA.AA return OK */ + case 'M': + /* TRY TO READ '%x,%x:'. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr) && + (*(ptr++) == ',') && + (hexToInt(&ptr, &length)) && (*(ptr++) == ':')) { + hex2mem(ptr, (char *) addr, length, 1); + + if (mem_err) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } else { + strcpy(remcomOutBuffer, "OK"); + } + + ptr = 0; + } + if (ptr) { + strcpy(remcomOutBuffer, "E02"); + debug_error + ("malformed write memory command: %s\n", + remcomInBuffer); + } + break; + case 'S': + remcomInBuffer[0] = 's'; + case 'C': + /* Csig;AA..AA where ;AA..AA is optional + * continue with signal + * Since signals are meaning less to us, delete that + * part and then fall into the 'c' code. + */ + ptr = &remcomInBuffer[1]; + length = 2; + while (*ptr && *ptr != ';') { + length++; + ptr++; + } + if (*ptr) { + do { + ptr++; + *(ptr - length++) = *ptr; + } while (*ptr); + } else { + remcomInBuffer[1] = 0; + } + + /* cAA..AA Continue at address AA..AA(optional) */ + /* sAA..AA Step one instruction from AA..AA(optional) */ + /* D detach, reply OK and then continue */ + case 'c': + case 's': + case 'D': + + /* try to read optional parameter, + pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) { + if (remote_debug) + printk("Changing EIP to 0x%x\n", addr); + + regs.eip = addr; + } + + newPC = regs.eip; + + /* clear the trace bit */ + regs.eflags &= 0xfffffeff; + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') + regs.eflags |= 0x100; + + /* detach is a friendly version of continue. 
Note that + debugging is still enabled (e.g hit control C) + */ + if (remcomInBuffer[0] == 'D') { + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + } + + if (remote_debug) { + printk("Resuming execution\n"); + print_regs(®s); + } + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (!(dr6 & 0x4000)) { + for (breakno = 0; breakno < 4; ++breakno) { + if (dr6 & (1 << breakno) && + (breakinfo[breakno].type == 0)) { + /* Set restore flag */ + regs.eflags |= 0x10000; + break; + } + } + } + correct_hw_break(); + asm volatile ("movl %0, %%db6\n"::"r" (0)); + goto exit_kgdb; + + /* kill the program */ + case 'k': /* do nothing */ + break; + + /* query */ + case 'q': + nothreads = 0; + switch (remcomInBuffer[1]) { + case 'f': + threadid = 1; + thread_list = 2; + thread_list_start = (usethread ? : current); + case 's': + if (!cmp_str(&remcomInBuffer[2], + "ThreadInfo", 10)) + break; + + remcomOutBuffer[nothreads++] = 'm'; + for (; threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + nothreads += int_to_hex_v( + &remcomOutBuffer[ + nothreads], + threadid); + if (thread_min > threadid) + thread_min = threadid; + remcomOutBuffer[ + nothreads] = ','; + nothreads++; + if (nothreads > BUFMAX - 10) + break; + } + } + if (remcomOutBuffer[nothreads - 1] == 'm') { + remcomOutBuffer[nothreads - 1] = 'l'; + } else { + nothreads--; + } + remcomOutBuffer[nothreads] = 0; + break; + +#ifdef old_thread_list /* Old thread info request */ + case 'L': + /* List threads */ + thread_list = 2; + thread_list_start = (usethread ? : current); + unpack_byte(remcomInBuffer + 3, &maxthreads); + unpack_threadid(remcomInBuffer + 5, &thref); + do { + int buf_thread_limit = + (BUFMAX - 22) / BUF_THREAD_ID_SIZE; + if (maxthreads > buf_thread_limit) { + maxthreads = buf_thread_limit; + } + } while (0); + remcomOutBuffer[0] = 'q'; + remcomOutBuffer[1] = 'M'; + remcomOutBuffer[4] = '0'; + pack_threadid(remcomOutBuffer + 5, &thref); + + threadid = threadref_to_int(&thref); + for (nothreads = 0; + nothreads < maxthreads && + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + int_to_threadref(&thref, + threadid); + pack_threadid(remcomOutBuffer + + 21 + + nothreads * 16, + &thref); + nothreads++; + if (thread_min > threadid) + thread_min = threadid; + } + } + + if (threadid == PID_MAX + MAX_NO_CPUS) { + remcomOutBuffer[4] = '1'; + } + pack_hex_byte(remcomOutBuffer + 2, nothreads); + remcomOutBuffer[21 + nothreads * 16] = '\0'; + break; +#endif + case 'C': + /* Current thread id */ + remcomOutBuffer[0] = 'Q'; + remcomOutBuffer[1] = 'C'; + threadid = current->pid; + if (!threadid) { + /* + * idle thread + */ + for (threadid = PID_MAX; + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + if (current == + idle_task(threadid - + PID_MAX)) + break; + } + } + int_to_threadref(&thref, threadid); + pack_threadid(remcomOutBuffer + 2, &thref); + remcomOutBuffer[18] = '\0'; + break; + + case 'E': + /* Print exception info */ + printexceptioninfo(exceptionVector, + err_code, remcomOutBuffer); + break; + case 'T':{ + char * nptr; + /* Thread extra info */ + if (!cmp_str(&remcomInBuffer[2], + "hreadExtraInfo,", 15)) { + break; + } + ptr = &remcomInBuffer[17]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + nptr = &thread->comm[0]; + length = 0; + ptr = &remcomOutBuffer[0]; + do { + length++; + ptr = pack_hex_byte(ptr, *nptr++); + } while (*nptr && length < 16); + /* + * would like that 16 to be the size of + * 
task_struct.comm but don't know the + * syntax.. + */ + *ptr = 0; + } + } + break; + + /* task related */ + case 'H': + switch (remcomInBuffer[1]) { + case 'g': + ptr = &remcomInBuffer[2]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (!thread) { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + break; + } + /* + * Just in case I forget what this is all about, + * the "thread info" command to gdb causes it + * to ask for a thread list. It then switches + * to each thread and asks for the registers. + * For this (and only this) usage, we want to + * fudge the registers of tasks not on the run + * list (i.e. waiting) to show the routine that + * called schedule. Also, gdb, is a minimalist + * in that if the current thread is the last + * it will not re-read the info when done. + * This means that in this case we must show + * the real registers. So here is how we do it: + * Each entry we keep track of the min + * thread in the list (the last that gdb will) + * get info for. We also keep track of the + * starting thread. + * "thread_list" is cleared when switching back + * to the min thread if it is was current, or + * if it was not current, thread_list is set + * to 1. When the switch to current comes, + * if thread_list is 1, clear it, else do + * nothing. + */ + usethread = thread; + if ((thread_list == 1) && + (thread == thread_list_start)) { + thread_list = 0; + } + if (thread_list && (threadid == thread_min)) { + if (thread == thread_list_start) { + thread_list = 0; + } else { + thread_list = 1; + } + } + /* follow through */ + case 'c': + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + break; + } + break; + + /* Query thread status */ + case 'T': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (thread) { + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + if (thread_min > threadid) + thread_min = threadid; + } else { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + } + break; + + case 'Y': /* set up a hardware breakpoint */ + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + ptr++; + hexToInt(&ptr, &breaktype); + ptr++; + hexToInt(&ptr, &length); + ptr++; + hexToInt(&ptr, &addr); + if (set_hw_break(breakno & 0x3, + breaktype & 0x3, + length & 0x3, addr) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + /* Remove hardware breakpoint */ + case 'y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + if (remove_hw_break(breakno & 0x3) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + case 'r': /* reboot */ + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + /*to_gdb("Rebooting\n"); */ + /* triplefault no return from here */ + { + static long no_idt[2]; + __asm__ __volatile__("lidt %0"::"m"(no_idt[0])); + BREAKPOINT; + } + + } /* switch */ + + /* reply to the request */ + putpacket(remcomOutBuffer); + } /* while(1==1) */ + /* + * reached by goto only. + */ + exit_kgdb: + /* + * Here is where we set up to trap a gdb function call. NEW_esp + * will be changed if we are trying to do this. We handle both + * adding and subtracting, thus allowing gdb to put grung on + * the stack which it removes later. 
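+	 *
+	 * A sketch of the sequence, assuming the user types "call
+	 * foo(0)" in gdb: gdb stores the argument and a return address
+	 * below the current stack pointer, so NEW_esp drops under
+	 * OLD_esp.  The code below then stages six words (eflags, cs,
+	 * eip, ecx, ebx, eax) in the lookaside array, points EBX and
+	 * ECX at the range to copy, clears IF and TF, and redirects EIP
+	 * to fn_call_stub, which pushes the words onto the real stack
+	 * and irets into foo.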
+ */ + if (NEW_esp != OLD_esp) { + int *ptr = END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) + ptr -= (OLD_esp - NEW_esp) / sizeof (int); + *--ptr = linux_regs->eflags; + *--ptr = linux_regs->xcs; + *--ptr = linux_regs->eip; + *--ptr = linux_regs->ecx; + *--ptr = linux_regs->ebx; + *--ptr = linux_regs->eax; + linux_regs->ecx = NEW_esp - (sizeof (int) * 6); + linux_regs->ebx = (unsigned int) END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) { + linux_regs->eip = (unsigned int) fn_call_stub; + } else { + linux_regs->eip = (unsigned int) fn_rtn_stub; + linux_regs->eax = NEW_esp; + } + linux_regs->eflags &= ~(IF_BIT | TF_BIT); + } +#ifdef CONFIG_SMP + /* + * Release gdb wait locks + * Sanity check time. Must have at least one cpu to run. Also single + * step must not be done if the current cpu is on hold. + */ + if (spinlock_count == 1) { + int ss_hold = (regs.eflags & 0x100) && kgdb_info.hold_on_sstep; + int cpu_avail = 0; + int i; + + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!cpu_online(i)) + break; + if (!hold_cpu(i)) { + cpu_avail = 1; + } + } + /* + * Early in the bring up there will be NO cpus on line... + */ + if (!cpu_avail && !cpus_empty(cpu_online_map)) { + to_gdb("No cpus unblocked, see 'kgdb_info.hold_cpu'\n"); + goto once_again; + } + if (hold_cpu(smp_processor_id()) && (regs.eflags & 0x100)) { + to_gdb + ("Current cpu must be unblocked to single step\n"); + goto once_again; + } + if (!(ss_hold)) { + int i; + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!hold_cpu(i)) { + spin_unlock(&waitlocks[i]); + } + } + } else { + spin_unlock(&waitlocks[smp_processor_id()]); + } + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + /* + * If this cpu is on hold, this is where we + * do it. Note, the NMI will pull us out of here, + * but will return as the above lock is not held. + * We will stay here till another cpu releases the lock for us. + */ + spin_unlock_wait(waitlocks + smp_processor_id()); + kgdb_local_irq_restore(flags); + return (0); + } +#if 0 +exit_just_unlock: +#endif +#endif + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + kgdb_local_irq_restore(flags); + return (0); +} + +/* this function is used to set up exception handlers for tracing and + * breakpoints. + * This function is not needed as the above line does all that is needed. + * We leave it for backward compatitability... + */ +void +set_debug_traps(void) +{ + /* + * linux_debug_hook is defined in traps.c. We store a pointer + * to our own exception handler into it. + + * But really folks, every hear of labeled common, an old Fortran + * concept. Lots of folks can reference it and it is define if + * anyone does. Only one can initialize it at link time. We do + * this with the hook. See the statement above. No need for any + * executable code and it is ready as soon as the kernel is + * loaded. Very desirable in kernel debugging. + + linux_debug_hook = handle_exception ; + */ + + /* In case GDB is started before us, ack any packets (presumably + "$?#xx") sitting there. + putDebugChar ('+'); + + initialized = 1; + */ +} + +/* This function will generate a breakpoint exception. It is used at the + beginning of a program to sync up with a debugger and can be used + otherwise as a quick means to stop program execution and "break" into + the debugger. */ +/* But really, just use the BREAKPOINT macro. We will handle the int stuff + */ + +#ifdef later +/* + * possibly we should not go thru the traps.c code at all? Someday. 
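+ *
+ * For orientation, the vector-to-signal convention these handlers
+ * follow, sketched as a helper (the function itself is illustrative,
+ * but the cases shown match calls made elsewhere in this patch):
+ *
+ *	static int exception_to_signal(int vec)
+ *	{
+ *		switch (vec) {
+ *		case 1:			(debug trap)
+ *		case 3:			(int3 breakpoint)
+ *			return 5;	(SIGTRAP)
+ *		case 13:		(general protection)
+ *			return 11;	(SIGSEGV, as traps.c passes)
+ *		case 14:		(page fault)
+ *			return 7;	(SIGBUS, as mm/fault.c passes)
+ *		default:
+ *			return 7;
+ *		}
+ *	}
+ *
+ * hence the hard-coded (3, 5, ...) in do_kgdb_int3() below.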
+ */ +void +do_kgdb_int3(struct pt_regs *regs, long error_code) +{ + kgdb_handle_exception(3, 5, error_code, regs); + return; +} +#endif +#undef regs +#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS +asmlinkage void +bad_sys_call_exit(int stuff) +{ + struct pt_regs *regs = (struct pt_regs *) &stuff; + printk("Sys call %d return with %x preempt_count\n", + (int) regs->orig_eax, preempt_count()); +} +#endif +#ifdef CONFIG_STACK_OVERFLOW_TEST +#include +asmlinkage void +stack_overflow(void) +{ +#ifdef BREAKPOINT + BREAKPOINT; +#else + printk("Kernel stack overflow, looping forever\n"); +#endif + while (1) { + } +} +#endif + +#if defined(CONFIG_SMP) || defined(CONFIG_KGDB_CONSOLE) +char gdbconbuf[BUFMAX]; + +static void +kgdb_gdb_message(const char *s, unsigned count) +{ + int i; + int wcount; + char *bufptr; + /* + * This takes care of NMI while spining out chars to gdb + */ + IF_SMP(in_kgdb_console = 1); + gdbconbuf[0] = 'O'; + bufptr = gdbconbuf + 1; + while (count > 0) { + if ((count << 1) > (BUFMAX - 2)) { + wcount = (BUFMAX - 2) >> 1; + } else { + wcount = count; + } + count -= wcount; + for (i = 0; i < wcount; i++) { + bufptr = pack_hex_byte(bufptr, s[i]); + } + *bufptr = '\0'; + s += wcount; + + putpacket(gdbconbuf); + + } + IF_SMP(in_kgdb_console = 0); +} +#endif +#ifdef CONFIG_SMP +static void +to_gdb(const char *s) +{ + int count = 0; + while (s[count] && (count++ < BUFMAX)) ; + kgdb_gdb_message(s, count); +} +#endif +#ifdef CONFIG_KGDB_CONSOLE +#include +#include +#include +#include +#include + +void +kgdb_console_write(struct console *co, const char *s, unsigned count) +{ + + if (gdb_i386vector == -1) { + /* + * We have not yet talked to gdb. What to do... + * lets break, on continue we can do the write. + * But first tell him whats up. Uh, well no can do, + * as this IS the console. Oh well... + * We do need to wait or the messages will be lost. + * Other option would be to tell the above code to + * ignore this breakpoint and do an auto return, + * but that might confuse gdb. Also this happens + * early enough in boot up that we don't have the traps + * set up yet, so... + */ + breakpoint(); + } + kgdb_gdb_message(s, count); +} + +/* + * ------------------------------------------------------------ + * Serial KGDB driver + * ------------------------------------------------------------ + */ + +static struct console kgdbcons = { + name:"kgdb", + write:kgdb_console_write, +#ifdef CONFIG_KGDB_USER_CONSOLE + device:kgdb_console_device, +#endif + flags:CON_PRINTBUFFER | CON_ENABLED, + index:-1, +}; + +/* + * The trick here is that this file gets linked before printk.o + * That means we get to peer at the console info in the command + * line before it does. If we are up, we register, otherwise, + * do nothing. By returning 0, we allow printk to look also. + */ +static int kgdb_console_enabled; + +int __init +kgdb_console_init(char *str) +{ + if ((strncmp(str, "kgdb", 4) == 0) || (strncmp(str, "gdb", 3) == 0)) { + register_console(&kgdbcons); + kgdb_console_enabled = 1; + } + return 0; /* let others look at the string */ +} + +__setup("console=", kgdb_console_init); + +#ifdef CONFIG_KGDB_USER_CONSOLE +static kdev_t kgdb_console_device(struct console *c); +/* This stuff sort of works, but it knocks out telnet devices + * we are leaving it here in case we (or you) find time to figure it out + * better.. + */ + +/* + * We need a real char device as well for when the console is opened for user + * space activities. 
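+ *
+ * Data written through that device funnels into kgdb_console_write()
+ * and then kgdb_gdb_message() above, which hexifies each chunk into a
+ * gdb 'O' packet. The encoding in miniature (a sketch for count <= 80;
+ * packet framing, checksum and BUFMAX chunking omitted):
+ *
+ *	static const char hexchars[] = "0123456789abcdef";
+ *	char out[2 * 80 + 2], *p = out;
+ *	*p++ = 'O';
+ *	while (count--) {
+ *		*p++ = hexchars[(*s >> 4) & 0xf];
+ *		*p++ = hexchars[*s++ & 0xf];
+ *	}
+ *	*p = '\0';		("hi\n" encodes as "O68690a")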
+ */ + +static int +kgdb_consdev_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static ssize_t +kgdb_consdev_write(struct file *file, const char *buf, + size_t count, loff_t * ppos) +{ + int size, ret = 0; + static char kbuf[128]; + static DECLARE_MUTEX(sem); + + /* We are not reentrant... */ + if (down_interruptible(&sem)) + return -ERESTARTSYS; + + while (count > 0) { + /* need to copy the data from user space */ + size = count; + if (size > sizeof (kbuf)) + size = sizeof (kbuf); + if (copy_from_user(kbuf, buf, size)) { + ret = -EFAULT; + break;; + } + kgdb_console_write(&kgdbcons, kbuf, size); + count -= size; + ret += size; + buf += size; + } + + up(&sem); + + return ret; +} + +struct file_operations kgdb_consdev_fops = { + open:kgdb_consdev_open, + write:kgdb_consdev_write +}; +static kdev_t +kgdb_console_device(struct console *c) +{ + return MKDEV(TTYAUX_MAJOR, 1); +} + +/* + * This routine gets called from the serial stub in the i386/lib + * This is so it is done late in bring up (just before the console open). + */ +void +kgdb_console_finit(void) +{ + if (kgdb_console_enabled) { + char *cptr = cdevname(MKDEV(TTYAUX_MAJOR, 1)); + char *cp = cptr; + while (*cptr && *cptr != '(') + cptr++; + *cptr = 0; + unregister_chrdev(TTYAUX_MAJOR, cp); + register_chrdev(TTYAUX_MAJOR, "kgdb", &kgdb_consdev_fops); + } +} +#endif +#endif +#ifdef CONFIG_KGDB_TS +#include /* time stamp code */ +#include /* in_interrupt */ +#ifdef CONFIG_KGDB_TS_64 +#define DATA_POINTS 64 +#endif +#ifdef CONFIG_KGDB_TS_128 +#define DATA_POINTS 128 +#endif +#ifdef CONFIG_KGDB_TS_256 +#define DATA_POINTS 256 +#endif +#ifdef CONFIG_KGDB_TS_512 +#define DATA_POINTS 512 +#endif +#ifdef CONFIG_KGDB_TS_1024 +#define DATA_POINTS 1024 +#endif +#ifndef DATA_POINTS +#define DATA_POINTS 128 /* must be a power of two */ +#endif +#define INDEX_MASK (DATA_POINTS - 1) +#if (INDEX_MASK & DATA_POINTS) +#error "CONFIG_KGDB_TS_COUNT must be a power of 2" +#endif +struct kgdb_and_then_struct { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + int data0; + int data1; +}; +struct kgdb_and_then_struct2 { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + struct task_struct *t1; + struct task_struct *t2; +}; +struct kgdb_and_then_struct kgdb_data[DATA_POINTS]; + +struct kgdb_and_then_struct *kgdb_and_then = &kgdb_data[0]; +int kgdb_and_then_count; + +void +kgdb_tstamp(int line, char *source, int data0, int data1) +{ + static spinlock_t ts_spin = SPIN_LOCK_UNLOCKED; + int flags; + kgdb_local_irq_save(flags); + spin_lock(&ts_spin); + rdtscll(kgdb_and_then->at_time); +#ifdef CONFIG_SMP + kgdb_and_then->on_cpu = smp_processor_id(); +#endif + kgdb_and_then->task = current; + kgdb_and_then->from_ln = line; + kgdb_and_then->in_src = source; + kgdb_and_then->from = __builtin_return_address(0); + kgdb_and_then->with_shpf = (int *) (((flags & IF_BIT) >> 9) | + (preempt_count() << 8)); + kgdb_and_then->data0 = data0; + kgdb_and_then->data1 = data1; + kgdb_and_then = &kgdb_data[++kgdb_and_then_count & INDEX_MASK]; + spin_unlock(&ts_spin); + kgdb_local_irq_restore(flags); +#ifdef CONFIG_PREEMPT + +#endif + return; +} +#endif +typedef int gdb_debug_hook(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs); +gdb_debug_hook *linux_debug_hook = &kgdb_handle_exception; /* histerical reasons... 
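+ *
+ * This is the labeled-common idea described in set_debug_traps() above:
+ * the stub initializes the pointer at link time and the trap code simply
+ * calls through it. The pattern in miniature (take_trap() is an invented
+ * stand-in for the hook's caller in traps.c):
+ *
+ *	gdb_debug_hook *linux_debug_hook = &kgdb_handle_exception;
+ *
+ *	int take_trap(int vec, int signo, int err, struct pt_regs *regs)
+ *	{
+ *		if (linux_debug_hook)
+ *			return linux_debug_hook(vec, signo, err, regs);
+ *		return 0;
+ *	}
+ *
+ * No runtime registration is needed; the hook is live as soon as the
+ * kernel image is loaded.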
*/ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/i386/kernel/Makefile xx/arch/i386/kernel/Makefile --- 2.6.5-rc2/arch/i386/kernel/Makefile 2004-03-21 15:09:24.000000000 +0100 +++ xx/arch/i386/kernel/Makefile 2004-03-21 15:21:40.000000000 +0100 @@ -14,6 +14,7 @@ obj-y += timers/ obj-$(CONFIG_ACPI_BOOT) += acpi/ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca.o +obj-$(CONFIG_KGDB) += kgdb_stub.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/i386/kernel/nmi.c xx/arch/i386/kernel/nmi.c --- 2.6.5-rc2/arch/i386/kernel/nmi.c 2004-03-11 08:27:22.000000000 +0100 +++ xx/arch/i386/kernel/nmi.c 2004-03-21 15:21:40.000000000 +0100 @@ -31,7 +31,16 @@ #include #include +#ifdef CONFIG_KGDB +#include +#ifdef CONFIG_SMP +unsigned int nmi_watchdog = NMI_IO_APIC; +#else +unsigned int nmi_watchdog = NMI_LOCAL_APIC; +#endif +#else unsigned int nmi_watchdog = NMI_NONE; +#endif static unsigned int nmi_hz = HZ; unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ extern void show_registers(struct pt_regs *regs); @@ -408,6 +417,9 @@ void touch_nmi_watchdog (void) for (i = 0; i < NR_CPUS; i++) alert_counter[i] = 0; } +#ifdef CONFIG_KGDB +int tune_watchdog = 5*HZ; +#endif void nmi_watchdog_tick (struct pt_regs * regs) { @@ -421,12 +433,24 @@ void nmi_watchdog_tick (struct pt_regs * sum = irq_stat[cpu].apic_timer_irqs; +#ifdef CONFIG_KGDB + if (! in_kgdb(regs) && last_irq_sums[cpu] == sum ) { + +#else if (last_irq_sums[cpu] == sum) { +#endif /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ alert_counter[cpu]++; +#ifdef CONFIG_KGDB + if (alert_counter[cpu] == tune_watchdog) { + kgdb_handle_exception(2, SIGPWR, 0, regs); + last_irq_sums[cpu] = sum; + alert_counter[cpu] = 0; + } +#endif if (alert_counter[cpu] == 5*nmi_hz) { spin_lock(&nmi_print_lock); /* diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/i386/kernel/smp.c xx/arch/i386/kernel/smp.c --- 2.6.5-rc2/arch/i386/kernel/smp.c 2004-03-11 08:27:22.000000000 +0100 +++ xx/arch/i386/kernel/smp.c 2004-03-21 15:21:40.000000000 +0100 @@ -466,7 +466,17 @@ void flush_tlb_all(void) { on_each_cpu(do_flush_tlb_all, 0, 1, 1); } - +#ifdef CONFIG_KGDB +/* + * By using the NMI code instead of a vector we just sneak thru the + * word generator coming out with just what we want. AND it does + * not matter if clustered_apic_mode is set or not. + */ +void smp_send_nmi_allbutself(void) +{ + send_IPI_allbutself(APIC_DM_NMI); +} +#endif /* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/i386/kernel/traps.c xx/arch/i386/kernel/traps.c --- 2.6.5-rc2/arch/i386/kernel/traps.c 2004-03-21 15:09:24.000000000 +0100 +++ xx/arch/i386/kernel/traps.c 2004-03-21 15:21:40.000000000 +0100 @@ -92,6 +92,40 @@ asmlinkage void alignment_check(void); asmlinkage void spurious_interrupt_bug(void); asmlinkage void machine_check(void); +#ifdef CONFIG_KGDB +extern void sysenter_entry(void); +#include +#include +void set_intr_gate(unsigned int n, void *addr); +static void set_intr_usr_gate(unsigned int n, void *addr); +/* + * Should be able to call this breakpoint() very early in + * bring up. 
Just hard code the call where needed. + * The breakpoint() code is here because set_?_gate() functions + * are local (static) to trap.c. They need be done only once, + * but it does not hurt to do them over. + */ +void breakpoint(void) +{ + set_intr_usr_gate(3,&int3); /* disable ints on trap */ + set_intr_gate(1,&debug); + set_intr_gate(14,&page_fault); + + BREAKPOINT; +} +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ + { \ + if (!user_mode(regs) ) \ + { \ + kgdb_handle_exception(trapnr, signr, error_code, regs); \ + after; \ + } else if ((trapnr == 3) && (regs->eflags &0x200)) local_irq_enable(); \ + } +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + + static int kstack_depth_to_print = 24; void show_trace(struct task_struct *task, unsigned long * stack) @@ -278,6 +312,15 @@ void die(const char * str, struct pt_reg #endif if (nl) printk("\n"); +#ifdef CONFIG_KGDB + /* This is about the only place we want to go to kgdb even if in + * user mode. But we must go in via a trap so within kgdb we will + * always be in kernel mode. + */ + if (user_mode(regs)) + BREAKPOINT; +#endif + CHK_REMOTE_DEBUG(0,SIGTRAP,err,regs,) show_registers(regs); bust_spinlocks(0); spin_unlock_irq(&die_lock); @@ -347,6 +390,7 @@ static inline void do_trap(int trapnr, i #define DO_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,)\ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -364,7 +408,9 @@ asmlinkage void do_##name(struct pt_regs #define DO_VM86_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr, signr, error_code,regs, return)\ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ + return; \ } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ @@ -411,8 +457,10 @@ gp_in_vm86: return; gp_in_kernel: - if (!fixup_exception(regs)) + if (!fixup_exception(regs)){ + CHK_REMOTE_DEBUG(13,SIGSEGV,error_code,regs,) die("general protection fault", regs, error_code); + } } static void mem_parity_error(unsigned char reason, struct pt_regs * regs) @@ -574,8 +622,18 @@ asmlinkage void do_debug(struct pt_regs * allowing programs to debug themselves without the ptrace() * interface. */ +#ifdef CONFIG_KGDB + /* + * I think this is the only "real" case of a TF in the kernel + * that really belongs to user space. Others are + * "Ours all ours!" + */ + if (((regs->xcs & 3) == 0) && ((void *)regs->eip == sysenter_entry)) + goto clear_TF_reenable; +#else if ((regs->xcs & 3) == 0) goto clear_TF_reenable; +#endif if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) goto clear_TF; } @@ -587,6 +645,17 @@ asmlinkage void do_debug(struct pt_regs info.si_errno = 0; info.si_code = TRAP_BRKPT; +#ifdef CONFIG_KGDB + /* + * If this is a kernel mode trap, we need to reset db7 to allow us + * to continue sanely ALSO skip the signal delivery + */ + if ((regs->xcs & 3) == 0) + goto clear_dr7; + + /* if not kernel, allow ints but only if they were on */ + if ( regs->eflags & 0x200) local_irq_enable(); +#endif /* If this is a kernel mode trap, save the user PC on entry to * the kernel, that's what the debugger can make sense of. 
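+ *
+ * (For orientation: db7 holds the enable and type/length fields for
+ * the four hardware breakpoints, which is why clear_dr7 below can
+ * disarm everything with a single write of zero. The IA-32 layout in
+ * miniature — the helper is illustrative, not from this patch:
+ *
+ *	unsigned long dr7_set(unsigned long dr7, int n, int type, int len)
+ *	{
+ *		dr7 |= 1ul << (n * 2);		(Ln, local enable)
+ *		dr7 &= ~(0xful << (16 + n * 4));
+ *		dr7 |= (unsigned long)((len << 2) | type) << (16 + n * 4);
+ *		return dr7;
+ *	}
+ *
+ * with type and len each two bits wide, matching the two-bit masking
+ * the stub's 'Y' command applies before set_hw_break().)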
*/ @@ -601,6 +670,7 @@ clear_dr7: __asm__("movl %0,%%db7" : /* no output */ : "r" (0)); + CHK_REMOTE_DEBUG(1,SIGTRAP,error_code,regs,) return; debug_vm86: @@ -849,6 +919,12 @@ static void __init set_call_gate(void *a { _set_gate(a,12,3,addr,__KERNEL_CS); } +#ifdef CONFIG_KGDB +void set_intr_usr_gate(unsigned int n, void *addr) +{ + _set_gate(idt_table+n,14,3,addr,__KERNEL_CS); +} +#endif static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) { @@ -871,7 +947,11 @@ void __init trap_init(void) set_trap_gate(0,÷_error); set_intr_gate(1,&debug); set_intr_gate(2,&nmi); +#ifndef CONFIG_KGDB set_system_gate(3,&int3); /* int3-5 can be called from all */ +#else + set_intr_usr_gate(3,&int3); /* int3-5 can be called from all */ +#endif set_system_gate(4,&overflow); set_system_gate(5,&bounds); set_trap_gate(6,&invalid_op); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/i386/lib/kgdb_serial.c xx/arch/i386/lib/kgdb_serial.c --- 2.6.5-rc2/arch/i386/lib/kgdb_serial.c 1970-01-01 01:00:00.000000000 +0100 +++ xx/arch/i386/lib/kgdb_serial.c 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,485 @@ +/* + * Serial interface GDB stub + * + * Written (hacked together) by David Grothe (dave@gcom.com) + * Modified to allow invokation early in boot see also + * kgdb.h for instructions by George Anzinger(george@mvista.com) + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_KGDB_USER_CONSOLE +extern void kgdb_console_finit(void); +#endif +#define PRNT_off +#define TEST_EXISTANCE +#ifdef PRNT +#define dbprintk(s) printk s +#else +#define dbprintk(s) +#endif +#define TEST_INTERRUPT_off +#ifdef TEST_INTERRUPT +#define intprintk(s) printk s +#else +#define intprintk(s) +#endif + +#define IRQ_T(info) ((info->flags & ASYNC_SHARE_IRQ) ? SA_SHIRQ : SA_INTERRUPT) + +#define GDB_BUF_SIZE 512 /* power of 2, please */ + +static char gdb_buf[GDB_BUF_SIZE]; +static int gdb_buf_in_inx; +static atomic_t gdb_buf_in_cnt; +static int gdb_buf_out_inx; + +struct async_struct *gdb_async_info; +static int gdb_async_irq; + +#define outb_px(a,b) outb_p(b,a) + +static void program_uart(struct async_struct *info); +static void write_char(struct async_struct *info, int chr); +/* + * Get a byte from the hardware data buffer and return it + */ +static int +read_data_bfr(struct async_struct *info) +{ + char it = inb_p(info->port + UART_LSR); + + if (it & UART_LSR_DR) + return (inb_p(info->port + UART_RX)); + /* + * If we have a framing error assume somebody messed with + * our uart. Reprogram it and send '-' both ways... + */ + if (it & 0xc) { + program_uart(info); + write_char(info, '-'); + return ('-'); + } + return (-1); + +} /* read_data_bfr */ + +/* + * Get a char if available, return -1 if nothing available. + * Empty the receive buffer first, then look at the interface hardware. + + * Locking here is a bit of a problem. We MUST not lock out communication + * if we are trying to talk to gdb about a kgdb entry. ON the other hand + * we can loose chars in the console pass thru if we don't lock. It is also + * possible that we could hold the lock or be waiting for it when kgdb + * NEEDS to talk. Since kgdb locks down the world, it does not need locks. 
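+ *
+ * The resulting conditional-lock idiom, in miniature (a sketch;
+ * pull_char() is an invented stand-in for the buffer-then-UART read):
+ *
+ *	if (!spin_is_locked(&kgdb_spinlock))
+ *		spin_lock(&uart_interrupt_lock);
+ *	chr = pull_char();
+ *	if (!spin_is_locked(&kgdb_spinlock))
+ *		spin_unlock(&uart_interrupt_lock);
+ *
+ * The check-then-lock window is tolerable only because kgdb stops the
+ * other cpus before it comes through here.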
+ * We do, of course have possible issues with interrupting a uart operation, + * but we will just depend on the uart status to help keep that straight. + + */ +static spinlock_t uart_interrupt_lock = SPIN_LOCK_UNLOCKED; +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +#endif + +static int +read_char(struct async_struct *info) +{ + int chr; + unsigned long flags; + local_irq_save(flags); +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_lock(&uart_interrupt_lock); + } +#endif + if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ + chr = gdb_buf[gdb_buf_out_inx++]; + gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); + atomic_dec(&gdb_buf_in_cnt); + } else { + chr = read_data_bfr(info); + } +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_unlock(&uart_interrupt_lock); + } +#endif + local_irq_restore(flags); + return (chr); +} + +/* + * Wait until the interface can accept a char, then write it. + */ +static void +write_char(struct async_struct *info, int chr) +{ + while (!(inb_p(info->port + UART_LSR) & UART_LSR_THRE)) ; + + outb_p(chr, info->port + UART_TX); + +} /* write_char */ + +/* + * Mostly we don't need a spinlock, but since the console goes + * thru here with interrutps on, well, we need to catch those + * chars. + */ +/* + * This is the receiver interrupt routine for the GDB stub. + * It will receive a limited number of characters of input + * from the gdb host machine and save them up in a buffer. + * + * When the gdb stub routine getDebugChar() is called it + * draws characters out of the buffer until it is empty and + * then reads directly from the serial port. + * + * We do not attempt to write chars from the interrupt routine + * since the stubs do all of that via putDebugChar() which + * writes one byte after waiting for the interface to become + * ready. + * + * The debug stubs like to run with interrupts disabled since, + * after all, they run as a consequence of a breakpoint in + * the kernel. + * + * Perhaps someone who knows more about the tty driver than I + * care to learn can make this work for any low level serial + * driver. + */ +static irqreturn_t +gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct async_struct *info; + unsigned long flags; + + info = gdb_async_info; + if (!info || !info->tty || irq != gdb_async_irq) + return IRQ_NONE; + + local_irq_save(flags); + spin_lock(&uart_interrupt_lock); + do { + int chr = read_data_bfr(info); + intprintk(("Debug char on int: %x hex\n", chr)); + if (chr < 0) + continue; + + if (chr == 3) { /* Ctrl-C means remote interrupt */ + BREAKPOINT; + continue; + } + + if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { + /* buffer overflow tosses early char */ + read_char(info); + } + gdb_buf[gdb_buf_in_inx++] = chr; + gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); + } while (inb_p(info->port + UART_IIR) & UART_IIR_RDI); + spin_unlock(&uart_interrupt_lock); + local_irq_restore(flags); + return IRQ_HANDLED; +} /* gdb_interrupt */ + +/* + * Just a NULL routine for testing. + */ +void +gdb_null(void) +{ +} /* gdb_null */ + +/* These structure are filled in with values defined in asm/kgdb_local.h + */ +static struct serial_state state = SB_STATE; +static struct async_struct local_info = SB_INFO; +static int ok_to_enable_ints = 0; +static void kgdb_enable_ints_now(void); + +extern char *kgdb_version; +/* + * Hook an IRQ for KGDB. + * + * This routine is called from putDebugChar, below. 
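+ *
+ * Once the IRQ is hooked, gdb_interrupt() above is the producer and
+ * read_char() the consumer of a power-of-two ring; the mechanism in
+ * miniature (the index masking is why GDB_BUF_SIZE must be a power
+ * of 2):
+ *
+ *	char buf[GDB_BUF_SIZE];
+ *	int in_inx, out_inx;
+ *	atomic_t cnt;
+ *
+ *	put:	buf[in_inx++] = chr;  in_inx &= GDB_BUF_SIZE - 1;  (cnt++)
+ *	get:	chr = buf[out_inx++]; out_inx &= GDB_BUF_SIZE - 1; (cnt--)
+ *
+ * On overflow the interrupt side consumes one character first, so it
+ * is the oldest queued input that gets dropped.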
+ */ +static int ints_disabled = 1; +int +gdb_hook_interrupt(struct async_struct *info, int verb) +{ + struct serial_state *state = info->state; + unsigned long flags; + int port; +#ifdef TEST_EXISTANCE + int scratch, scratch2; +#endif + + /* The above fails if memory managment is not set up yet. + * Rather than fail the set up, just keep track of the fact + * and pick up the interrupt thing later. + */ + gdb_async_info = info; + port = gdb_async_info->port; + gdb_async_irq = state->irq; + if (verb) { + printk("kgdb %s : port =%x, IRQ=%d, divisor =%d\n", + kgdb_version, + port, + gdb_async_irq, gdb_async_info->state->custom_divisor); + } + local_irq_save(flags); +#ifdef TEST_EXISTANCE + /* Existance test */ + /* Should not need all this, but just in case.... */ + + scratch = inb_p(port + UART_IER); + outb_px(port + UART_IER, 0); + outb_px(0xff, 0x080); + scratch2 = inb_p(port + UART_IER); + outb_px(port + UART_IER, scratch); + if (scratch2) { + printk + ("gdb_hook_interrupt: Could not clear IER, not a UART!\n"); + local_irq_restore(flags); + return 1; /* We failed; there's nothing here */ + } + scratch2 = inb_p(port + UART_LCR); + outb_px(port + UART_LCR, 0xBF); /* set up for StarTech test */ + outb_px(port + UART_EFR, 0); /* EFR is the same as FCR */ + outb_px(port + UART_LCR, 0); + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO); + scratch = inb_p(port + UART_IIR) >> 6; + if (scratch == 1) { + printk("gdb_hook_interrupt: Undefined UART type!" + " Not a UART! \n"); + local_irq_restore(flags); + return 1; + } else { + dbprintk(("gdb_hook_interrupt: UART type " + "is %d where 0=16450, 2=16550 3=16550A\n", scratch)); + } + scratch = inb_p(port + UART_MCR); + outb_px(port + UART_MCR, UART_MCR_LOOP | scratch); + outb_px(port + UART_MCR, UART_MCR_LOOP | 0x0A); + scratch2 = inb_p(port + UART_MSR) & 0xF0; + outb_px(port + UART_MCR, scratch); + if (scratch2 != 0x90) { + printk("gdb_hook_interrupt: " + "Loop back test failed! Not a UART!\n"); + local_irq_restore(flags); + return scratch2 + 1000; /* force 0 to fail */ + } +#endif /* test existance */ + program_uart(info); + local_irq_restore(flags); + + return (0); + +} /* gdb_hook_interrupt */ + +static void +program_uart(struct async_struct *info) +{ + int port = info->port; + + (void) inb_p(port + UART_RX); + outb_px(port + UART_IER, 0); + + (void) inb_p(port + UART_RX); /* serial driver comments say */ + (void) inb_p(port + UART_IIR); /* this clears the interrupt regs */ + (void) inb_p(port + UART_MSR); + outb_px(port + UART_LCR, UART_LCR_WLEN8 | UART_LCR_DLAB); + outb_px(port + UART_DLL, info->state->custom_divisor & 0xff); /* LS */ + outb_px(port + UART_DLM, info->state->custom_divisor >> 8); /* MS */ + outb_px(port + UART_MCR, info->MCR); + + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1 | UART_FCR_CLEAR_XMIT | UART_FCR_CLEAR_RCVR); /* set fcr */ + outb_px(port + UART_LCR, UART_LCR_WLEN8); /* reset DLAB */ + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1); /* set fcr */ + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + return; +} + +/* + * getDebugChar + * + * This is a GDB stub routine. It waits for a character from the + * serial interface and then returns it. If there is no serial + * interface connection then it returns a bogus value which will + * almost certainly cause the system to hang. 
In the + */ +int kgdb_in_isr = 0; +int kgdb_in_lsr = 0; +extern spinlock_t kgdb_spinlock; + +/* Caller takes needed protections */ + +int +getDebugChar(void) +{ + volatile int chr, dum, time, end_time; + + dbprintk(("getDebugChar(port %x): ", gdb_async_info->port)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + /* + * This trick says if we wait a very long time and get + * no char, return the -1 and let the upper level deal + * with it. + */ + rdtsc(dum, time); + end_time = time + 2; + while (((chr = read_char(gdb_async_info)) == -1) && + (end_time - time) > 0) { + rdtsc(dum, time); + }; + /* + * This covers our butts if some other code messes with + * our uart, hay, it happens :o) + */ + if (chr == -1) + program_uart(gdb_async_info); + + dbprintk(("%c\n", chr > ' ' && chr < 0x7F ? chr : ' ')); + return (chr); + +} /* getDebugChar */ + +static int count = 3; +static spinlock_t one_at_atime = SPIN_LOCK_UNLOCKED; + +static int __init +kgdb_enable_ints(void) +{ + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 1); + } + ok_to_enable_ints = 1; + kgdb_enable_ints_now(); +#ifdef CONFIG_KGDB_USER_CONSOLE + kgdb_console_finit(); +#endif + return 0; +} + +#ifdef CONFIG_SERIAL_8250 +void shutdown_for_kgdb(struct async_struct *gdb_async_info); +#endif + +#ifdef CONFIG_DISCONTIGMEM +static inline int kgdb_mem_init_done(void) +{ + return highmem_start_page != NULL; +} +#else +static inline int kgdb_mem_init_done(void) +{ + return max_mapnr != 0; +} +#endif + +static void +kgdb_enable_ints_now(void) +{ + if (!spin_trylock(&one_at_atime)) + return; + if (!ints_disabled) + goto exit; + if (kgdb_mem_init_done() && + ints_disabled) { /* don't try till mem init */ +#ifdef CONFIG_SERIAL_8250 + /* + * The ifdef here allows the system to be configured + * without the serial driver. + * Don't make it a module, however, it will steal the port + */ + shutdown_for_kgdb(gdb_async_info); +#endif + ints_disabled = request_irq(gdb_async_info->state->irq, + gdb_interrupt, + IRQ_T(gdb_async_info), + "KGDB-stub", NULL); + intprintk(("KGDB: request_irq returned %d\n", ints_disabled)); + } + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + exit: + spin_unlock(&one_at_atime); +} + +/* + * putDebugChar + * + * This is a GDB stub routine. It waits until the interface is ready + * to transmit a char and then sends it. If there is no serial + * interface connection then it simply returns to its caller, having + * pretended to send the char. Caller takes needed protections. + */ +void +putDebugChar(int chr) +{ + dbprintk(("putDebugChar(port %x): chr=%02x '%c', ints_on=%d\n", + gdb_async_info->port, + chr, + chr > ' ' && chr < 0x7F ? chr : ' ', ints_disabled ? 0 : 1)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + + write_char(gdb_async_info, chr); /* this routine will wait */ + count = (chr == '#') ? 0 : count + 1; + if ((count == 2)) { /* try to enable after */ + if (ints_disabled & ok_to_enable_ints) + kgdb_enable_ints_now(); /* try to enable after */ + + /* We do this a lot because, well we really want to get these + * interrupts. The serial driver will clear these bits when it + * initializes the chip. Every thing else it does is ok, + * but this. 
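+ *
+ * (For reference, the divisor-latch dance program_uart() performs when
+ * it reclaims the port — the same outb_px() sequence as above, with a
+ * generic divisor; divisor 1 gives 115200 baud from the standard
+ * 1.8432 MHz UART clock:
+ *
+ *	outb_px(port + UART_LCR, UART_LCR_WLEN8 | UART_LCR_DLAB);
+ *	outb_px(port + UART_DLL, divisor & 0xff);	(low byte)
+ *	outb_px(port + UART_DLM, divisor >> 8);		(high byte)
+ *	outb_px(port + UART_LCR, UART_LCR_WLEN8);	(DLAB off, 8N1)
+ *
+ * after which the UART_IER write below re-arms receive interrupts.)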
+ */ + if (!ints_disabled) { + outb_px(gdb_async_info->port + UART_IER, + gdb_async_info->IER); + } + } + +} /* putDebugChar */ + +module_init(kgdb_enable_ints); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/i386/lib/Makefile xx/arch/i386/lib/Makefile --- 2.6.5-rc2/arch/i386/lib/Makefile 2004-03-21 15:09:24.000000000 +0100 +++ xx/arch/i386/lib/Makefile 2004-03-21 15:21:40.000000000 +0100 @@ -9,3 +9,4 @@ lib-y = checksum.o delay.o \ lib-$(CONFIG_X86_USE_3DNOW) += mmx.o lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o +lib-$(CONFIG_KGDB) += kgdb_serial.o diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/i386/Makefile xx/arch/i386/Makefile --- 2.6.5-rc2/arch/i386/Makefile 2004-03-11 08:27:22.000000000 +0100 +++ xx/arch/i386/Makefile 2004-03-21 15:21:40.000000000 +0100 @@ -97,6 +97,9 @@ mcore-$(CONFIG_X86_ES7000) := mach-es700 # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default +mflags-$(CONFIG_KGDB) += -gdwarf-2 +mflags-$(CONFIG_KGDB_MORE) += $(shell echo $(CONFIG_KGDB_OPTIONS) | sed -e 's/"//g') + head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o libs-y += arch/i386/lib/ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/i386/mm/fault.c xx/arch/i386/mm/fault.c --- 2.6.5-rc2/arch/i386/mm/fault.c 2003-12-17 03:20:06.000000000 +0100 +++ xx/arch/i386/mm/fault.c 2004-03-21 15:21:40.000000000 +0100 @@ -403,6 +403,12 @@ no_context: * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ +#ifdef CONFIG_KGDB + if (!user_mode(regs)){ + kgdb_handle_exception(14,SIGBUS, error_code, regs); + return; + } +#endif bust_spinlocks(1); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/i386/mm/hugetlbpage.c xx/arch/i386/mm/hugetlbpage.c --- 2.6.5-rc2/arch/i386/mm/hugetlbpage.c 2004-01-15 18:36:04.000000000 +0100 +++ xx/arch/i386/mm/hugetlbpage.c 2004-03-21 15:21:42.000000000 +0100 @@ -29,7 +29,7 @@ static spinlock_t htlbpage_lock = SPIN_L static void enqueue_huge_page(struct page *page) { - list_add(&page->list, + list_add(&page->lru, &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]); } @@ -44,8 +44,8 @@ static struct page *dequeue_huge_page(vo break; } if (nid >= 0 && nid < MAX_NUMNODES && !list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, struct page, list); - list_del(&page->list); + page = list_entry(hugepage_freelists[nid].next, struct page, lru); + list_del(&page->lru); } return page; } @@ -278,9 +278,8 @@ follow_huge_pmd(struct mm_struct *mm, un static void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); - INIT_LIST_HEAD(&page->list); + INIT_LIST_HEAD(&page->lru); spin_lock(&htlbpage_lock); enqueue_huge_page(page); @@ -409,19 +408,19 @@ static int try_to_free_low(int count) /* all lowmem is on node 0 */ list_for_each(p, &hugepage_freelists[0]) { if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; map = NULL; if (++count == 0) break; } - page = list_entry(p, struct page, list); + page = list_entry(p, struct page, lru); if (!PageHighMem(page)) map = page; } if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; count++; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/i386/mm/init.c xx/arch/i386/mm/init.c --- 
2.6.5-rc2/arch/i386/mm/init.c 2004-03-21 15:09:24.000000000 +0100 +++ xx/arch/i386/mm/init.c 2004-03-21 15:21:41.000000000 +0100 @@ -523,30 +523,20 @@ void __init mem_init(void) #endif } -kmem_cache_t *pgd_cache; -kmem_cache_t *pmd_cache; +#ifdef CONFIG_X86_PAE +struct kmem_cache_s *pae_pgd_cachep; void __init pgtable_cache_init(void) { - if (PTRS_PER_PMD > 1) { - pmd_cache = kmem_cache_create("pmd", - PTRS_PER_PMD*sizeof(pmd_t), - 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, - pmd_ctor, - NULL); - if (!pmd_cache) - panic("pgtable_cache_init(): cannot create pmd cache"); - } - pgd_cache = kmem_cache_create("pgd", - PTRS_PER_PGD*sizeof(pgd_t), - 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, - pgd_ctor, - PTRS_PER_PMD == 1 ? pgd_dtor : NULL); - if (!pgd_cache) - panic("pgtable_cache_init(): Cannot create pgd cache"); + /* + * PAE pgds must be 16-byte aligned: + */ + pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0, + SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); + if (!pae_pgd_cachep) + panic("init_pae(): Cannot alloc pae_pgd SLAB cache"); } +#endif /* * This function cannot be __init, since exceptions don't work in that diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/i386/mm/pageattr.c xx/arch/i386/mm/pageattr.c --- 2.6.5-rc2/arch/i386/mm/pageattr.c 2003-08-16 14:54:02.000000000 +0200 +++ xx/arch/i386/mm/pageattr.c 2004-03-21 15:21:42.000000000 +0100 @@ -67,22 +67,19 @@ static void flush_kernel_map(void *dummy static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { - struct page *page; - unsigned long flags; - set_pte_atomic(kpte, pte); /* change init_mm */ - if (PTRS_PER_PMD > 1) - return; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pmd_t *pmd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - pmd = pmd_offset(pgd, address); - set_pte_atomic((pte_t *)pmd, pte); +#ifndef CONFIG_X86_PAE + { + struct list_head *l; + spin_lock(&mmlist_lock); + list_for_each(l, &init_mm.mmlist) { + struct mm_struct *mm = list_entry(l, struct mm_struct, mmlist); + pmd_t *pmd = pmd_offset(pgd_offset(mm, address), address); + set_pte_atomic((pte_t *)pmd, pte); + } + spin_unlock(&mmlist_lock); } - spin_unlock_irqrestore(&pgd_lock, flags); +#endif } /* @@ -135,7 +132,7 @@ __change_page_attr(struct page *page, pg } if (cpu_has_pse && (atomic_read(&kpte_page->count) == 1)) { - list_add(&kpte_page->list, &df_list); + list_add(&kpte_page->lru, &df_list); revert_page(kpte_page, address); } return 0; @@ -188,7 +185,7 @@ void global_flush_tlb(void) flush_map(); n = l.next; while (n != &l) { - struct page *pg = list_entry(n, struct page, list); + struct page *pg = list_entry(n, struct page, lru); n = n->next; __free_page(pg); } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/i386/mm/pgtable.c xx/arch/i386/mm/pgtable.c --- 2.6.5-rc2/arch/i386/mm/pgtable.c 2003-08-16 14:54:02.000000000 +0200 +++ xx/arch/i386/mm/pgtable.c 2004-03-21 15:21:41.000000000 +0100 @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -152,88 +151,61 @@ struct page *pte_alloc_one(struct mm_str return pte; } -void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) -{ - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); -} - -/* - * List of all pgd's needed for non-PAE so it can invalidate entries - * in both cached and uncached pgd's; not needed for PAE since the - * kernel pmd is shared. 
If PAE were not to share the pmd a similar - * tactic would be needed. This is essentially codepath-based locking - * against pageattr.c; it is the unique case in which a valid change - * of kernel pagetables can't be lazily synchronized by vmalloc faults. - * vmalloc faults work because attached pagetables are never freed. - * If the locking proves to be non-performant, a ticketing scheme with - * checks at dup_mmap(), exec(), and other mmlist addition points - * could be used. The locking scheme was chosen on the basis of - * manfred's recommendations and having no core impact whatsoever. - * -- wli - */ -spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED; -LIST_HEAD(pgd_list); +#ifdef CONFIG_X86_PAE -void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) +pgd_t *pgd_alloc(struct mm_struct *mm) { - unsigned long flags; - - if (PTRS_PER_PMD == 1) - spin_lock_irqsave(&pgd_lock, flags); + int i; + pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); - memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, + if (pgd) { + for (i = 0; i < USER_PTRS_PER_PGD; i++) { + unsigned long pmd = __get_free_page(GFP_KERNEL); + if (!pmd) + goto out_oom; + clear_page(pmd); + set_pgd(pgd + i, __pgd(1 + __pa(pmd))); + } + memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - - if (PTRS_PER_PMD > 1) - return; - - list_add(&virt_to_page(pgd)->lru, &pgd_list); - spin_unlock_irqrestore(&pgd_lock, flags); - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + } + return pgd; +out_oom: + for (i--; i >= 0; i--) + free_page((unsigned long)__va(pgd_val(pgd[i])-1)); + kmem_cache_free(pae_pgd_cachep, pgd); + return NULL; } -/* never called when PTRS_PER_PMD > 1 */ -void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) +void pgd_free(pgd_t *pgd) { - unsigned long flags; /* can be called from interrupt context */ + int i; - spin_lock_irqsave(&pgd_lock, flags); - list_del(&virt_to_page(pgd)->lru); - spin_unlock_irqrestore(&pgd_lock, flags); + for (i = 0; i < USER_PTRS_PER_PGD; i++) + free_page((unsigned long)__va(pgd_val(pgd[i])-1)); + kmem_cache_free(pae_pgd_cachep, pgd); } +#else + pgd_t *pgd_alloc(struct mm_struct *mm) { - int i; - pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); - - if (PTRS_PER_PMD == 1 || !pgd) - return pgd; + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); - if (!pmd) - goto out_oom; - set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd)))); + if (pgd) { + memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + memcpy(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); } return pgd; - -out_oom: - for (i--; i >= 0; i--) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pgd_cache, pgd); - return NULL; } void pgd_free(pgd_t *pgd) { - int i; - - /* in the PAE case user pgd entries are overwritten before usage */ - if (PTRS_PER_PMD > 1) - for (i = 0; i < USER_PTRS_PER_PGD; ++i) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); - /* in the non-PAE case, clear_page_tables() clears user pgd entries */ - kmem_cache_free(pgd_cache, pgd); + free_page((unsigned long)pgd); } + +#endif /* CONFIG_X86_PAE */ + diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/ia64/ia32/binfmt_elf32.c xx/arch/ia64/ia32/binfmt_elf32.c --- 2.6.5-rc2/arch/ia64/ia32/binfmt_elf32.c 2004-03-11 
08:27:23.000000000 +0100 +++ xx/arch/ia64/ia32/binfmt_elf32.c 2004-03-21 15:21:42.000000000 +0100 @@ -79,9 +79,10 @@ ia64_elf32_init (struct pt_regs *regs) vma->vm_page_prot = PAGE_SHARED; vma->vm_flags = VM_READ|VM_MAYREAD; vma->vm_ops = &ia32_shared_page_vm_ops; - vma->vm_pgoff = 0; + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; vma->vm_file = NULL; vma->vm_private_data = NULL; + vma->anon_vma = NULL; down_write(¤t->mm->mmap_sem); { insert_vm_struct(current->mm, vma); @@ -101,8 +102,9 @@ ia64_elf32_init (struct pt_regs *regs) vma->vm_page_prot = PAGE_SHARED; vma->vm_flags = VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE; vma->vm_ops = NULL; - vma->vm_pgoff = 0; + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; vma->vm_file = NULL; + vma->anon_vma = NULL; vma->vm_private_data = NULL; down_write(¤t->mm->mmap_sem); { @@ -181,8 +183,9 @@ ia32_setup_arg_pages (struct linux_binpr mpnt->vm_page_prot = PAGE_COPY; mpnt->vm_flags = VM_STACK_FLAGS; mpnt->vm_ops = NULL; - mpnt->vm_pgoff = 0; + mpnt->vm_pgoff = mpnt->vm_start >> PAGE_SHIFT; mpnt->vm_file = NULL; + mpnt->anon_vma = NULL; mpnt->vm_private_data = 0; insert_vm_struct(current->mm, mpnt); current->mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; @@ -192,7 +195,7 @@ ia32_setup_arg_pages (struct linux_binpr struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current, page, stack_base, PAGE_COPY); + put_dirty_page(current, page, stack_base, PAGE_COPY, mpnt); } stack_base += PAGE_SIZE; } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/ia64/kernel/perfmon.c xx/arch/ia64/kernel/perfmon.c --- 2.6.5-rc2/arch/ia64/kernel/perfmon.c 2004-03-11 08:27:23.000000000 +0100 +++ xx/arch/ia64/kernel/perfmon.c 2004-03-21 15:21:42.000000000 +0100 @@ -2271,9 +2271,11 @@ pfm_smpl_buffer_alloc(struct task_struct vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED|VM_DONTCOPY; vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ vma->vm_ops = &pfm_vm_ops; - vma->vm_pgoff = 0; + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; vma->vm_file = NULL; vma->vm_private_data = ctx; /* information needed by the pfm_vm_close() function */ + /* insert_vm_struct takes care of anon_vma_node */ + vma->anon_vma = NULL; /* * Now we have everything we need and we can initialize diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/ia64/mm/hugetlbpage.c xx/arch/ia64/mm/hugetlbpage.c --- 2.6.5-rc2/arch/ia64/mm/hugetlbpage.c 2004-03-11 08:27:23.000000000 +0100 +++ xx/arch/ia64/mm/hugetlbpage.c 2004-03-21 15:21:42.000000000 +0100 @@ -32,7 +32,7 @@ static spinlock_t htlbpage_lock = SPIN_L static void enqueue_huge_page(struct page *page) { - list_add(&page->list, + list_add(&page->lru, &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]); } @@ -48,8 +48,8 @@ static struct page *dequeue_huge_page(vo } if (nid >= 0 && nid < MAX_NUMNODES && !list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, struct page, list); - list_del(&page->list); + page = list_entry(hugepage_freelists[nid].next, struct page, lru); + list_del(&page->lru); } return page; } @@ -246,9 +246,8 @@ follow_huge_pmd(struct mm_struct *mm, un void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); - INIT_LIST_HEAD(&page->list); + INIT_LIST_HEAD(&page->lru); spin_lock(&htlbpage_lock); enqueue_huge_page(page); @@ -449,19 +448,19 @@ int try_to_free_low(int count) spin_lock(&htlbpage_lock); list_for_each(p, 
&hugepage_freelists[0]) { if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; map = NULL; if (++count == 0) break; } - page = list_entry(p, struct page, list); + page = list_entry(p, struct page, lru); if (!PageHighMem(page)) map = page; } if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; count++; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/ia64/mm/init.c xx/arch/ia64/mm/init.c --- 2.6.5-rc2/arch/ia64/mm/init.c 2004-03-21 15:09:24.000000000 +0100 +++ xx/arch/ia64/mm/init.c 2004-03-21 15:21:42.000000000 +0100 @@ -128,8 +128,10 @@ ia64_init_addr_space (void) vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7]; vma->vm_flags = VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE|VM_GROWSUP; vma->vm_ops = NULL; - vma->vm_pgoff = 0; + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; vma->vm_file = NULL; + /* insert_vm_struct takes care of anon_vma_node */ + vma->anon_vma = NULL; vma->vm_private_data = NULL; insert_vm_struct(current->mm, vma); } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/ppc64/mm/hugetlbpage.c xx/arch/ppc64/mm/hugetlbpage.c --- 2.6.5-rc2/arch/ppc64/mm/hugetlbpage.c 2004-03-11 08:27:28.000000000 +0100 +++ xx/arch/ppc64/mm/hugetlbpage.c 2004-03-21 15:21:42.000000000 +0100 @@ -25,7 +25,6 @@ #include #include #include -#include #include @@ -40,7 +39,7 @@ static struct list_head hugepage_freelis static void enqueue_huge_page(struct page *page) { - list_add(&page->list, + list_add(&page->lru, &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]); } @@ -63,8 +62,8 @@ static struct page *dequeue_huge_page(vo } if (!list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, struct page, list); - list_del(&page->list); + page = list_entry(hugepage_freelists[nid].next, struct page, lru); + list_del(&page->lru); } if (largepage_roundrobin) @@ -407,9 +406,8 @@ follow_huge_pmd(struct mm_struct *mm, un static void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); - INIT_LIST_HEAD(&page->list); + INIT_LIST_HEAD(&page->lru); spin_lock(&htlbpage_lock); enqueue_huge_page(page); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/s390/kernel/compat_exec.c xx/arch/s390/kernel/compat_exec.c --- 2.6.5-rc2/arch/s390/kernel/compat_exec.c 2003-07-17 01:53:55.000000000 +0200 +++ xx/arch/s390/kernel/compat_exec.c 2004-03-21 15:21:42.000000000 +0100 @@ -69,9 +69,11 @@ int setup_arg_pages32(struct linux_binpr mpnt->vm_page_prot = PAGE_COPY; mpnt->vm_flags = VM_STACK_FLAGS; mpnt->vm_ops = NULL; - mpnt->vm_pgoff = 0; + mpnt->vm_pgoff = mpnt->vm_start >> PAGE_SHIFT; mpnt->vm_file = NULL; INIT_LIST_HEAD(&mpnt->shared); + /* insert_vm_struct takes care of anon_vma_node */ + mpnt->anon_vma = NULL; mpnt->vm_private_data = (void *) 0; insert_vm_struct(mm, mpnt); mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; @@ -81,7 +83,7 @@ int setup_arg_pages32(struct linux_binpr struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base,PAGE_COPY); + put_dirty_page(current,page,stack_base,PAGE_COPY, mpnt); } stack_base += PAGE_SIZE; } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/sparc64/mm/hugetlbpage.c xx/arch/sparc64/mm/hugetlbpage.c --- 2.6.5-rc2/arch/sparc64/mm/hugetlbpage.c 2004-01-15 
18:36:08.000000000 +0100 +++ xx/arch/sparc64/mm/hugetlbpage.c 2004-03-21 15:21:42.000000000 +0100 @@ -29,7 +29,7 @@ static spinlock_t htlbpage_lock = SPIN_L static void enqueue_huge_page(struct page *page) { - list_add(&page->list, + list_add(&page->lru, &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]); } @@ -46,8 +46,8 @@ static struct page *dequeue_huge_page(vo if (nid >= 0 && nid < MAX_NUMNODES && !list_empty(&hugepage_freelists[nid])) { page = list_entry(hugepage_freelists[nid].next, - struct page, list); - list_del(&page->list); + struct page, lru); + list_del(&page->lru); } return page; } @@ -248,9 +248,8 @@ struct page *follow_huge_pmd(struct mm_s static void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); - INIT_LIST_HEAD(&page->list); + INIT_LIST_HEAD(&page->lru); spin_lock(&htlbpage_lock); enqueue_huge_page(page); @@ -384,19 +383,19 @@ static int try_to_free_low(int count) /* all lowmem is on node 0 */ list_for_each(p, &hugepage_freelists[0]) { if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; map = NULL; if (++count == 0) break; } - page = list_entry(p, struct page, list); + page = list_entry(p, struct page, lru); if (!PageHighMem(page)) map = page; } if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; count++; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/x86_64/ia32/ia32_binfmt.c xx/arch/x86_64/ia32/ia32_binfmt.c --- 2.6.5-rc2/arch/x86_64/ia32/ia32_binfmt.c 2004-03-21 15:09:24.000000000 +0100 +++ xx/arch/x86_64/ia32/ia32_binfmt.c 2004-03-21 15:21:42.000000000 +0100 @@ -358,9 +358,11 @@ int setup_arg_pages(struct linux_binprm mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ? PAGE_COPY_EXEC : PAGE_COPY; mpnt->vm_ops = NULL; - mpnt->vm_pgoff = 0; + mpnt->vm_pgoff = mpnt->vm_start >> PAGE_SHIFT; mpnt->vm_file = NULL; INIT_LIST_HEAD(&mpnt->shared); + /* insert_vm_struct takes care of anon_vma_node */ + mpnt->anon_vma = NULL; mpnt->vm_private_data = (void *) 0; insert_vm_struct(mm, mpnt); mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; @@ -370,7 +372,7 @@ int setup_arg_pages(struct linux_binprm struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base,PAGE_COPY_EXEC); + put_dirty_page(current,page,stack_base,PAGE_COPY_EXEC, mpnt); } stack_base += PAGE_SIZE; } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/x86_64/Kconfig xx/arch/x86_64/Kconfig --- 2.6.5-rc2/arch/x86_64/Kconfig 2004-03-21 15:09:24.000000000 +0100 +++ xx/arch/x86_64/Kconfig 2004-03-21 15:21:40.000000000 +0100 @@ -462,6 +462,7 @@ config INIT_DEBUG config DEBUG_INFO bool "Compile the kernel with debug info" depends on DEBUG_KERNEL + default n help If you say Y here the resulting kernel image will include debugging info resulting in a larger kernel image. @@ -493,9 +494,8 @@ config IOMMU_LEAK help Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. 
- -#config X86_REMOTE_DEBUG -# bool "kgdb debugging stub" + +source "arch/x86_64/Kconfig.kgdb" endmenu diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/x86_64/Kconfig.kgdb xx/arch/x86_64/Kconfig.kgdb --- 2.6.5-rc2/arch/x86_64/Kconfig.kgdb 1970-01-01 01:00:00.000000000 +0100 +++ xx/arch/x86_64/Kconfig.kgdb 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,176 @@ +config KGDB + bool "Include kgdb kernel debugger" + depends on DEBUG_KERNEL + select DEBUG_INFO + help + If you say Y here, the system will be compiled with the debug + option (-g) and a debugging stub will be included in the + kernel. This stub communicates with gdb on another (host) + computer via a serial port. The host computer should have + access to the kernel binary file (vmlinux) and a serial port + that is connected to the target machine. Gdb can be made to + configure the serial port or you can use stty and setserial to + do this. See the 'target' command in gdb. This option also + configures in the ability to request a breakpoint early in the + boot process. To request the breakpoint just include 'kgdb' + as a boot option when booting the target machine. The system + will then break as soon as it looks at the boot options. This + option also installs a breakpoint in panic and sends any + kernel faults to the debugger. For more information see the + Documentation/i386/kgdb.txt file. + +choice + depends on KGDB + prompt "Debug serial port BAUD" + default KGDB_115200BAUD + help + Gdb and the kernel stub need to agree on the baud rate to be + used. Some systems (x86 family at this writing) allow this to + be configured. + +config KGDB_9600BAUD + bool "9600" + +config KGDB_19200BAUD + bool "19200" + +config KGDB_38400BAUD + bool "38400" + +config KGDB_57600BAUD + bool "57600" + +config KGDB_115200BAUD + bool "115200" +endchoice + +config KGDB_PORT + hex "hex I/O port address of the debug serial port" + depends on KGDB + default 3f8 + help + Some systems (x86 family at this writing) allow the port + address to be configured. The number entered is assumed to be + hex, don't put 0x in front of it. The standard address are: + COM1 3f8 , irq 4 and COM2 2f8 irq 3. Setserial /dev/ttySx + will tell you what you have. It is good to test the serial + connection with a live system before trying to debug. + +config KGDB_IRQ + int "IRQ of the debug serial port" + depends on KGDB + default 4 + help + This is the irq for the debug port. If everything is working + correctly and the kernel has interrupts on a control C to the + port should cause a break into the kernel debug stub. + +config DEBUG_INFO + bool + depends on KGDB + default y + +config KGDB_MORE + bool "Add any additional compile options" + depends on KGDB + default n + help + Saying yes here turns on the ability to enter additional + compile options. + + +config KGDB_OPTIONS + depends on KGDB_MORE + string "Additional compile arguments" + default "-O1" + help + This option allows you enter additional compile options for + the whole kernel compile. Each platform will have a default + that seems right for it. For example on PPC "-ggdb -O1", and + for i386 "-O1". Note that by configuring KGDB "-g" is already + turned on. In addition, on i386 platforms + "-fomit-frame-pointer" is deleted from the standard compile + options. + +config NO_KGDB_CPUS + int "Number of CPUs" + depends on KGDB && SMP + default NR_CPUS + help + + This option sets the number of cpus for kgdb ONLY. 
It is used + to prune some internal structures so they look "nice" when + displayed with gdb. This is to overcome possibly larger + numbers that may have been entered above. Enter the real + number to get nice clean kgdb_info displays. + +config KGDB_TS + bool "Enable kgdb time stamp macros?" + depends on KGDB + default n + help + Kgdb event macros allow you to instrument your code with calls + to the kgdb event recording function. The event log may be + examined with gdb at a break point. Turning on this + capability also allows you to choose how many events to + keep. Kgdb always keeps the lastest events. + +choice + depends on KGDB_TS + prompt "Max number of time stamps to save?" + default KGDB_TS_128 + +config KGDB_TS_64 + bool "64" + +config KGDB_TS_128 + bool "128" + +config KGDB_TS_256 + bool "256" + +config KGDB_TS_512 + bool "512" + +config KGDB_TS_1024 + bool "1024" + +endchoice + +config STACK_OVERFLOW_TEST + bool "Turn on kernel stack overflow testing?" + depends on KGDB + default n + help + This option enables code in the front line interrupt handlers + to check for kernel stack overflow on interrupts and system + calls. This is part of the kgdb code on x86 systems. + +config KGDB_CONSOLE + bool "Enable serial console thru kgdb port" + depends on KGDB + default n + help + This option enables the command line "console=kgdb" option. + When the system is booted with this option in the command line + all kernel printk output is sent to gdb (as well as to other + consoles). For this to work gdb must be connected. For this + reason, this command line option will generate a breakpoint if + gdb has not yet connected. After the gdb continue command is + given all pent up console output will be printed by gdb on the + host machine. Neither this option, nor KGDB require the + serial driver to be configured. + +config KGDB_SYSRQ + bool "Turn on SysRq 'G' command to do a break?" + depends on KGDB + default y + help + This option includes an option in the SysRq code that allows + you to enter SysRq G which generates a breakpoint to the KGDB + stub. This will work if the keyboard is alive and can + interrupt the system. Because of constraints on when the + serial port interrupt can be enabled, this code may allow you + to interrupt the system before the serial port control C is + available. Just say yes here. + diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/x86_64/kernel/irq.c xx/arch/x86_64/kernel/irq.c --- 2.6.5-rc2/arch/x86_64/kernel/irq.c 2004-03-11 08:27:30.000000000 +0100 +++ xx/arch/x86_64/kernel/irq.c 2004-03-21 15:21:40.000000000 +0100 @@ -405,6 +405,9 @@ out: spin_unlock(&desc->lock); irq_exit(); + + kgdb_process_breakpoint(); + return 1; } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/x86_64/kernel/kgdb_stub.c xx/arch/x86_64/kernel/kgdb_stub.c --- 2.6.5-rc2/arch/x86_64/kernel/kgdb_stub.c 1970-01-01 01:00:00.000000000 +0100 +++ xx/arch/x86_64/kernel/kgdb_stub.c 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,2595 @@ +/* + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/x86_64/kernel/kgdb_stub.c xx/arch/x86_64/kernel/kgdb_stub.c
--- 2.6.5-rc2/arch/x86_64/kernel/kgdb_stub.c	1970-01-01 01:00:00.000000000 +0100
+++ xx/arch/x86_64/kernel/kgdb_stub.c	2004-03-21 15:21:40.000000000 +0100
@@ -0,0 +1,2595 @@
+/*
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ */
+
+/*
+ * Copyright (c) 2000 VERITAS Software Corporation.
+ *
+ */
+/****************************************************************************
+ *  Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $
+ *
+ *  Module name: remcom.c $
+ *  Revision: 1.34 $
+ *  Date: 91/03/09 12:29:49 $
+ *  Contributor:     Lake Stevens Instrument Division$
+ *
+ *  Description:     low level support for gdb debugger. $
+ *
+ *  Considerations:  only works on target hardware $
+ *
+ *  Written by:      Glenn Engel $
+ *  Updated by:      David Grothe
+ *  Updated by:      Robert Walsh
+ *  Updated by:      wangdi
+ *  ModuleState:     Experimental $
+ *
+ *  NOTES:           See Below $
+ *
+ *  Modified for 386 by Jim Kingdon, Cygnus Support.
+ *  Compatibility with 2.1.xx kernel by David Grothe
+ *
+ *  Changes to allow auto initialization.  All that is needed is that it
+ *  be linked with the kernel and a break point (int 3) be executed.
+ *  The header file defines BREAKPOINT to allow one to do
+ *  this.  It should also be possible, once the interrupt system is up, to
+ *  call putDebugChar("+").  Once this is done, the remote debugger should
+ *  get our attention by sending a ^C in a packet.  George Anzinger
+ *
+ *  Integrated into 2.2.5 kernel by Tigran Aivazian
+ *  Added thread support, support for multiple processors,
+ *  support for ia-32(x86) hardware debugging.
+ *  Amit S. Kale ( akale@veritas.com )
+ *
+ *  Modified to support debugging over ethernet by Robert Walsh
+ *  and wangdi, based on code by San Mehat.
+ *
+ *  X86_64 changes from Andi Kleen's patch merged by Jim Houston
+ *  (jim.houston@ccur.com).  If it works, thank Andi; if it's broken,
+ *  blame me.
+ *
+ *  To enable debugger support, two things need to happen.  One, a
+ *  call to set_debug_traps() is necessary in order to allow any breakpoints
+ *  or error conditions to be properly intercepted and reported to gdb.
+ *  Two, a breakpoint needs to be generated to begin communication.  This
+ *  is most easily accomplished by a call to breakpoint().  Breakpoint()
+ *  simulates a breakpoint by executing an int 3.
+ *
+ *************
+ *
+ *    The following gdb commands are supported:
+ *
+ * command          function                               Return value
+ *
+ *    g             return the value of the CPU registers  hex data or ENN
+ *    G             set the value of the CPU registers     OK or ENN
+ *
+ *    mAA..AA,LLLL  Read LLLL bytes at address AA..AA      hex data or ENN
+ *    MAA..AA,LLLL: Write LLLL bytes at address AA..AA     OK or ENN
+ *
+ *    c             Resume at current address              SNN  (signal NN)
+ *    cAA..AA       Continue at address AA..AA             SNN
+ *
+ *    s             Step one instruction                   SNN
+ *    sAA..AA       Step one instruction from AA..AA       SNN
+ *
+ *    k             kill
+ *
+ *    ?             What was the last sigval ?             SNN  (signal NN)
+ *
+ * All commands and responses are sent with a packet which includes a
+ * checksum.  A packet consists of
+ *
+ * $<packet info>#<checksum>.
+ *
+ * where
+ * <packet info> :: <characters representing the command or response>
+ * <checksum>    :: <two hex digits computed as modulo 256 sum of <packet info>>
+ *
+ * When a packet is received, it is first acknowledged with either '+' or '-'.
+ * '+' indicates a successful transfer.  '-' indicates a failed transfer.
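+ *
+ * (A worked checksum, as an illustrative aside: for the request
+ * "m0,10" the modulo 256 sum of the characters is
+ * 0x6d + 0x30 + 0x2c + 0x31 + 0x30 = 0x12a, so the checksum byte
+ * is 0x2a and the packet on the wire is "$m0,10#2a" -- exactly the
+ * exchange shown below.)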
+ * + * Example: + * + * Host: Reply: + * $m0,10#2a +$00010203040506070809101112131415#42 + * + ****************************************************************************/ +#define KGDB_VERSION "<20030915.1651.33>" +#include +#include +#include /* for strcpy */ +#include +#include +#include +#include /* for linux pt_regs struct */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define Dearly_printk(x...) +int kgdb_enabled = 0; + +/************************************************************************ + * + * external low-level support routines + */ +typedef void (*Function) (void); /* pointer to a function */ + +/* Thread reference */ +typedef unsigned char threadref[8]; + +extern int tty_putDebugChar(int); /* write a single character */ +extern int tty_getDebugChar(void); /* read and return a single char */ +extern void tty_flushDebugChar(void); /* flush pending characters */ +extern int eth_putDebugChar(int); /* write a single character */ +extern int eth_getDebugChar(void); /* read and return a single char */ +extern void eth_flushDebugChar(void); /* flush pending characters */ + +/************************************************************************/ +/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ +/* at least NUMREGBYTES*2 are needed for register packets */ +/* Longer buffer is needed to list all threads */ +#define BUFMAX 400 + +char *kgdb_version = KGDB_VERSION; + +/* debug > 0 prints ill-formed commands in valid packets & checksum errors */ +int debug_regs = 0; /* set to non-zero to print registers */ + +/* filled in by an external module */ +char *gdb_module_offsets; + +static const char hexchars[] = "0123456789abcdef"; + +/* Number of bytes of registers. */ +#define NUMREGBYTES (NUMREGS * sizeof(unsigned long)) +/* + * Note that this register image is in a different order than + * the register image that Linux produces at interrupt time. + * + * Linux's register image is defined by struct pt_regs in ptrace.h. + * Just why GDB uses a different order is a historical mystery. + * + * Could add XMM and segment registers here. + */ +enum regnames {_RAX, + _RBX, + _RCX, + _RDX, + _RSI, + _RDI, + _RBP, + _RSP, + _R8, + _R9, + _R10, + _R11, + _R12, + _R13, + _R14, + _R15, + _PC, + _PS, + NUMREGS }; + + +/*************************** ASSEMBLY CODE MACROS *************************/ +/* + * Put the error code here just in case the user cares. + * Likewise, the vector number here (since GDB only gets the signal + * number through the usual means, and that's not very specific). + * The called_from is the return address so he can tell how we entered kgdb. + * This will allow him to seperate out the various possible entries. 
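+ * (Illustrative host-side use of this: once gdb is attached, a plain
+ * "print kgdb_info" will display the vector, errcode and called_from
+ * fields described here, like any other global in this file.)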
+ */ +#define REMOTE_DEBUG 0 /* set != to turn on printing (also available in info) */ + +#define PID_MAX PID_MAX_DEFAULT + +#ifdef CONFIG_SMP +void smp_send_nmi_allbutself(void); +#define IF_SMP(x) x +#undef MAX_NO_CPUS +#ifndef CONFIG_NO_KGDB_CPUS +#define CONFIG_NO_KGDB_CPUS 2 +#endif +#if CONFIG_NO_KGDB_CPUS > NR_CPUS +#define MAX_NO_CPUS NR_CPUS +#else +#define MAX_NO_CPUS CONFIG_NO_KGDB_CPUS +#endif +#define hold_init hold_on_sstep: 1, +#define MAX_CPU_MASK (unsigned long)((1LL << MAX_NO_CPUS) - 1LL) +#define NUM_CPUS num_online_cpus() +#else +#define IF_SMP(x) +#define hold_init +#undef MAX_NO_CPUS +#define MAX_NO_CPUS 1 +#define NUM_CPUS 1 +#endif +#define NOCPU (struct task_struct *)0xbad1fbad +/* *INDENT-OFF* */ +struct kgdb_info { + int used_malloc; + void *called_from; + long long entry_tsc; + int errcode; + int vector; + int print_debug_info; +#ifdef CONFIG_SMP + int hold_on_sstep; + struct { + volatile struct task_struct *task; + int pid; + int hold; + struct pt_regs *regs; + } cpus_waiting[MAX_NO_CPUS]; +#endif +} kgdb_info = {hold_init print_debug_info:REMOTE_DEBUG, vector:-1}; + +/* *INDENT-ON* */ + +#define used_m kgdb_info.used_malloc +/* + * This is little area we set aside to contain the stack we + * need to build to allow gdb to call functions. We use one + * per cpu to avoid locking issues. We will do all this work + * with interrupts off so that should take care of the protection + * issues. + */ +#define LOOKASIDE_SIZE 200 /* should be more than enough */ +#define MALLOC_MAX 200 /* Max malloc size */ +struct { + unsigned long rsp; + unsigned long array[LOOKASIDE_SIZE]; +} fn_call_lookaside[MAX_NO_CPUS]; + +static int trap_cpu; +static unsigned long OLD_esp; + +#define END_OF_LOOKASIDE &fn_call_lookaside[trap_cpu].array[LOOKASIDE_SIZE] +#define IF_BIT 0x200 +#define TF_BIT 0x100 + +#define MALLOC_ROUND 8-1 + +static char malloc_array[MALLOC_MAX]; +IF_SMP(static void to_gdb(const char *mess)); +void * +malloc(int size) +{ + + if (size <= (MALLOC_MAX - used_m)) { + int old_used = used_m; + used_m += ((size + MALLOC_ROUND) & (~MALLOC_ROUND)); + return &malloc_array[old_used]; + } else { + return NULL; + } +} + +/* + * I/O dispatch functions... + * Based upon kgdboe, either call the ethernet + * handler or the serial one.. + */ +void +putDebugChar(int c) +{ + if (!kgdboe) { + tty_putDebugChar(c); + } else { + eth_putDebugChar(c); + } +} + +int +getDebugChar(void) +{ + if (!kgdboe) { + return tty_getDebugChar(); + } else { + return eth_getDebugChar(); + } +} + +void +flushDebugChar(void) +{ + if (!kgdboe) { + tty_flushDebugChar(); + } else { + eth_flushDebugChar(); + } +} + +/* + * Gdb calls functions by pushing agruments, including a return address + * on the stack and the adjusting EIP to point to the function. The + * whole assumption in GDB is that we are on a different stack than the + * one the "user" i.e. code that hit the break point, is on. This, of + * course is not true in the kernel. Thus various dodges are needed to + * do the call without directly messing with EIP (which we can not change + * as it is just a location and not a register. To adjust it would then + * require that we move every thing below EIP up or down as needed. This + * will not work as we may well have stack relative pointer on the stack + * (such as the pointer to regs, for example). 
+ + * So here is what we do: + * We detect gdb attempting to store into the stack area and instead, store + * into the fn_call_lookaside.array at the same relative location as if it + * were the area ESP pointed at. We also trap ESP modifications + * and uses these to adjust fn_call_lookaside.esp. On entry + * fn_call_lookaside.esp will be set to point at the last entry in + * fn_call_lookaside.array. This allows us to check if it has changed, and + * if so, on exit, we add the registers we will use to do the move and a + * trap/ interrupt return exit sequence. We then adjust the eflags in the + * regs array (remember we now have a copy in the fn_call_lookaside.array) to + * kill the interrupt bit, AND we change EIP to point at our set up stub. + * As part of the register set up we preset the registers to point at the + * begining and end of the fn_call_lookaside.array, so all the stub needs to + * do is move words from the array to the stack until ESP= the desired value + * then do the rti. This will then transfer to the desired function with + * all the correct registers. Nifty huh? + */ +extern asmlinkage void fn_call_stub(void); +extern asmlinkage void fn_rtn_stub(void); +/* *INDENT-OFF* */ +__asm__("fn_rtn_stub:\n\t" + "movq %rax,%rsp\n\t" + "fn_call_stub:\n\t" + "1:\n\t" + "addq $-8,%rbx\n\t" + "movq (%rbx), %rax\n\t" + "pushq %rax\n\t" + "cmpq %rsp,%rcx\n\t" + "jne 1b\n\t" + "popq %rax\n\t" + "popq %rbx\n\t" + "popq %rcx\n\t" + "iret \n\t"); +/* *INDENT-ON* */ +#define gdb_i386vector kgdb_info.vector +#define gdb_i386errcode kgdb_info.errcode +#define waiting_cpus kgdb_info.cpus_waiting +#define remote_debug kgdb_info.print_debug_info +#define hold_cpu(cpu) kgdb_info.cpus_waiting[cpu].hold +/* gdb locks */ + +#ifdef CONFIG_SMP +static int in_kgdb_called; +static spinlock_t waitlocks[MAX_NO_CPUS] = + {[0 ... MAX_NO_CPUS - 1] = SPIN_LOCK_UNLOCKED }; +/* + * The following array has the thread pointer of each of the "other" + * cpus. We make it global so it can be seen by gdb. + */ +volatile int in_kgdb_entry_log[MAX_NO_CPUS]; +volatile struct pt_regs *in_kgdb_here_log[MAX_NO_CPUS]; +/* +static spinlock_t continuelocks[MAX_NO_CPUS]; +*/ +spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +/* waiters on our spinlock plus us */ +static atomic_t spinlock_waiters = ATOMIC_INIT(1); +static int spinlock_count = 0; +static int spinlock_cpu = 0; +/* + * Note we use nested spin locks to account for the case where a break + * point is encountered when calling a function by user direction from + * kgdb. Also there is the memory exception recursion to account for. + * Well, yes, but this lets other cpus thru too. Lets add a + * cpu id to the lock. + */ +#define KGDB_SPIN_LOCK(x) if( spinlock_count == 0 || \ + spinlock_cpu != smp_processor_id()){\ + atomic_inc(&spinlock_waiters); \ + while (! 
spin_trylock(x)) {\ + in_kgdb(®s);\ + }\ + atomic_dec(&spinlock_waiters); \ + spinlock_count = 1; \ + spinlock_cpu = smp_processor_id(); \ + }else{ \ + spinlock_count++; \ + } +#define KGDB_SPIN_UNLOCK(x) if( --spinlock_count == 0) spin_unlock(x) +#else +unsigned kgdb_spinlock = 0; +#define KGDB_SPIN_LOCK(x) --*x +#define KGDB_SPIN_UNLOCK(x) ++*x +#endif + +int +hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return (ch - 'a' + 10); + if ((ch >= '0') && (ch <= '9')) + return (ch - '0'); + if ((ch >= 'A') && (ch <= 'F')) + return (ch - 'A' + 10); + return (-1); +} + +/* scan for the sequence $# */ +void +getpacket(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int i; + int count; + char ch; + + do { + /* wait around for the start character, ignore all other characters */ + while ((ch = (getDebugChar() & 0x7f)) != '$') ; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* now, read until a # or end of buffer is found */ + while (count < BUFMAX) { + ch = getDebugChar() & 0x7f; + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(getDebugChar() & 0x7f) << 4; + xmitcsum += hex(getDebugChar() & 0x7f); + if ((remote_debug) && (checksum != xmitcsum)) { + printk + ("bad checksum. My count = 0x%x, sent=0x%x. buf=%s\n", + checksum, xmitcsum, buffer); + } + + if (checksum != xmitcsum) + putDebugChar('-'); /* failed checksum */ + else { + putDebugChar('+'); /* successful transfer */ + /* if a sequence char is present, reply the sequence ID */ + if (buffer[2] == ':') { + putDebugChar(buffer[0]); + putDebugChar(buffer[1]); + /* remove sequence chars from buffer */ + count = strlen(buffer); + for (i = 3; i <= count; i++) + buffer[i - 3] = buffer[i]; + } + } + } + } while (checksum != xmitcsum); + + if (remote_debug) + printk("R:%s\n", buffer); + flushDebugChar(); +} + +/* send the packet in buffer. */ + +void +putpacket(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* $#. */ + + if (!kgdboe) { + do { + if (remote_debug) + printk("T:%s\n", buffer); + putDebugChar('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + putDebugChar(ch); + checksum += ch; + count += 1; + } + + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + flushDebugChar(); + + } while ((getDebugChar() & 0x7f) != '+'); + } else { + /* + * For udp, we can not transfer too much bytes once. 
+ * We only transfer MAX_SEND_COUNT size bytes each time + */ + +#define MAX_SEND_COUNT 30 + + int send_count = 0, i = 0; + char send_buf[MAX_SEND_COUNT]; + + do { + if (remote_debug) + printk("T:%s\n", buffer); + putDebugChar('$'); + checksum = 0; + count = 0; + send_count = 0; + while ((ch = buffer[count])) { + if (send_count >= MAX_SEND_COUNT) { + for(i = 0; i < MAX_SEND_COUNT; i++) { + putDebugChar(send_buf[i]); + } + flushDebugChar(); + send_count = 0; + } else { + send_buf[send_count] = ch; + checksum += ch; + count ++; + send_count++; + } + } + for(i = 0; i < send_count; i++) + putDebugChar(send_buf[i]); + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + flushDebugChar(); + } while ((getDebugChar() & 0x7f) != '+'); + } +} + +static char remcomInBuffer[BUFMAX]; +static char remcomOutBuffer[BUFMAX]; +static char lbuf[BUFMAX]; +static short error; + +void +debug_error(char *format, char *parm) +{ + if (remote_debug) + printk(format, parm); +} + +static void +print_regs(struct pt_regs *regs) +{ + printk("RAX=%016lx RBX=%016lx RCX=%016lx\n", + regs->rax, regs->rbx, regs->rcx); + printk("RDX=%016lx RSI=%016lx RDI=%016lx\n", + regs->rdx, regs->rsi, regs->rdi); + printk("RBP=%016lx PS=%016lx PC=%016lx\n", + regs->rbp, regs->eflags, regs->rip); + printk("R8=%016lx R9=%016lx R10=%016lx\n", + regs->r8, regs->r9, regs->r10); + printk("R11=%016lx R12=%016lx R13=%016lx\n", + regs->r11, regs->r12, regs->r13); + printk("R14=%016lx R15=%016lx RSP=%016lx\n", + regs->r14, regs->r15, regs->rsp); +} + +#define NEW_esp fn_call_lookaside[trap_cpu].rsp + +static void +regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + gdb_regs[_RAX] = regs->rax; + gdb_regs[_RBX] = regs->rbx; + gdb_regs[_RCX] = regs->rcx; + gdb_regs[_RDX] = regs->rdx; + gdb_regs[_RSI] = regs->rsi; + gdb_regs[_RDI] = regs->rdi; + gdb_regs[_RBP] = regs->rbp; + gdb_regs[ _PS] = regs->eflags; + gdb_regs[ _PC] = regs->rip; + gdb_regs[ _R8] = regs->r8; + gdb_regs[ _R9] = regs->r9; + gdb_regs[_R10] = regs->r10; + gdb_regs[_R11] = regs->r11; + gdb_regs[_R12] = regs->r12; + gdb_regs[_R13] = regs->r13; + gdb_regs[_R14] = regs->r14; + gdb_regs[_R15] = regs->r15; + gdb_regs[_RSP] = regs->rsp; + + /* Note, as we are a debugging the kernel, we will always + * trap in kernel code, this means no priviledge change, + * and so the pt_regs structure is not completely valid. In a non + * privilege change trap, only EFLAGS, CS and EIP are put on the stack, + * SS and ESP are not stacked, this means that the last 2 elements of + * pt_regs is not valid (they would normally refer to the user stack) + * also, using regs+1 is no good because you end up will a value that is + * 2 longs (8) too high. This used to cause stepping over functions + * to fail, so my fix is to use the address of regs->esp, which + * should point at the end of the stack frame. Note I have ignored + * completely exceptions that cause an error code to be stacked, such + * as double fault. Stuart Hughes, Zentropix. + * original code: gdb_regs[_ESP] = (int) (regs + 1) ; + + * this is now done on entry and moved to OLD_esp (as well as NEW_esp). 
+ */ +} + +static void +gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + regs->rax = gdb_regs[_RAX] ; + regs->rbx = gdb_regs[_RBX] ; + regs->rcx = gdb_regs[_RCX] ; + regs->rdx = gdb_regs[_RDX] ; + regs->rsi = gdb_regs[_RSI] ; + regs->rdi = gdb_regs[_RDI] ; + regs->rbp = gdb_regs[_RBP] ; + regs->eflags = gdb_regs[ _PS] ; + regs->rip = gdb_regs[ _PC] ; + regs->r8 = gdb_regs[ _R8] ; + regs->r9 = gdb_regs[ _R9] ; + regs->r10 = gdb_regs[ _R10] ; + regs->r11 = gdb_regs[ _R11] ; + regs->r12 = gdb_regs[ _R12] ; + regs->r13 = gdb_regs[ _R13] ; + regs->r14 = gdb_regs[ _R14] ; + regs->r15 = gdb_regs[ _R15] ; + #if 0 /* can't change these */ + regs->rsp = gdb_regs[_RSP] ; + regs->ss = gdb_regs[ _SS] ; + regs->fs = gdb_regs[_FS]; + regs->gs = gdb_regs[_GS]; +#endif +} /* gdb_regs_to_regs */ + +extern void scheduling_functions_start_here(void); +extern void scheduling_functions_end_here(void); +#define first_sched ((unsigned long) scheduling_functions_start_here) +#define last_sched ((unsigned long) scheduling_functions_end_here) + +int thread_list = 0; +extern void thread_return(void); + +void +get_gdb_regs(struct task_struct *p, struct pt_regs *regs, unsigned long *gdb_regs) +{ + unsigned long **rbp, *rsp, *rsp0, pc; + int count = 0; + IF_SMP(int i); + if (!p || p == current) { + regs_to_gdb_regs(gdb_regs, regs); + return; + } +#ifdef CONFIG_SMP + for (i = 0; i < MAX_NO_CPUS; i++) { + if (p == kgdb_info.cpus_waiting[i].task) { + regs_to_gdb_regs(gdb_regs, + kgdb_info.cpus_waiting[i].regs); + gdb_regs[_RSP] = + (unsigned long)&kgdb_info.cpus_waiting[i].regs->rsp; + + return; + } + } +#endif + memset(gdb_regs, 0, NUMREGBYTES); + rsp = (unsigned long *)p->thread.rsp; + rbp = (unsigned long **)rsp[0]; + rsp += 2; + gdb_regs[_PC] = (unsigned long)thread_return; + gdb_regs[_RBP] = (unsigned long)rbp; + gdb_regs[_RSP] = (unsigned long)rsp; + +/* + * This code is to give a more informative notion of where a process + * is waiting. It is used only when the user asks for a thread info + * list. If he then switches to the thread, s/he will find the task + * is in schedule, but a back trace should show the same info we come + * up with. This code was shamelessly purloined from process.c. It was + * then enhanced to provide more registers than simply the program + * counter. + */ + + if (!thread_list) { + return; + } + + if (p->state == TASK_RUNNING) + return; + rsp0 = (unsigned long *)p->thread.rsp0; + if (rsp < (unsigned long *) p->thread_info || rsp > rsp0) + return; + /* include/asm-i386/system.h:switch_to() pushes ebp last. */ + do { + if (*rbp < rsp || *rbp > rsp0) + break; + rbp = (unsigned long **)*rbp; + rsp = (unsigned long *)rbp; + pc = rsp[1]; + + if (pc < first_sched || pc >= last_sched) + break; + gdb_regs[_PC] = (unsigned long)pc; + gdb_regs[_RSP] = (unsigned long)rsp; + gdb_regs[_RBP] = (unsigned long)rbp; + } while (count++ < 16); + return; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* returns nonzero if any memory access fails. */ +int mem2hex( char* mem, char* buf, int count) +{ + int i; + unsigned char ch; + int ret = 0; + + for (i=0;i> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (ret) { + Dearly_printk("mem2hex: fault at accessing %p\n", mem); + } + return(ret); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return nonzero if any memory access fails. 
*/ +int hex2mem( char* buf, char* mem, int count) +{ + int i; + unsigned char ch; + int ret = 0; + + for (i=0;i (OLD_esp - (unsigned int) LOOKASIDE_SIZE))) { + addr = (char *) END_OF_LOOKASIDE - ((char *) OLD_esp - addr); + } + *addr = val; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* return a pointer to the last char put in buf (null) */ +/* If MAY_FAULT is non-zero, then we should set mem_err in response to + a fault; if zero treat a fault like any other fault in the stub. */ +char * +mem2hex(char *mem, char *buf, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + /* printk("%lx = ", mem) ; */ + + ch = get_char(mem++); + + /* printk("%02x\n", ch & 0xFF) ; */ + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault fetching from addr %lx\n", + (long) (mem - 1)); + *buf = 0; /* truncate buffer */ + return (buf); + } + *buf++ = hexchars[ch >> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (may_fault) + mem_err_expected = 0; + return (buf); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return a pointer to the character AFTER the last byte written */ +/* NOTE: We use the may fault flag to also indicate if the write is to + * the registers (0) or "other" memory (!=0) + */ +char * +hex2mem(char *buf, char *mem, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + ch = hex(*buf++) << 4; + ch = ch + hex(*buf++); + set_char(mem++, ch, may_fault); + + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault storing to addr %lx\n", + (long) (mem - 1)); + return (mem); + } + } + if (may_fault) + mem_err_expected = 0; + return (mem); +} +#endif + +/**********************************************/ +/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ +/* RETURN NUMBER OF CHARS PROCESSED */ +/**********************************************/ +int +hexToLong(char **ptr, unsigned long *value) +{ + int numChars = 0; + int hexValue; + + *value = 0; + + while (**ptr) { + hexValue = hex(**ptr); + if (hexValue >= 0) { + *value = (*value << 4) | hexValue; + numChars++; + } else + break; + + (*ptr)++; + } + + return (numChars); +} + +#define stubhex(h) hex(h) +#ifdef old_thread_list + +static int +stub_unpack_int(char *buff, int fieldlength) +{ + int nibble; + int retval = 0; + + while (fieldlength) { + nibble = stubhex(*buff++); + retval |= nibble; + fieldlength--; + if (fieldlength) + retval = retval << 4; + } + return retval; +} +#endif +static char * +pack_hex_byte(char *pkt, int byte) +{ + *pkt++ = hexchars[(byte >> 4) & 0xf]; + *pkt++ = hexchars[(byte & 0xf)]; + return pkt; +} + +#define BUF_THREAD_ID_SIZE 16 + +static char * +pack_threadid(char *pkt, threadref * id) +{ + char *limit; + unsigned char *altid; + + altid = (unsigned char *) id; + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *altid++); + return pkt; +} + +#ifdef old_thread_list +static char * +unpack_byte(char *buf, int *value) +{ + *value = stub_unpack_int(buf, 2); + return buf + 2; +} + +static char * +unpack_threadid(char *inbuf, threadref * id) +{ + char *altref; + char *limit = inbuf + BUF_THREAD_ID_SIZE; + int x, y; + + altref = (char *) id; + + while (inbuf < limit) { + x = stubhex(*inbuf++); + y = stubhex(*inbuf++); + *altref++ = (x << 4) | y; + } + return inbuf; +} +#endif +void 
+int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; + } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} +int +int_to_hex_v(unsigned char * id, int value) +{ + unsigned char *start = id; + int shift; + int ch; + + for (shift = 28; shift >= 0; shift -= 4) { + if ((ch = (value >> shift) & 0xf) || (id != start)) { + *id = hexchars[ch]; + id++; + } + } + if (id == start) + *id++ = '0'; + return id - start; +} +#ifdef old_thread_list + +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + unsigned char *scan; + + scan = (char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} +#endif +static int +cmp_str(char *s1, char *s2, int count) +{ + while (count--) { + if (*s1++ != *s2++) + return 0; + } + return 1; +} + +#if 1 /* this is a hold over from 2.4 where O(1) was "sometimes" */ +extern struct task_struct *kgdb_get_idle(int cpu); +#define idle_task(cpu) kgdb_get_idle(cpu) +#else +#define idle_task(cpu) init_tasks[cpu] +#endif + +extern int kgdb_pid_init_done; + +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + if (pid >= PID_MAX && pid <= (PID_MAX + MAX_NO_CPUS)) { + if (!cpu_online(pid - PID_MAX)) + return NULL; + + return idle_task(pid - PID_MAX); + } else { + /* + * find_task_by_pid is relatively safe all the time + * Other pid functions require lock downs which imply + * that we may be interrupting them (as we get here + * in the middle of most any lock down). + * Still we don't want to call until the table exists! + */ + if (kgdb_pid_init_done){ + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } + } + } + return NULL; +} +/* *INDENT-OFF* */ +struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned long addr; +} breakinfo[4] = { {enabled:0}, + {enabled:0}, + {enabled:0}, + {enabled:0}}; +/* *INDENT-ON* */ +unsigned long hw_breakpoint_status; +void +correct_hw_break(void) +{ + int breakno; + int correctit; + int breakbit; + unsigned long dr7; + + asm volatile ("movq %%db7, %0\n":"=r" (dr7) + :); + /* *INDENT-OFF* */ + do { + unsigned long addr0, addr1, addr2, addr3; + asm volatile ("movq %%db0, %0\n" + "movq %%db1, %1\n" + "movq %%db2, %2\n" + "movq %%db3, %3\n" + :"=r" (addr0), "=r"(addr1), + "=r"(addr2), "=r"(addr3) + :); + } while (0); + /* *INDENT-ON* */ + correctit = 0; + for (breakno = 0; breakno < 3; breakno++) { + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + dr7 |= (((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << 16) << + (breakno << 2); + switch (breakno) { + case 0: + asm volatile ("movq %0, %%dr0\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 1: + asm volatile ("movq %0, %%dr1\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 2: + asm volatile ("movq %0, %%dr2\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 3: + asm volatile ("movq %0, %%dr3\n"::"r" + (breakinfo[breakno].addr)); + break; + } + } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + } + } + if (correctit) { + asm volatile ("movq %0, %%db7\n"::"r" (dr7)); + } +} + +int +remove_hw_break(unsigned breakno) +{ + if (!breakinfo[breakno].enabled) { + 
return -1; + } + breakinfo[breakno].enabled = 0; + return 0; +} + +int +set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr) +{ + if (breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 1; + breakinfo[breakno].type = type; + breakinfo[breakno].len = len; + breakinfo[breakno].addr = addr; + return 0; +} + +#ifdef CONFIG_SMP +static int in_kgdb_console = 0; + +int +in_kgdb(struct pt_regs *regs) +{ + unsigned long flags; + int cpu; + if (!kgdb_enabled) + return 0; + cpu = smp_processor_id(); + in_kgdb_called = 1; + if (!spin_is_locked(&kgdb_spinlock)) { + if (in_kgdb_here_log[cpu] || /* we are holding this cpu */ + in_kgdb_console) { /* or we are doing slow i/o */ + return 1; + } + return 0; + } + + /* As I see it the only reason not to let all cpus spin on + * the same spin_lock is to allow selected ones to proceed. + * This would be a good thing, so we leave it this way. + * Maybe someday.... Done ! + + * in_kgdb() is called from an NMI so we don't pretend + * to have any resources, like printk() for example. + */ + + local_irq_save(flags); /* only local here, to avoid hanging */ + /* + * log arival of this cpu + * The NMI keeps on ticking. Protect against recurring more + * than once, and ignor the cpu that has the kgdb lock + */ + in_kgdb_entry_log[cpu]++; + in_kgdb_here_log[cpu] = regs; + if (cpu == spinlock_cpu || waiting_cpus[cpu].task) + goto exit_in_kgdb; + + /* + * For protection of the initilization of the spin locks by kgdb + * it locks the kgdb spinlock before it gets the wait locks set + * up. We wait here for the wait lock to be taken. If the + * kgdb lock goes away first?? Well, it could be a slow exit + * sequence where the wait lock is removed prior to the kgdb lock + * so if kgdb gets unlocked, we just exit. + */ + + while (spin_is_locked(&kgdb_spinlock) && + !spin_is_locked(waitlocks + cpu)) ; + if (!spin_is_locked(&kgdb_spinlock)) + goto exit_in_kgdb; + + waiting_cpus[cpu].task = current; + waiting_cpus[cpu].pid = (current->pid) ? : (PID_MAX + cpu); + waiting_cpus[cpu].regs = regs; + + spin_unlock_wait(waitlocks + cpu); + + /* + * log departure of this cpu + */ + waiting_cpus[cpu].task = 0; + waiting_cpus[cpu].pid = 0; + waiting_cpus[cpu].regs = 0; + correct_hw_break(); + exit_in_kgdb: + in_kgdb_here_log[cpu] = 0; + local_irq_restore(flags); + return 1; + /* + spin_unlock(continuelocks + smp_processor_id()); + */ +} + +void +smp__in_kgdb(struct pt_regs regs) +{ + ack_APIC_irq(); + in_kgdb(®s); +} +#else +int +in_kgdb(struct pt_regs *regs) +{ + return (kgdb_spinlock); +} +#endif + +void +printexceptioninfo(int exceptionNo, int errorcode, char *buffer) +{ + unsigned long dr6; + int i; + switch (exceptionNo) { + case 1: /* debug exception */ + break; + case 3: /* breakpoint */ + sprintf(buffer, "Software breakpoint"); + return; + default: + sprintf(buffer, "Details not available"); + return; + } + asm volatile ("movq %%db6, %0\n":"=r" (dr6) + :); + if (dr6 & 0x4000) { + sprintf(buffer, "Single step"); + return; + } + for (i = 0; i < 4; ++i) { + if (dr6 & (1 << i)) { + sprintf(buffer, "Hardware breakpoint %d", i); + return; + } + } + sprintf(buffer, "Unknown trap"); + return; +} + +/* + * The ThreadExtraInfo query allows us to pass an arbitrary string + * for display with the "info threads" command. + */ + +void +print_extra_info(task_t *p, char *buf) +{ + if (!p) { + sprintf(buf, "Invalid thread"); + return; + } + sprintf(buf, "0x%p %8d %4d %c %s", + (void *)p, p->parent->pid, + task_cpu(p), + (p->state == 0) ? 
(task_curr(p)?'R':'r') : + (p->state < 0) ? 'U' : + (p->state & TASK_UNINTERRUPTIBLE) ? 'D' : + (p->state & TASK_STOPPED || p->ptrace & PT_PTRACED) ? 'T' : + (p->state & (TASK_ZOMBIE | TASK_DEAD)) ? 'Z' : + (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?', + p->comm); +} + +/* + * This function does all command procesing for interfacing to gdb. + * + * NOTE: The INT nn instruction leaves the state of the interrupt + * enable flag UNCHANGED. That means that when this routine + * is entered via a breakpoint (INT 3) instruction from code + * that has interrupts enabled, then interrupts will STILL BE + * enabled when this routine is entered. The first thing that + * we do here is disable interrupts so as to prevent recursive + * entries and bothersome serial interrupts while we are + * trying to run the serial port in polled mode. + * + * For kernel version 2.1.xx the kgdb_cli() actually gets a spin lock so + * it is always necessary to do a restore_flags before returning + * so as to let go of that lock. + */ +int +kgdb_handle_exception(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs) +{ + struct task_struct *usethread = NULL; + struct task_struct *thread_list_start = 0, *thread = NULL; + struct task_struct *p; + unsigned long addr, length; + unsigned long breakno, breaktype; + char *ptr; + unsigned long newPC; + threadref thref; + unsigned long threadid, tmpid; + int thread_min = PID_MAX + MAX_NO_CPUS; +#ifdef old_thread_list + int maxthreads; +#endif + int nothreads; + unsigned long flags; + unsigned long gdb_regs[NUMREGS]; + unsigned long dr6; + IF_SMP(int entry_state = 0); /* 0, ok, 1, no nmi, 2 sync failed */ +#define NO_NMI 1 +#define NO_SYNC 2 +#define regs (*linux_regs) + /* + * If the entry is not from the kernel then return to the Linux + * trap handler and let it process the interrupt normally. + */ + if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->cs)) { + printk("ignoring non-kernel exception\n"); + print_regs(®s); + return (0); + } + /* + * If we're using eth mode, set the 'mode' in the netdevice. + */ + + if (kgdboe) + netpoll_set_trap(1); + + local_irq_save(flags); + + /* Get kgdb spinlock */ + + KGDB_SPIN_LOCK(&kgdb_spinlock); + rdtscll(kgdb_info.entry_tsc); + /* + * We depend on this spinlock and the NMI watch dog to control the + * other cpus. They will arrive at "in_kgdb()" as a result of the + * NMI and will wait there for the following spin locks to be + * released. + */ +#ifdef CONFIG_SMP + +#if 0 + if (cpu_callout_map & ~MAX_CPU_MASK) { + printk("kgdb : too many cpus, possibly not mapped" + " in contiguous space, change MAX_NO_CPUS" + " in kgdb_stub and make new kernel.\n" + " cpu_callout_map is %lx\n", cpu_callout_map); + goto exit_just_unlock; + } +#endif + if (spinlock_count == 1) { + int time, end_time, dum; + int i; + int cpu_logged_in[MAX_NO_CPUS] = {[0 ... MAX_NO_CPUS - 1] = (0) + }; + if (remote_debug) { + printk("kgdb : cpu %d entry, syncing others\n", + smp_processor_id()); + } + for (i = 0; i < MAX_NO_CPUS; i++) { + /* + * Use trylock as we may already hold the lock if + * we are holding the cpu. Net result is all + * locked. + */ + spin_trylock(&waitlocks[i]); + } + for (i = 0; i < MAX_NO_CPUS; i++) + cpu_logged_in[i] = 0; + /* + * Wait for their arrival. We know the watch dog is active if + * in_kgdb() has ever been called, as it is always called on a + * watchdog tick. + */ + rdtsc(dum, time); + end_time = time + 2; /* Note: we use the High order bits! 
*/ + i = 1; + if (num_online_cpus() > 1) { + int me_in_kgdb = in_kgdb_entry_log[smp_processor_id()]; + smp_send_nmi_allbutself(); + + while (i < num_online_cpus() && time != end_time) { + int j; + for (j = 0; j < MAX_NO_CPUS; j++) { + if (waiting_cpus[j].task && + waiting_cpus[j].task != NOCPU && + !cpu_logged_in[j]) { + i++; + cpu_logged_in[j] = 1; + if (remote_debug) { + printk + ("kgdb : cpu %d arrived at kgdb\n", + j); + } + break; + } else if (!waiting_cpus[j].task && + !cpu_online(j)) { + waiting_cpus[j].task = NOCPU; + cpu_logged_in[j] = 1; + waiting_cpus[j].hold = 1; + break; + } + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + + int wait = 100000; + while (wait--) ; + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + printk + ("kgdb : cpu %d stall" + " in in_kgdb\n", + j); + i++; + cpu_logged_in[j] = 1; + waiting_cpus[j].task = + (struct task_struct + *) 1; + } + } + } + + if (in_kgdb_entry_log[smp_processor_id()] > + (me_in_kgdb + 10)) { + break; + } + + rdtsc(dum, time); + } + if (i < num_online_cpus()) { + printk + ("kgdb : time out, proceeding without sync\n"); +#if 0 + printk("kgdb : Waiting_cpus: 0 = %d, 1 = %d\n", + waiting_cpus[0].task != 0, + waiting_cpus[1].task != 0); + printk("kgdb : Cpu_logged in: 0 = %d, 1 = %d\n", + cpu_logged_in[0], cpu_logged_in[1]); + printk + ("kgdb : in_kgdb_here_log in: 0 = %d, 1 = %d\n", + in_kgdb_here_log[0] != 0, + in_kgdb_here_log[1] != 0); +#endif + entry_state = NO_SYNC; + } else { +#if 0 + int ent = + in_kgdb_entry_log[smp_processor_id()] - + me_in_kgdb; + printk("kgdb : sync after %d entries\n", ent); +#endif + } + } else { + if (remote_debug) { + printk + ("kgdb : %d cpus, but watchdog not active\n" + "proceeding without locking down other cpus\n", + num_online_cpus()); + entry_state = NO_NMI; + } + } + } +#endif + + if (remote_debug) { + unsigned long *lp = (unsigned long *) &linux_regs; + + printk("handle_exception(exceptionVector=%d, " + "signo=%d, err_code=%d, linux_regs=%p)\n", + exceptionVector, signo, err_code, linux_regs); + if (debug_regs) { + print_regs(®s); + printk("Stk: %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[0], lp[1], lp[2], lp[3], + lp[4], lp[5], lp[6], lp[7]); + printk(" %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[8], lp[9], lp[10], lp[11], + lp[12], lp[13], lp[14], lp[15]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[16], lp[17], lp[18], lp[19], + lp[20], lp[21], lp[22], lp[23]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[24], lp[25], lp[26], lp[27], + lp[28], lp[29], lp[30], lp[31]); + } + } + + /* Disable hardware debugging while we are in kgdb */ + /* Get the debug register status register */ +/* *INDENT-OFF* */ + __asm__("movq %0,%%db7" + : /* no output */ + :"r"(0UL)); + + asm volatile ("movq %%db6, %0\n" + :"=r" (hw_breakpoint_status) + :); + +#if 0 +/* *INDENT-ON* */ + switch (exceptionVector) { + case 0: /* divide error */ + case 1: /* debug exception */ + case 2: /* NMI */ + case 3: /* breakpoint */ + case 4: /* overflow */ + case 5: /* bounds check */ + case 6: /* invalid opcode */ + case 7: /* device not available */ + case 8: /* double fault (errcode) */ + case 10: /* invalid TSS (errcode) */ + case 12: /* stack fault (errcode) */ + case 16: /* floating point error */ + case 17: /* alignment check (errcode) */ + default: /* any undocumented */ + break; + case 11: /* segment not present (errcode) */ + case 13: /* general protection (errcode) */ + case 14: /* page fault (special errcode) */ + case 19: /* cache flush denied */ 
+ if (mem_err_expected) { + /* + * This fault occured because of the + * get_char or set_char routines. These + * two routines use either eax of edx to + * indirectly reference the location in + * memory that they are working with. + * For a page fault, when we return the + * instruction will be retried, so we + * have to make sure that these + * registers point to valid memory. + */ + mem_err = 1; /* set mem error flag */ + mem_err_expected = 0; + mem_err_cnt++; /* helps in debugging */ + /* make valid address */ + regs.eax = (long) &garbage_loc; + /* make valid address */ + regs.edx = (long) &garbage_loc; + if (remote_debug) + printk("Return after memory error: " + "mem_err_cnt=%d\n", mem_err_cnt); + if (debug_regs) + print_regs(®s); + goto exit_kgdb; + } + break; + } +#endif + if (remote_debug) + printk("kgdb : entered kgdb on cpu %d\n", smp_processor_id()); + + gdb_i386vector = exceptionVector; + gdb_i386errcode = err_code; + kgdb_info.called_from = __builtin_return_address(0); +#ifdef CONFIG_SMP + /* + * OK, we can now communicate, lets tell gdb about the sync. + * but only if we had a problem. + */ + switch (entry_state) { + case NO_NMI: + to_gdb("NMI not active, other cpus not stopped\n"); + break; + case NO_SYNC: + to_gdb("Some cpus not stopped, see 'kgdb_info' for details\n"); + default:; + } + +#endif +/* + * Set up the gdb function call area. + */ + trap_cpu = smp_processor_id(); + OLD_esp = NEW_esp = (unsigned long) (&linux_regs->rsp); + + IF_SMP(once_again:) + /* reply to host that an exception has occurred */ + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + + putpacket(remcomOutBuffer); + + while (1 == 1) { + error = 0; + remcomOutBuffer[0] = 0; + getpacket(remcomInBuffer); + switch (remcomInBuffer[0]) { + case '?': + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + break; + case 'd': + remote_debug = !(remote_debug); /* toggle debug flag */ + printk("Remote debug %s\n", + remote_debug ? "on" : "off"); + break; + case 'g': /* return the value of the CPU registers */ + get_gdb_regs(usethread, ®s, gdb_regs); + mem2hex((char *) gdb_regs, + remcomOutBuffer, NUMREGBYTES); + break; + case 'G': /* set the value of the CPU registers - return OK */ + hex2mem(&remcomInBuffer[1], + (char *) gdb_regs, NUMREGBYTES); + if (!usethread || usethread == current) { + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "E00"); + } + break; + + case 'P':{ /* set the value of a single CPU register - + return OK */ + /* + * For some reason, gdb wants to talk about psudo + * registers (greater than 15). + */ + unsigned long regno; + + ptr = &remcomInBuffer[1]; + regs_to_gdb_regs(gdb_regs, ®s); + if ((!usethread || usethread == current) && + hexToLong(&ptr, ®no) && + *ptr++ == '=' && (regno >= 0)) { + if (regno >= NUMREGS) + break; + hex2mem(ptr, (char *) &gdb_regs[regno], + 8); + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + break; + } + strcpy(remcomOutBuffer, "E01"); + break; + } + + /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + case 'm': + /* TRY TO READ %x,%x. 
IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToLong(&ptr, &addr) && + (*(ptr++) == ',') && (hexToLong(&ptr, &length))) { + ptr = 0; + /* + * hex doubles the byte count + */ + if (length > (BUFMAX / 2)) + length = BUFMAX / 2; + if (mem2hex((char *) addr, + remcomOutBuffer, length)) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } + } + + if (ptr) { + strcpy(remcomOutBuffer, "E01"); + debug_error + ("malformed read memory command: %s\n", + remcomInBuffer); + } + break; + + /* MAA..AA,LLLL: + Write LLLL bytes at address AA.AA return OK */ + case 'M': + /* TRY TO READ '%x,%x:'. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToLong(&ptr, &addr) && + (*(ptr++) == ',') && + (hexToLong(&ptr, &length)) && (*(ptr++) == ':')) { + if (hex2mem(ptr, (char *) addr, length)) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } else { + strcpy(remcomOutBuffer, "OK"); + } + + ptr = 0; + } + if (ptr) { + strcpy(remcomOutBuffer, "E02"); + debug_error + ("malformed write memory command: %s\n", + remcomInBuffer); + } + break; + case 'S': + remcomInBuffer[0] = 's'; + case 'C': + /* Csig;AA..AA where ;AA..AA is optional + * continue with signal + * Since signals are meaning less to us, delete that + * part and then fall into the 'c' code. + */ + ptr = &remcomInBuffer[1]; + length = 2; + while (*ptr && *ptr != ';') { + length++; + ptr++; + } + if (*ptr) { + do { + ptr++; + *(ptr - length++) = *ptr; + } while (*ptr); + } else { + remcomInBuffer[1] = 0; + } + + /* cAA..AA Continue at address AA..AA(optional) */ + /* sAA..AA Step one instruction from AA..AA(optional) */ + /* D detach, reply OK and then continue */ + case 'c': + case 's': + case 'D': + + /* try to read optional parameter, + pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (hexToLong(&ptr, &addr)) { + if (remote_debug) + printk("Changing EIP to 0x%lx\n", addr); + + regs.rip = addr; + } + + newPC = regs.rip; + + /* clear the trace bit */ + regs.eflags &= 0xfffffeff; + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') + regs.eflags |= 0x100; + + /* detach is a friendly version of continue. Note that + debugging is still enabled (e.g hit control C) + */ + if (remcomInBuffer[0] == 'D') { + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + } + + if (remote_debug) { + printk("Resuming execution\n"); + print_regs(®s); + } + asm volatile ("movq %%db6, %0\n":"=r" (dr6) + :); + if (!(dr6 & 0x4000)) { + for (breakno = 0; breakno < 4; ++breakno) { + if (dr6 & (1 << breakno) && + (breakinfo[breakno].type == 0)) { + /* Set restore flag */ + regs.eflags |= 0x10000; + break; + } + } + } + + if (kgdboe) + netpoll_set_trap(0); + + correct_hw_break(); + asm volatile ("movq %0, %%db6\n"::"r" (0UL)); + goto exit_kgdb; + + /* kill the program */ + case 'k': /* do nothing */ + break; + + /* query */ + case 'q': + nothreads = 0; + switch (remcomInBuffer[1]) { + case 'f': + threadid = 1; + thread_list = 2; + thread_list_start = (usethread ? 
: current); + case 's': + if (!cmp_str(&remcomInBuffer[2], + "ThreadInfo", 10)) + break; + + remcomOutBuffer[nothreads++] = 'm'; + for (; threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + nothreads += int_to_hex_v( + &remcomOutBuffer[ + nothreads], + threadid); + if (thread_min > threadid) + thread_min = threadid; + remcomOutBuffer[ + nothreads] = ','; + nothreads++; + if (nothreads > BUFMAX - 10) + break; + } + } + if (remcomOutBuffer[nothreads - 1] == 'm') { + remcomOutBuffer[nothreads - 1] = 'l'; + } else { + nothreads--; + } + remcomOutBuffer[nothreads] = 0; + break; + +#ifdef old_thread_list /* Old thread info request */ + case 'L': + /* List threads */ + thread_list = 2; + thread_list_start = (usethread ? : current); + unpack_byte(remcomInBuffer + 3, &maxthreads); + unpack_threadid(remcomInBuffer + 5, &thref); + do { + int buf_thread_limit = + (BUFMAX - 22) / BUF_THREAD_ID_SIZE; + if (maxthreads > buf_thread_limit) { + maxthreads = buf_thread_limit; + } + } while (0); + remcomOutBuffer[0] = 'q'; + remcomOutBuffer[1] = 'M'; + remcomOutBuffer[4] = '0'; + pack_threadid(remcomOutBuffer + 5, &thref); + + /* If start flag set start at 0. */ + if (remcomInBuffer[2] == '1') + threadid = 0; + else + threadid = threadref_to_int(&thref); + for (nothreads = 0; + nothreads < maxthreads && + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + int_to_threadref(&thref, + threadid); + pack_threadid(remcomOutBuffer + + 21 + + nothreads * 16, + &thref); + nothreads++; + if (thread_min > threadid) + thread_min = threadid; + } + } + + if (threadid == PID_MAX + MAX_NO_CPUS) { + remcomOutBuffer[4] = '1'; + } + pack_hex_byte(remcomOutBuffer + 2, nothreads); + remcomOutBuffer[21 + nothreads * 16] = '\0'; + break; +#endif + case 'C': + /* Current thread id */ + remcomOutBuffer[0] = 'Q'; + remcomOutBuffer[1] = 'C'; + threadid = current->pid; + if (!threadid) { + /* + * idle thread + */ + for (threadid = PID_MAX; + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + if (current == + idle_task(threadid - + PID_MAX)) + break; + } + } + int_to_threadref(&thref, threadid); + pack_threadid(remcomOutBuffer + 2, &thref); + remcomOutBuffer[18] = '\0'; + break; + + case 'E': + /* Print exception info */ + printexceptioninfo(exceptionVector, + err_code, remcomOutBuffer); + break; + case 'T': + ptr = &remcomInBuffer[0]; + if (strncmp(ptr, "qThreadExtraInfo,", + strlen("qThreadExtraInfo,")) == 0) { + ptr += strlen("qThreadExtraInfo,"); + hexToLong(&ptr, &tmpid); + p = getthread(tmpid); + print_extra_info(p, lbuf); + mem2hex(lbuf, remcomOutBuffer, + strlen(lbuf)); + } + break; +#if 0 + case 'T':{ + char * nptr; + /* Thread extra info */ + if (!cmp_str(&remcomInBuffer[2], + "hreadExtraInfo,", 15)) { + break; + } + ptr = &remcomInBuffer[17]; + hexToLong(&ptr, &threadid); + thread = getthread(threadid); + nptr = &thread->comm[0]; + length = 0; + ptr = &remcomOutBuffer[0]; + do { + length++; + ptr = pack_hex_byte(ptr, *nptr++); + } while (*nptr && length < 16); + /* + * would like that 16 to be the size of + * task_struct.comm but don't know the + * syntax.. 
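+				 * (for the record, one way to spell it is
+				 * sizeof(thread->comm), i.e. TASK_COMM_LEN,
+				 * which is 16 in this tree)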
+ */ + *ptr = 0; + } +#endif + } + break; + + /* task related */ + case 'H': + switch (remcomInBuffer[1]) { + case 'g': + ptr = &remcomInBuffer[2]; + hexToLong(&ptr, &threadid); + thread = getthread(threadid); + if (!thread) { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + break; + } + /* + * Just in case I forget what this is all about, + * the "thread info" command to gdb causes it + * to ask for a thread list. It then switches + * to each thread and asks for the registers. + * For this (and only this) usage, we want to + * fudge the registers of tasks not on the run + * list (i.e. waiting) to show the routine that + * called schedule. Also, gdb, is a minimalist + * in that if the current thread is the last + * it will not re-read the info when done. + * This means that in this case we must show + * the real registers. So here is how we do it: + * Each entry we keep track of the min + * thread in the list (the last that gdb will) + * get info for. We also keep track of the + * starting thread. + * "thread_list" is cleared when switching back + * to the min thread if it is was current, or + * if it was not current, thread_list is set + * to 1. When the switch to current comes, + * if thread_list is 1, clear it, else do + * nothing. + */ + usethread = thread; + if ((thread_list == 1) && + (thread == thread_list_start)) { + thread_list = 0; + } + if (thread_list && (threadid == thread_min)) { + if (thread == thread_list_start) { + thread_list = 0; + } else { + thread_list = 1; + } + } + /* follow through */ + case 'c': + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + break; + } + break; + + /* Query thread status */ + case 'T': + ptr = &remcomInBuffer[1]; + hexToLong(&ptr, &threadid); + thread = getthread(threadid); + if (thread) { + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + if (thread_min > threadid) + thread_min = threadid; + } else { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + } + break; + + case 'Y': /* set up a hardware breakpoint */ + ptr = &remcomInBuffer[1]; + hexToLong(&ptr, &breakno); + ptr++; + hexToLong(&ptr, &breaktype); + ptr++; + hexToLong(&ptr, &length); + ptr++; + hexToLong(&ptr, &addr); + if (set_hw_break(breakno & 0x3, + breaktype & 0x3, + length & 0x3, addr) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + /* Remove hardware breakpoint */ + case 'y': + ptr = &remcomInBuffer[1]; + hexToLong(&ptr, &breakno); + if (remove_hw_break(breakno & 0x3) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + case 'r': /* reboot */ + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + /*to_gdb("Rebooting\n"); */ + /* triplefault no return from here */ + { + static long no_idt[2]; + __asm__ __volatile__("lidt %0"::"m"(no_idt[0])); + BREAKPOINT; + } + + } /* switch */ + + /* reply to the request */ + putpacket(remcomOutBuffer); + } /* while(1==1) */ + /* + * reached by goto only. + */ + exit_kgdb: + /* + * Here is where we set up to trap a gdb function call. NEW_esp + * will be changed if we are trying to do this. We handle both + * adding and subtracting, thus allowing gdb to put grung on + * the stack which it removes later. 
+ */ + if (NEW_esp != OLD_esp) { + unsigned long *ptr = END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) + ptr -= (OLD_esp - NEW_esp) / sizeof (unsigned long); + *--ptr = linux_regs->eflags; + *--ptr = linux_regs->cs; + *--ptr = linux_regs->rip; + *--ptr = linux_regs->rcx; + *--ptr = linux_regs->rbx; + *--ptr = linux_regs->rax; + linux_regs->rcx = NEW_esp - (sizeof (unsigned long) * 6); + linux_regs->rbx = (unsigned long) END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) { + linux_regs->rip = (unsigned long) fn_call_stub; + } else { + linux_regs->rip = (unsigned long) fn_rtn_stub; + linux_regs->rax = NEW_esp; + } + linux_regs->eflags &= ~(IF_BIT | TF_BIT); + } +#ifdef CONFIG_SMP + /* + * Release gdb wait locks + * Sanity check time. Must have at least one cpu to run. Also single + * step must not be done if the current cpu is on hold. + */ + if (spinlock_count == 1) { + int ss_hold = (regs.eflags & 0x100) && kgdb_info.hold_on_sstep; + int cpu_avail = 0; + int i; + + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!cpu_online(i)) + break; + if (!hold_cpu(i)) { + cpu_avail = 1; + } + } + /* + * Early in the bring up there will be NO cpus on line... + */ + if (!cpu_avail && !cpus_empty(cpu_online_map)) { + to_gdb("No cpus unblocked, see 'kgdb_info.hold_cpu'\n"); + goto once_again; + } + if (hold_cpu(smp_processor_id()) && (regs.eflags & 0x100)) { + to_gdb + ("Current cpu must be unblocked to single step\n"); + goto once_again; + } + if (!(ss_hold)) { + int i; + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!hold_cpu(i)) { + spin_unlock(&waitlocks[i]); + } + } + } else { + spin_unlock(&waitlocks[smp_processor_id()]); + } + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + /* + * If this cpu is on hold, this is where we + * do it. Note, the NMI will pull us out of here, + * but will return as the above lock is not held. + * We will stay here till another cpu releases the lock for us. + */ + spin_unlock_wait(waitlocks + smp_processor_id()); + local_irq_restore(flags); + return (1); + } +#if 0 +exit_just_unlock: +#endif +#endif + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + local_irq_restore(flags); + return (1); +} + +#undef regs +static int kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) +{ + struct die_args *d = ptr; + + if (!kgdb_enabled || (cmd == DIE_DEBUG && user_mode(d->regs))) + return NOTIFY_DONE; + if (cmd == DIE_NMI_IPI) { + if (in_kgdb(d->regs)) + return NOTIFY_BAD; + } else if (kgdb_handle_exception(d->trapnr, d->signr, d->err, d->regs)) + return NOTIFY_BAD; /* skip */ + + return NOTIFY_DONE; +} + +static struct notifier_block kgdb_notifier = { + .notifier_call = kgdb_notify, + .priority = 0, +}; + +void set_debug_traps(void) +{ + static int initialized = 0; + + if (!initialized) { + initialized = 1; + notifier_chain_register(&die_chain, &kgdb_notifier); + } +} + +/* + * Provide the command line "gdb" initial break + */ +int __init kgdb_initial_break(char * str) +{ + if (*str == '\0'){ + breakpoint(); + return 1; + } + return 0; +} +__setup("gdb",kgdb_initial_break); + +/* This function will generate a breakpoint exception. It is used at the + beginning of a program to sync up with a debugger and can be used + otherwise as a quick means to stop program execution and "break" into + the debugger. */ +/* But really, just use the BREAKPOINT macro. We will handle the int stuff + */ + +void breakpoint(void) +{ + + set_debug_traps(); + kgdb_enabled = 1; +#if 0 + /* + * These calls were not enough to allow breakpoint to be + * called before trap_init(). 
I moved the argument parsing + * after trap_init() and it seems to work. + */ + set_intr_usr_gate(3,&int3); /* disable ints on trap */ + set_intr_gate(1,&debug); + set_intr_gate(14,&page_fault); +#endif + + BREAKPOINT; +} + +#ifdef later +/* + * possibly we should not go thru the traps.c code at all? Someday. + */ +void +do_kgdb_int3(struct pt_regs *regs, long error_code) +{ + kgdb_handle_exception(3, 5, error_code, regs); + return; +} +#endif +#undef regs +#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS +asmlinkage void +bad_sys_call_exit(int stuff) +{ + struct pt_regs *regs = (struct pt_regs *) &stuff; + printk("Sys call %d return with %x preempt_count\n", + (int) regs->orig_eax, preempt_count()); +} +#endif +#ifdef CONFIG_STACK_OVERFLOW_TEST +#include +asmlinkage void +stack_overflow(void) +{ +#ifdef BREAKPOINT + BREAKPOINT; +#else + printk("Kernel stack overflow, looping forever\n"); +#endif + while (1) { + } +} +#endif + +#if defined(CONFIG_SMP) || defined(CONFIG_KGDB_CONSOLE) +char gdbconbuf[BUFMAX]; + +static void +kgdb_gdb_message(const char *s, unsigned count) +{ + int i; + int wcount; + char *bufptr; + /* + * This takes care of NMI while spining out chars to gdb + */ + IF_SMP(in_kgdb_console = 1); + gdbconbuf[0] = 'O'; + bufptr = gdbconbuf + 1; + while (count > 0) { + if ((count << 1) > (BUFMAX - 2)) { + wcount = (BUFMAX - 2) >> 1; + } else { + wcount = count; + } + count -= wcount; + for (i = 0; i < wcount; i++) { + bufptr = pack_hex_byte(bufptr, s[i]); + } + *bufptr = '\0'; + s += wcount; + + putpacket(gdbconbuf); + + } + IF_SMP(in_kgdb_console = 0); +} +#endif +#ifdef CONFIG_SMP +static void +to_gdb(const char *s) +{ + int count = 0; + while (s[count] && (count++ < BUFMAX)) ; + kgdb_gdb_message(s, count); +} +#endif +#ifdef CONFIG_KGDB_CONSOLE +#include +#include +#include +#include + +void +kgdb_console_write(struct console *co, const char *s, unsigned count) +{ + + if (gdb_i386vector == -1) { + /* + * We have not yet talked to gdb. What to do... + * lets break, on continue we can do the write. + * But first tell him whats up. Uh, well no can do, + * as this IS the console. Oh well... + * We do need to wait or the messages will be lost. + * Other option would be to tell the above code to + * ignore this breakpoint and do an auto return, + * but that might confuse gdb. Also this happens + * early enough in boot up that we don't have the traps + * set up yet, so... + */ + breakpoint(); + } + kgdb_gdb_message(s, count); +} + +/* + * ------------------------------------------------------------ + * Serial KGDB driver + * ------------------------------------------------------------ + */ + +static struct console kgdbcons = { + name:"kgdb", + write:kgdb_console_write, +#ifdef CONFIG_KGDB_USER_CONSOLE + device:kgdb_console_device, +#endif + flags:CON_PRINTBUFFER | CON_ENABLED, + index:-1, +}; + +/* + * The trick here is that this file gets linked before printk.o + * That means we get to peer at the console info in the command + * line before it does. If we are up, we register, otherwise, + * do nothing. By returning 0, we allow printk to look also. 
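+ * (So a command line like "console=kgdb console=ttyS1" -- an
+ * illustrative example, not a requirement -- gets the stub registered
+ * here while printk still parses the remaining console= options
+ * itself.)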
+ */ +static int kgdb_console_enabled; + +int __init +kgdb_console_init(char *str) +{ + if ((strncmp(str, "kgdb", 4) == 0) || (strncmp(str, "gdb", 3) == 0)) { + register_console(&kgdbcons); + kgdb_console_enabled = 1; + } + return 0; /* let others look at the string */ +} + +__setup("console=", kgdb_console_init); + +#ifdef CONFIG_KGDB_USER_CONSOLE +static kdev_t kgdb_console_device(struct console *c); +/* This stuff sort of works, but it knocks out telnet devices, so + * we are leaving it here in case we (or you) find time to figure it + * out better. + */ + +/* + * We need a real char device as well for when the console is opened for user + * space activities. + */ + +static int +kgdb_consdev_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static ssize_t +kgdb_consdev_write(struct file *file, const char *buf, + size_t count, loff_t * ppos) +{ + int size, ret = 0; + static char kbuf[128]; + static DECLARE_MUTEX(sem); + + /* We are not reentrant... */ + if (down_interruptible(&sem)) + return -ERESTARTSYS; + + while (count > 0) { + /* need to copy the data from user space */ + size = count; + if (size > sizeof (kbuf)) + size = sizeof (kbuf); + if (copy_from_user(kbuf, buf, size)) { + ret = -EFAULT; + break; + } + kgdb_console_write(&kgdbcons, kbuf, size); + count -= size; + ret += size; + buf += size; + } + + up(&sem); + + return ret; +} + +struct file_operations kgdb_consdev_fops = { + open:kgdb_consdev_open, + write:kgdb_consdev_write +}; +static kdev_t +kgdb_console_device(struct console *c) +{ + return MKDEV(TTYAUX_MAJOR, 1); +} + +/* + * This routine gets called from the serial stub in i386/lib. + * This is so it is done late in bring up (just before the console open). + */ +void +kgdb_console_finit(void) +{ + if (kgdb_console_enabled) { + char *cptr = cdevname(MKDEV(TTYAUX_MAJOR, 1)); + char *cp = cptr; + while (*cptr && *cptr != '(') + cptr++; + *cptr = 0; + unregister_chrdev(TTYAUX_MAJOR, cp); + register_chrdev(TTYAUX_MAJOR, "kgdb", &kgdb_consdev_fops); + } +} +#endif +#endif +#ifdef CONFIG_KGDB_TS +#include /* time stamp code */ +#include /* in_interrupt */ +#ifdef CONFIG_KGDB_TS_64 +#define DATA_POINTS 64 +#endif +#ifdef CONFIG_KGDB_TS_128 +#define DATA_POINTS 128 +#endif +#ifdef CONFIG_KGDB_TS_256 +#define DATA_POINTS 256 +#endif +#ifdef CONFIG_KGDB_TS_512 +#define DATA_POINTS 512 +#endif +#ifdef CONFIG_KGDB_TS_1024 +#define DATA_POINTS 1024 +#endif +#ifndef DATA_POINTS +#define DATA_POINTS 128 /* must be a power of two */ +#endif +#define INDEX_MASK (DATA_POINTS - 1) +#if (INDEX_MASK & DATA_POINTS) +#error "CONFIG_KGDB_TS_COUNT must be a power of 2" +#endif +struct kgdb_and_then_struct { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + int data0; + int data1; +}; +struct kgdb_and_then_struct2 { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + struct task_struct *t1; + struct task_struct *t2; +}; +struct kgdb_and_then_struct kgdb_data[DATA_POINTS]; + +struct kgdb_and_then_struct *kgdb_and_then = &kgdb_data[0]; +int kgdb_and_then_count; + +void +kgdb_tstamp(int line, char *source, int data0, int data1) +{ + static spinlock_t ts_spin = SPIN_LOCK_UNLOCKED; + unsigned long flags; + local_irq_save(flags); + spin_lock(&ts_spin); + rdtscll(kgdb_and_then->at_time); +#ifdef CONFIG_SMP + kgdb_and_then->on_cpu = smp_processor_id(); +#endif + kgdb_and_then->task = current; + kgdb_and_then->from_ln = line; + kgdb_and_then->in_src = source; + kgdb_and_then->from = __builtin_return_address(0); + kgdb_and_then->with_shpf = (int *) (((flags & IF_BIT) >> 9) | + (preempt_count() << 8)); + kgdb_and_then->data0 = data0; + kgdb_and_then->data1 = data1; + kgdb_and_then = &kgdb_data[++kgdb_and_then_count & INDEX_MASK]; + spin_unlock(&ts_spin); + local_irq_restore(flags); +#ifdef CONFIG_PREEMPT + +#endif + return; +} +#endif
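+/* + * A minimal usage sketch, assuming the kgdb_ts() wrapper in asm/kgdb.h + * expands to kgdb_tstamp(__LINE__, __FILE__, data0, data1); both + * arguments below are placeholders: + * + * kgdb_ts((int) state, (int) retries); + * + * The recorded ring can then be examined from a gdb breakpoint with + * the macros in Documentation/i386/kgdb/andthen. + */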
typedef int gdb_debug_hook(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs); +gdb_debug_hook *linux_debug_hook = &kgdb_handle_exception; /* historical reasons... */ + +static int kgdb_need_breakpoint[NR_CPUS]; + +void kgdb_schedule_breakpoint(void) +{ + kgdb_need_breakpoint[smp_processor_id()] = 1; +} + +void kgdb_process_breakpoint(void) +{ + /* + * Handle a breakpoint queued from inside network driver code + * to avoid reentrancy issues + */ + if (kgdb_need_breakpoint[smp_processor_id()]) { + kgdb_need_breakpoint[smp_processor_id()] = 0; + kgdb_enabled = 1; + BREAKPOINT; + } +} + diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/x86_64/kernel/Makefile xx/arch/x86_64/kernel/Makefile --- 2.6.5-rc2/arch/x86_64/kernel/Makefile 2004-03-21 15:09:24.000000000 +0100 +++ xx/arch/x86_64/kernel/Makefile 2004-03-21 15:21:40.000000000 +0100 @@ -26,6 +26,7 @@ obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_KGDB) += kgdb_stub.o obj-y += topology.o diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/x86_64/kernel/smp.c xx/arch/x86_64/kernel/smp.c --- 2.6.5-rc2/arch/x86_64/kernel/smp.c 2003-11-01 18:06:33.000000000 +0100 +++ xx/arch/x86_64/kernel/smp.c 2004-03-21 15:21:40.000000000 +0100 @@ -362,6 +362,18 @@ void smp_send_reschedule(int cpu) send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } +#ifdef CONFIG_KGDB +/* + * By using the NMI code instead of a vector we just sneak thru the + * word generator, coming out with just what we want, and it does + * not matter if clustered_apic_mode is set or not. + */ +void smp_send_nmi_allbutself(void) +{ + send_IPI_allbutself(APIC_DM_NMI); +} +#endif + /* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/x86_64/kernel/traps.c xx/arch/x86_64/kernel/traps.c --- 2.6.5-rc2/arch/x86_64/kernel/traps.c 2004-03-21 15:09:24.000000000 +0100 +++ xx/arch/x86_64/kernel/traps.c 2004-03-21 15:21:40.000000000 +0100 @@ -45,6 +45,9 @@ #include #include +#ifdef CONFIG_KGDB +#include +#endif extern struct gate_struct idt_table[256]; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/x86_64/lib/kgdb_serial.c xx/arch/x86_64/lib/kgdb_serial.c --- 2.6.5-rc2/arch/x86_64/lib/kgdb_serial.c 1970-01-01 01:00:00.000000000 +0100 +++ xx/arch/x86_64/lib/kgdb_serial.c 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,490 @@ +/* + * Serial interface GDB stub + * + * Written (hacked together) by David Grothe (dave@gcom.com) + * Modified to allow invocation early in boot; see also + * kgdb.h for instructions, by George Anzinger (george@mvista.com) + * Modified to handle debugging over ethernet by Robert Walsh + * and wangdi , based on + * code by San Mehat.
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_KGDB_USER_CONSOLE +extern void kgdb_console_finit(void); +#endif +#define PRNT_off +#define TEST_EXISTANCE +#ifdef PRNT +#define dbprintk(s) printk s +#else +#define dbprintk(s) +#endif +#define TEST_INTERRUPT_off +#ifdef TEST_INTERRUPT +#define intprintk(s) printk s +#else +#define intprintk(s) +#endif + +#define IRQ_T(info) ((info->flags & ASYNC_SHARE_IRQ) ? SA_SHIRQ : SA_INTERRUPT) + +#define GDB_BUF_SIZE 512 /* power of 2, please */ + +static char gdb_buf[GDB_BUF_SIZE]; +static int gdb_buf_in_inx; +static atomic_t gdb_buf_in_cnt; +static int gdb_buf_out_inx; + +struct async_struct *gdb_async_info; +static int gdb_async_irq; + +#define outb_px(a,b) outb_p(b,a) + +static void program_uart(struct async_struct *info); +static void write_char(struct async_struct *info, int chr); +/* + * Get a byte from the hardware data buffer and return it + */ +static int +read_data_bfr(struct async_struct *info) +{ + char it = inb_p(info->port + UART_LSR); + + if (it & UART_LSR_DR) + return (inb_p(info->port + UART_RX)); + /* + * If we have a framing error assume somebody messed with + * our uart. Reprogram it and send '-' both ways... + */ + if (it & 0xc) { + program_uart(info); + write_char(info, '-'); + return ('-'); + } + return (-1); + +} /* read_data_bfr */ + +/* + * Get a char if available, return -1 if nothing available. + * Empty the receive buffer first, then look at the interface hardware. + + * Locking here is a bit of a problem. We MUST not lock out communication + * if we are trying to talk to gdb about a kgdb entry. ON the other hand + * we can loose chars in the console pass thru if we don't lock. It is also + * possible that we could hold the lock or be waiting for it when kgdb + * NEEDS to talk. Since kgdb locks down the world, it does not need locks. + * We do, of course have possible issues with interrupting a uart operation, + * but we will just depend on the uart status to help keep that straight. + + */ +static spinlock_t uart_interrupt_lock = SPIN_LOCK_UNLOCKED; +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +#endif + +static int +read_char(struct async_struct *info) +{ + int chr; + unsigned long flags; + local_irq_save(flags); +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_lock(&uart_interrupt_lock); + } +#endif + if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ + chr = gdb_buf[gdb_buf_out_inx++]; + gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); + atomic_dec(&gdb_buf_in_cnt); + } else { + chr = read_data_bfr(info); + } +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_unlock(&uart_interrupt_lock); + } +#endif + local_irq_restore(flags); + return (chr); +} + +/* + * Wait until the interface can accept a char, then write it. + */ +static void +write_char(struct async_struct *info, int chr) +{ + while (!(inb_p(info->port + UART_LSR) & UART_LSR_THRE)) ; + + outb_p(chr, info->port + UART_TX); + +} /* write_char */ + +/* + * Mostly we don't need a spinlock, but since the console goes + * thru here with interrutps on, well, we need to catch those + * chars. + */ +/* + * This is the receiver interrupt routine for the GDB stub. + * It will receive a limited number of characters of input + * from the gdb host machine and save them up in a buffer. 
+ * + * When the gdb stub routine tty_getDebugChar() is called it + * draws characters out of the buffer until it is empty and + * then reads directly from the serial port. + * + * We do not attempt to write chars from the interrupt routine + * since the stubs do all of that via tty_putDebugChar() which + * writes one byte after waiting for the interface to become + * ready. + * + * The debug stubs like to run with interrupts disabled since, + * after all, they run as a consequence of a breakpoint in + * the kernel. + * + * Perhaps someone who knows more about the tty driver than I + * care to learn can make this work for any low level serial + * driver. + */ +static irqreturn_t +gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct async_struct *info; + unsigned long flags; + + info = gdb_async_info; + if (!info || !info->tty || irq != gdb_async_irq) + return IRQ_NONE; + + local_irq_save(flags); + spin_lock(&uart_interrupt_lock); + do { + int chr = read_data_bfr(info); + intprintk(("Debug char on int: %x hex\n", chr)); + if (chr < 0) + continue; + + if (chr == 3) { /* Ctrl-C means remote interrupt */ + BREAKPOINT; + continue; + } + + if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { + /* buffer overflow tosses early char */ + read_char(info); + } + gdb_buf[gdb_buf_in_inx++] = chr; + gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); + } while (inb_p(info->port + UART_IIR) & UART_IIR_RDI); + spin_unlock(&uart_interrupt_lock); + local_irq_restore(flags); + return IRQ_HANDLED; +} /* gdb_interrupt */ + +/* + * Just a NULL routine for testing. + */ +void +gdb_null(void) +{ +} /* gdb_null */ + +/* These structure are filled in with values defined in asm/kgdb_local.h + */ +static struct serial_state state = SB_STATE; +static struct async_struct local_info = SB_INFO; +static int ok_to_enable_ints = 0; +static void kgdb_enable_ints_now(void); + +extern char *kgdb_version; +/* + * Hook an IRQ for KGDB. + * + * This routine is called from tty_putDebugChar, below. + */ +static int ints_disabled = 1; +int +gdb_hook_interrupt(struct async_struct *info, int verb) +{ + struct serial_state *state = info->state; + unsigned long flags; + int port; +#ifdef TEST_EXISTANCE + int scratch, scratch2; +#endif + + /* The above fails if memory managment is not set up yet. + * Rather than fail the set up, just keep track of the fact + * and pick up the interrupt thing later. + */ + gdb_async_info = info; + port = gdb_async_info->port; + gdb_async_irq = state->irq; + if (verb) { + printk("kgdb %s : port =%x, IRQ=%d, divisor =%d\n", + kgdb_version, + port, + gdb_async_irq, gdb_async_info->state->custom_divisor); + } + local_irq_save(flags); +#ifdef TEST_EXISTANCE + /* Existance test */ + /* Should not need all this, but just in case.... */ + + scratch = inb_p(port + UART_IER); + outb_px(port + UART_IER, 0); + outb_px(0xff, 0x080); + scratch2 = inb_p(port + UART_IER); + outb_px(port + UART_IER, scratch); + if (scratch2) { + printk + ("gdb_hook_interrupt: Could not clear IER, not a UART!\n"); + local_irq_restore(flags); + return 1; /* We failed; there's nothing here */ + } + scratch2 = inb_p(port + UART_LCR); + outb_px(port + UART_LCR, 0xBF); /* set up for StarTech test */ + outb_px(port + UART_EFR, 0); /* EFR is the same as FCR */ + outb_px(port + UART_LCR, 0); + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO); + scratch = inb_p(port + UART_IIR) >> 6; + if (scratch == 1) { + printk("gdb_hook_interrupt: Undefined UART type!" + " Not a UART! 
\n"); + local_irq_restore(flags); + return 1; + } else { + dbprintk(("gdb_hook_interrupt: UART type " + "is %d where 0=16450, 2=16550 3=16550A\n", scratch)); + } + scratch = inb_p(port + UART_MCR); + outb_px(port + UART_MCR, UART_MCR_LOOP | scratch); + outb_px(port + UART_MCR, UART_MCR_LOOP | 0x0A); + scratch2 = inb_p(port + UART_MSR) & 0xF0; + outb_px(port + UART_MCR, scratch); + if (scratch2 != 0x90) { + printk("gdb_hook_interrupt: " + "Loop back test failed! Not a UART!\n"); + local_irq_restore(flags); + return scratch2 + 1000; /* force 0 to fail */ + } +#endif /* test existance */ + program_uart(info); + local_irq_restore(flags); + + return (0); + +} /* gdb_hook_interrupt */ + +static void +program_uart(struct async_struct *info) +{ + int port = info->port; + + (void) inb_p(port + UART_RX); + outb_px(port + UART_IER, 0); + + (void) inb_p(port + UART_RX); /* serial driver comments say */ + (void) inb_p(port + UART_IIR); /* this clears the interrupt regs */ + (void) inb_p(port + UART_MSR); + outb_px(port + UART_LCR, UART_LCR_WLEN8 | UART_LCR_DLAB); + outb_px(port + UART_DLL, info->state->custom_divisor & 0xff); /* LS */ + outb_px(port + UART_DLM, info->state->custom_divisor >> 8); /* MS */ + outb_px(port + UART_MCR, info->MCR); + + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1 | UART_FCR_CLEAR_XMIT | UART_FCR_CLEAR_RCVR); /* set fcr */ + outb_px(port + UART_LCR, UART_LCR_WLEN8); /* reset DLAB */ + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1); /* set fcr */ + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + return; +} + +/* + * tty_getDebugChar + * + * This is a GDB stub routine. It waits for a character from the + * serial interface and then returns it. If there is no serial + * interface connection then it returns a bogus value which will + * almost certainly cause the system to hang. In the + */ +int kgdb_in_isr = 0; +int kgdb_in_lsr = 0; +extern spinlock_t kgdb_spinlock; + +/* Caller takes needed protections */ + +int +tty_getDebugChar(void) +{ + volatile int chr, dum, time, end_time; + + dbprintk(("tty_getDebugChar(port %x): ", gdb_async_info->port)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + /* + * This trick says if we wait a very long time and get + * no char, return the -1 and let the upper level deal + * with it. + */ + rdtsc(dum, time); + end_time = time + 2; + while (((chr = read_char(gdb_async_info)) == -1) && + (end_time - time) > 0) { + rdtsc(dum, time); + }; + /* + * This covers our butts if some other code messes with + * our uart, hay, it happens :o) + */ + if (chr == -1) + program_uart(gdb_async_info); + + dbprintk(("%c\n", chr > ' ' && chr < 0x7F ? 
static int count = 3; +static spinlock_t one_at_atime = SPIN_LOCK_UNLOCKED; + +static int __init +kgdb_enable_ints(void) +{ + set_debug_traps(); + if (kgdboe) { + return 0; + } + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 1); + } + ok_to_enable_ints = 1; + kgdb_enable_ints_now(); +#ifdef CONFIG_KGDB_USER_CONSOLE + kgdb_console_finit(); +#endif + return 0; +} + +#ifdef CONFIG_SERIAL_8250 +void shutdown_for_kgdb(struct async_struct *gdb_async_info); +#endif + +#define kgdb_mem_init_done() (1) + +static void +kgdb_enable_ints_now(void) +{ + if (!spin_trylock(&one_at_atime)) + return; + if (!ints_disabled) + goto exit; + if (kgdb_mem_init_done() && + ints_disabled) { /* don't try till mem init */ +#ifdef CONFIG_SERIAL_8250 + /* + * The ifdef here allows the system to be configured + * without the serial driver. + * Don't make it a module, however; it will steal the port + */ + shutdown_for_kgdb(gdb_async_info); +#endif + ints_disabled = request_irq(gdb_async_info->state->irq, + gdb_interrupt, + IRQ_T(gdb_async_info), + "KGDB-stub", NULL); + intprintk(("KGDB: request_irq returned %d\n", ints_disabled)); + } + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + exit: + spin_unlock(&one_at_atime); +} + +/* + * tty_putDebugChar + * + * This is a GDB stub routine. It waits until the interface is ready + * to transmit a char and then sends it. If there is no serial + * interface connection then it simply returns to its caller, having + * pretended to send the char. Caller takes needed protections. + */ +void +tty_putDebugChar(int chr) +{ + dbprintk(("tty_putDebugChar(port %x): chr=%02x '%c', ints_on=%d\n", + gdb_async_info->port, + chr, + chr > ' ' && chr < 0x7F ? chr : ' ', ints_disabled ? 0 : 1)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + + write_char(gdb_async_info, chr); /* this routine will wait */ + count = (chr == '#') ? 0 : count + 1; + if (count == 2) { /* a '#' plus two checksum chars end a packet; + * after one has gone out, try to enable ints */ + if (ints_disabled && ok_to_enable_ints) + kgdb_enable_ints_now(); + + /* We do this a lot because, well, we really want to get these + * interrupts. The serial driver will clear these bits when it + * initializes the chip. Everything else it does is ok, + * but this. + */ + if (!ints_disabled) { + outb_px(gdb_async_info->port + UART_IER, + gdb_async_info->IER); + } + } + +} /* tty_putDebugChar */ + +/* + * This does nothing for the serial port, since it doesn't buffer.
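+ * A hypothetical buffered transport (the ethernet variant mentioned in + * the header comment, for instance) would drain its queue here; this + * serial stub writes each byte out synchronously in tty_putDebugChar, + * so there is never anything left to flush.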
+ */ + +void tty_flushDebugChar(void) +{ +} + +module_init(kgdb_enable_ints); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/arch/x86_64/lib/Makefile xx/arch/x86_64/lib/Makefile --- 2.6.5-rc2/arch/x86_64/lib/Makefile 2004-03-21 15:09:24.000000000 +0100 +++ xx/arch/x86_64/lib/Makefile 2004-03-21 15:21:40.000000000 +0100 @@ -10,3 +10,4 @@ lib-y := csum-partial.o csum-copy.o csum lib-y += memcpy.o memmove.o memset.o copy_user.o lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o +lib-$(CONFIG_KGDB) += kgdb_serial.o diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/Documentation/i386/kgdb/andthen xx/Documentation/i386/kgdb/andthen --- 2.6.5-rc2/Documentation/i386/kgdb/andthen 1970-01-01 01:00:00.000000000 +0100 +++ xx/Documentation/i386/kgdb/andthen 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,100 @@ + +define set_andthen + set var $thp=0 + set var $thp=(struct kgdb_and_then_struct *)&kgdb_data[0] + set var $at_size = (sizeof kgdb_data)/(sizeof *$thp) + set var $at_oc=kgdb_and_then_count + set var $at_cc=$at_oc +end + +define andthen_next + set var $at_cc=$arg0 +end + +define andthen + andthen_set_edge + if ($at_cc >= $at_oc) + printf "Outside window. Window size is %d\n",($at_oc-$at_low) + else + printf "%d: ",$at_cc + output *($thp+($at_cc++ % $at_size )) + printf "\n" + end +end +define andthen_set_edge + set var $at_oc=kgdb_and_then_count + set var $at_low = $at_oc - $at_size + if ($at_low < 0 ) + set var $at_low = 0 + end + if (( $at_cc > $at_oc) || ($at_cc < $at_low)) + printf "Count outside of window, setting count to " + if ($at_cc >= $at_oc) + set var $at_cc = $at_oc + else + set var $at_cc = $at_low + end + printf "%d\n",$at_cc + end +end + +define beforethat + andthen_set_edge + if ($at_cc <= $at_low) + printf "Outside window. Window size is %d\n",($at_oc-$at_low) + else + printf "%d: ",$at_cc-1 + output *($thp+(--$at_cc % $at_size )) + printf "\n" + end +end + +document andthen_next + andthen_next + . sets the number of the event to display next. If this event + . is not in the event pool, either andthen or beforethat will + . correct it to the nearest event pool edge. The event pool + . ends at the last event recorded and begins + . prior to that. If beforethat is used next, it will display + . event -1. +. + andthen commands are: set_andthen, andthen_next, andthen and beforethat +end + + +document andthen + andthen +. displays the next event in the list. sets up to display +. the oldest saved event first. +. (optional) count of the event to display. +. note the number of events saved is specified at configure time. +. if events are saved between calls to andthen the index will change +. but the displayed event will be the next one (unless the event buffer +. is overrun). +. +. andthen commands are: set_andthen, andthen_next, andthen and beforethat +end + +document set_andthen + set_andthen +. sets up to use the and commands. +. if you have defined your own struct, use the above and +. then enter the following: +. p $thp=(struct kgdb_and_then_structX *)&kgdb_data[0] +. where is the name of your structure. +. +. andthen commands are: set_andthen, andthen_next, andthen and beforethat +end + +document beforethat + beforethat +. displays the next prior event in the list. sets up to +. display the last occuring event first. +. +. note the number of events saved is specified at configure time. +. if events are saved between calls to beforethat the index will change +. 
but the displayed event will be the next one (unless the event buffer +. is overrun). +. +. andthen commands are: set_andthen, andthen_next, andthen and beforethat +end diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/Documentation/i386/kgdb/debug-nmi.txt xx/Documentation/i386/kgdb/debug-nmi.txt --- 2.6.5-rc2/Documentation/i386/kgdb/debug-nmi.txt 1970-01-01 01:00:00.000000000 +0100 +++ xx/Documentation/i386/kgdb/debug-nmi.txt 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,37 @@ +Subject: Debugging with NMI +Date: Mon, 12 Jul 1999 11:28:31 -0500 +From: David Grothe +Organization: Gcom, Inc +To: David Grothe + +Kernel hackers: + +Maybe this is old hat, but it is new to me -- + +On an ISA bus machine, if you short out the A1 and B1 pins of an ISA +slot you will generate an NMI to the CPU. This interrupts even a +machine that is hung in a loop with interrupts disabled. Used in +conjunction with kgdb < +ftp://ftp.gcom.com/pub/linux/src/kgdb-2.3.35/kgdb-2.3.35.tgz > you can +gain debugger control of a machine that is hung in the kernel! Even +without kgdb the kernel will print a stack trace so you can find out +where it was hung. + +The A1/B1 pins are directly opposite one another and the farthest pins +towards the bracket end of the ISA bus socket. You can stick a paper +clip or multi-meter probe between them to short them out. + +I had a spare ISA bus to PC104 bus adapter around. The PC104 end of the +board consists of two rows of wire wrap pins. So I wired a push button +between the A1/B1 pins and now have an ISA board that I can stick into +any ISA bus slot for debugger entry. + +Microsoft has a circuit diagram of a PCI card at +http://www.microsoft.com/hwdev/DEBUGGING/DMPSW.HTM. If you want to +build one you will have to mail them and ask for the PAL equations. +Nobody makes one comercially. + +[THIS TIP COMES WITH NO WARRANTY WHATSOEVER. It works for me, but if +your machine catches fire, it is your problem, not mine.] + +-- Dave (the kgdb guy) diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/Documentation/i386/kgdb/gdb-globals.txt xx/Documentation/i386/kgdb/gdb-globals.txt --- 2.6.5-rc2/Documentation/i386/kgdb/gdb-globals.txt 1970-01-01 01:00:00.000000000 +0100 +++ xx/Documentation/i386/kgdb/gdb-globals.txt 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,71 @@ +Sender: akale@veritas.com +Date: Fri, 23 Jun 2000 19:26:35 +0530 +From: "Amit S. Kale" +Organization: Veritas Software (India) +To: Dave Grothe , linux-kernel@vger.rutgers.edu +CC: David Milburn , + "Edouard G. Parmelan" , + ezannoni@cygnus.com, Keith Owens +Subject: Re: Module debugging using kgdb + +Dave Grothe wrote: +> +> Amit: +> +> There is a 2.4.0 version of kgdb on our ftp site: +> ftp://ftp.gcom.com/pub/linux/src/kgdb. I mirrored your version of gdb +> and loadmodule.sh there. +> +> Have a look at the README file and see if I go it right. If not, send +> me some corrections and I will update it. +> +> Does your version of gdb solve the global variable problem? + +Yes. +Thanks to Elena Zanoni, gdb (developement version) can now calculate +correctly addresses of dynamically loaded object files. I have not been +following gdb developement for sometime and am not sure when symbol +address calculation fix is going to appear in a gdb stable version. + +Elena, any idea when the fix will make it to a prebuilt gdb from a +redhat release? + +For the time being I have built a gdb developement version. 
It can be +used for module debugging with loadmodule.sh script. + +The problem with calculating of module addresses with previous versions +of gdb was as follows: +gdb did not use base address of a section while calculating address of +a symbol in the section in an object file loaded via 'add-symbol-file'. +It used address of .text segment instead. Due to this addresses of +symbols in .data, .bss etc. (e.g. global variables) were calculated incorrectly. + +Above mentioned fix allow gdb to use base address of a segment while +calculating address of a symbol in it. It adds a parameter '-s' to +'add-symbol-file' command for specifying base address of a segment. + +loadmodule.sh script works as follows. + +1. Copy a module file to target machine. +2. Load the module on the target machine using insmod with -m parameter. +insmod produces a module load map which contains base addresses of all +sections in the module and addresses of symbols in the module file. +3. Find all sections and their base addresses in the module from +the module map. +4. Generate a script that loads the module file. The script uses +'add-symbol-file' and specifies address of text segment followed by +addresses of all segments in the module. + +Here is an example gdb script produced by loadmodule.sh script. + +add-symbol-file foo 0xd082c060 -s .text.lock 0xd08cbfb5 +-s .fixup 0xd08cfbdf -s .rodata 0xd08cfde0 -s __ex_table 0xd08e3b38 +-s .data 0xd08e3d00 -s .bss 0xd08ec8c0 -s __ksymtab 0xd08ee838 + +With this command gdb can calculate addresses of symbols in ANY segment +in a module file. + +Regards. +-- +Amit Kale +Veritas Software ( http://www.veritas.com ) diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/Documentation/i386/kgdb/gdbinit xx/Documentation/i386/kgdb/gdbinit --- 2.6.5-rc2/Documentation/i386/kgdb/gdbinit 1970-01-01 01:00:00.000000000 +0100 +++ xx/Documentation/i386/kgdb/gdbinit 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,14 @@ +shell echo -e "\003" >/dev/ttyS0 +set remotebaud 38400 +target remote /dev/ttyS0 +define si +stepi +printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx +printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp +x/i $eip +end +define ni +nexti +printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx +printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp +x/i $eip diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/Documentation/i386/kgdb/gdbinit.hw xx/Documentation/i386/kgdb/gdbinit.hw --- 2.6.5-rc2/Documentation/i386/kgdb/gdbinit.hw 1970-01-01 01:00:00.000000000 +0100 +++ xx/Documentation/i386/kgdb/gdbinit.hw 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,117 @@ + +#Using ia-32 hardware breakpoints. +# +#4 hardware breakpoints are available in ia-32 processors. These breakpoints +#do not need code modification. They are set using debug registers. +# +#Each hardware breakpoint can be of one of the +#three types: execution, write, access. +#1. An Execution breakpoint is triggered when code at the breakpoint address is +#executed. +#2. A write breakpoint ( aka watchpoints ) is triggered when memory location +#at the breakpoint address is written. +#3. An access breakpoint is triggered when memory location at the breakpoint +#address is either read or written. +# +#As hardware breakpoints are available in limited number, use software +#breakpoints ( br command in gdb ) instead of execution hardware breakpoints. 
+# +#Length of an access or a write breakpoint defines the length of the datatype +#to be watched. Length is 1 for char, 2 for short, 3 for int. +# +#For placing execution, write and access breakpoints, use commands +#hwebrk, hwwbrk, hwabrk +#To remove a breakpoint use hwrmbrk command. +# +#These commands take the following types of arguments. For the arguments +#associated with each command, use help command. +#1. breakpointno: 0 to 3 +#2. length: 1 to 3 +#3. address: Memory location in hex ( without 0x ) e.g c015e9bc +# +#Use the command exinfo to find which hardware breakpoint occurred. + +#hwebrk breakpointno address +define hwebrk + maintenance packet Y$arg0,0,0,$arg1 +end +document hwebrk + hwebrk <breakpointno> <address> + Places a hardware execution breakpoint + <breakpointno> = 0 - 3 + <address> = Hex digits without leading "0x". +end + +#hwwbrk breakpointno length address +define hwwbrk + maintenance packet Y$arg0,1,$arg1,$arg2 +end +document hwwbrk + hwwbrk <breakpointno> <length> <address> + Places a hardware write breakpoint + <breakpointno> = 0 - 3 + <length> = 1 (1 byte), 2 (2 byte), 3 (4 byte) + <address> = Hex digits without leading "0x". +end + +#hwabrk breakpointno length address +define hwabrk + maintenance packet Y$arg0,3,$arg1,$arg2 +end +document hwabrk + hwabrk <breakpointno> <length> <address> + Places a hardware access breakpoint + <breakpointno> = 0 - 3 + <length> = 1 (1 byte), 2 (2 byte), 3 (4 byte) + <address> = Hex digits without leading "0x". +end
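+ +#A hypothetical session: to watch writes to a 4-byte kernel variable at +#the (made-up) address c01a2b3c with hardware breakpoint 1, enter +# +# (gdb) hwwbrk 1 3 c01a2b3c +# +#and remove it later with +# +# (gdb) hwrmbrk 1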
+#hwrmbrk breakpointno +define hwrmbrk + maintenance packet y$arg0 +end +document hwrmbrk + hwrmbrk <breakpointno> + <breakpointno> = 0 - 3 + Removes a hardware breakpoint +end + +define reboot + maintenance packet r +end +#exinfo +define exinfo + maintenance packet qE +end +document exinfo + exinfo + Gives information about a breakpoint. +end +define get_th + p $th=(struct thread_info *)((int)$esp & ~8191) +end +document get_th + get_th + Gets and prints the current thread_info pointer. Defines $th to be it. +end +define get_cu + p $cu=((struct thread_info *)((int)$esp & ~8191))->task +end +document get_cu + get_cu + Gets and prints the "current" value. Defines $cu to be it. +end +define int_off + set var $flags=$eflags + set $eflags=$eflags&~0x200 + end +define int_on + set var $eflags|=$flags&0x200 + end +document int_off + saves the current interrupt state and clears the processor interrupt + flag. Use int_on to restore the saved flag. +end +document int_on + Restores the interrupt flag saved by int_off. +end diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/Documentation/i386/kgdb/gdbinit-modules xx/Documentation/i386/kgdb/gdbinit-modules --- 2.6.5-rc2/Documentation/i386/kgdb/gdbinit-modules 1970-01-01 01:00:00.000000000 +0100 +++ xx/Documentation/i386/kgdb/gdbinit-modules 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,146 @@ +# +# Useful GDB user-commands to debug Linux Kernel Modules with gdbstub. +# +# This doesn't work for Linux-2.0 or older. +# +# Author Edouard G. Parmelan +# +# +# Fri Apr 30 20:33:29 CEST 1999 +# First public release. +# +# Major cleanup after experimenting with a Linux-2.0 kernel without +# success. Symbols of a module are not in the correct order, I can't +# explain why :( +# +# Fri Mar 19 15:41:40 CET 1999 +# Initial version. +# +# Thu Jan 6 16:29:03 CST 2000 +# A little fixing by Dave Grothe +# +# Mon Jun 19 09:33:13 CDT 2000 +# Alignment changes from Edouard Parmelan +# +# The basic idea is to find where insmod loads the module and inform +# GDB to load the symbol table of the module with the GDB command +# ``add-symbol-file <object file> <address>''.
+# +# The Linux kernel holds the list of all loaded modules in module_list, +# this list ends with &kernel_module (exactly with module->next == NULL, +# but the last module is not a real module). +# +# Insmod allocates the struct module before the object file. Since +# Linux-2.1, this structure contains its size. The real address of +# the object file is then (char*)module + module->size_of_struct. +# +# You can use three user functions ``mod-list'', ``mod-print-symbols'' +# and ``mod-add-symbols''. +# +# mod-list lists all loaded modules with the format: +# <module-address> <module-name> +# +# As soon as you have found the address of your module, you can +# print its exported symbols (mod-print-symbols) or inform GDB to add +# symbols from your module file (mod-add-symbols). +# +# The argument that you give to mod-print-symbols or mod-add-symbols +# is the <module-address> from the mod-list command. +# +# When using the mod-add-symbols command you must also give the full +# pathname of the module's object code file. +# +# The command mod-add-lis is an example of how to make this easier. +# You can edit this macro to contain the path name of your own +# favorite module and then use it as a shorthand to load it. You +# still need the module-address, however. +# +# The internal function ``mod-validate'' sets the GDB variable $mod +# as a ``struct module*'' if the kernel knows the module, otherwise +# $mod is set to NULL. This ensures that we do not add symbols for a +# wrong address. +# +# Have a nice hacking day ! +# +# +define mod-list + set $mod = (struct module*)module_list + # the last module is the kernel, ignore it + while $mod != &kernel_module + printf "%p\t%s\n", (long)$mod, ($mod)->name + set $mod = $mod->next + end +end +document mod-list +List all modules in the form: <module-address> <module-name> +Use the <module-address> as the argument for the other +mod-commands: mod-print-symbols, mod-add-symbols. +end + +define mod-validate + set $mod = (struct module*)module_list + while ($mod != $arg0) && ($mod != &kernel_module) + set $mod = $mod->next + end + if $mod == &kernel_module + set $mod = 0 + printf "%p is not a module\n", $arg0 + end +end +document mod-validate +mod-validate <module-address> +Internal user-command used to validate the module parameter. +If <module-address> is a real loaded module, set $mod to it, otherwise +set $mod to 0. +end + + +define mod-print-symbols + mod-validate $arg0 + if $mod != 0 + set $i = 0 + while $i < $mod->nsyms + set $sym = $mod->syms[$i] + printf "%p\t%s\n", $sym->value, $sym->name + set $i = $i + 1 + end + end +end +document mod-print-symbols +mod-print-symbols <module-address> +Print all exported symbols of the module. See mod-list. +end + + +define mod-add-symbols-align + mod-validate $arg0 + if $mod != 0 + set $mod_base = ($mod->size_of_struct + (long)$mod) + if ($arg2 != 0) && (($mod_base & ($arg2 - 1)) != 0) + set $mod_base = ($mod_base | ($arg2 - 1)) + 1 + end + add-symbol-file $arg1 $mod_base + end +end +document mod-add-symbols-align +mod-add-symbols-align <module-address> <object file> <align> +Load the symbols table of the module from the object file where +first section alignment is <align>. +To retrieve alignment, use `objdump -h <object file>'. +end + +define mod-add-symbols + mod-add-symbols-align $arg0 $arg1 sizeof(long) +end +document mod-add-symbols +mod-add-symbols <module-address> <object file> +Load the symbols table of the module from the object file. +Default alignment is 4. See mod-add-symbols-align.
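+For example (the address and path here are hypothetical), if mod-list +showed your module at 0xd0a04000, you would enter: + mod-add-symbols 0xd0a04000 /tmp/mymod.o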
+end + +define mod-add-lis + mod-add-symbols-align $arg0 /usr/src/LiS/streams.o 16 +end +document mod-add-lis +mod-add-lis <module-address> +Does mod-add-symbols <module-address> /usr/src/LiS/streams.o +end diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/Documentation/i386/kgdb/kgdb.txt xx/Documentation/i386/kgdb/kgdb.txt --- 2.6.5-rc2/Documentation/i386/kgdb/kgdb.txt 1970-01-01 01:00:00.000000000 +0100 +++ xx/Documentation/i386/kgdb/kgdb.txt 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,775 @@ +Last edit: <20030806.1637.12> +This file has information specific to the i386 kgdb option. Other +platforms with the kgdb option may behave in a similar fashion. + +New features: +============ +20030806.1557.37 +This version was made against the 2.6.0-test2 kernel. We have made the +following changes: + +- The getthread() code in the stub calls find_task_by_pid(). It fails + if we are early in the bring up such that the pid arrays have yet to + be allocated. We have added a line to kernel/pid.c to make + "kgdb_pid_init_done" true once the arrays are allocated. This way the + getthread() code knows not to call it. This is only used by the thread + debugging stuff and threads will not yet exist at this point in the + boot. + +- For some reason, gdb was not asking for a new thread list when the + "info thread" command was given. We changed to the newer version of + the thread info command and gdb now seems to ask when needed. Result, + we now get all threads in the thread list. + +- We now respond to the ThreadExtraInfo request from gdb with the thread + name from task_struct.comm. This then appears in the thread list. + Thoughts on additional options for this are welcome. Things such as + "has BKL" and "Preempted" come to mind. I think we could have a flag + word that could enable different bits of info here. + +- We now honor, sort of, the C and S commands. These are continue and + single step after delivering a signal. We ignore the signal and do the + requested action. This only happens when we told gdb that a signal + was the reason for entry, which is only done on memory faults. The + result is that you can now continue into the Oops. + +- We changed the -g to -gdwarf-2. This seems to be the same as -ggdb, + but it is more exact on what language to use. + +- We added two dwarf2 include files and a bit of code at the end of + entry.S. This does not yet work, so it is disabled. Still we want to + keep track of the code and "maybe" someone out there can fix it. + +- Randy Dunlap sent some fixups for this file which are now merged. + +- Hugh Dickins sent a fix to a bit of code in traps.c that prevents a + compiler warning if CONFIG_KGDB is off (now who would do that :). + +- Andrew Morton sent a fix for the serial driver which is now merged. + +- Andrew also sent a change to the stub around the cpu management code + which is also merged. + +- Andrew also sent a patch to make "f" as well as "g" work as SysRq + commands to enter kgdb, merged. + +- If CONFIG_KGDB and CONFIG_DEBUG_SPINLOCKS are both set we added a + "who" field to the spinlock data struct. This is filled with + "current" whenever the spinlock succeeds. Useful if you want to know + who has the lock. + +- And last, but not least, we fixed the "get_cu" macro to properly get + the current value of "current". + +New features: +============ +20030505.1827.27 +We are starting to align with the sourceforge version, at least in +commands.
To this end, the boot command string to start kgdb at +boot time has been changed from "kgdb" to "gdb". + +Andrew Morton sent a couple of patches which are now included as follows: +1.) We now return a flag to the interrupt handler. +2.) We no longer use smp_num_cpus (a conflict with the lock meter). +3.) And from William Lee Irwin III code to make + sure high-mem is set up before we attempt to register our interrupt + handler. +We now include asm/kgdb.h from config.h so you will most likely never +have to include it. It also 'NULLS' the kgdb macros you might have in +your code when CONFIG_KGDB is not defined. This allows you to just +turn off CONFIG_KGDB to turn off all the kgdb_ts() calls and such. +This include is conditioned on the machine being an x86 so as to not +mess with other archs. + +20020801.1129.03 +This is currently the version for the 2.4.18 (and beyond?) kernel. + +We have several new "features" beginning with this version: + +1.) Kgdb now syncs the "other" CPUs with a cross-CPU NMI. No more + waiting and it will pull that guy out of an IRQ off spin lock :) + +2.) We doctored up the code that tells where a task is waiting and + included it so that the "info thread" command will show a bit more + than "schedule()". Try it... + +3.) Added the ability to call a function from gdb. All the standard gdb + issues apply, i.e. if you hit a breakpoint in the function, you are + not allowed to call another (gdb limitation, not kgdb). To help + this capability we added a memory allocation function. Gdb does not + return this memory (it is used for strings that you pass to that function + you are calling from gdb) so we fixed up a way to allow you to + manually return the memory (see below). + +4.) Kgdb time stamps (kgdb_ts()) are enhanced to expand what was the + interrupt flag to now also include the preemption count and the + "in_interrupt" info. The flag is now called "with_pif" to indicate + the order, preempt_count, in_interrupt, flag. The preempt_count is + shifted left by 4 bits so you can read the count in hex by dropping + the low order digit. In_interrupt is in bit 1, and the flag is in + bit 0. + +5.) The command: "p kgdb_info" is now expanded and prints something + like: +(gdb) p kgdb_info +$2 = {used_malloc = 0, called_from = 0xc0107506, entry_tsc = 67468627259, + errcode = 0, vector = 3, print_debug_info = 0, hold_on_sstep = 1, + cpus_waiting = {{task = 0xc027a000, pid = 32768, hold = 0, + regs = 0xc027bf84}, {task = 0x0, pid = 0, hold = 0, regs = 0x0}}} + + Things to note here: a.) used_malloc is the amount of memory that + has been malloc'ed to do calls from gdb. You can reclaim this + memory like this: "p kgdb_info.used_malloc=0" Cool, huh? b.) + cpus_waiting is now "sized" by the number of CPUs you enter at + configure time in the kgdb configure section. This is NOT used + anywhere else in the system, but it is "nice" here. c.) The task's + "pid" is now in the structure. This is the pid you will need to use + to decode to the thread id to get gdb to look at that thread. + Remember that the "info thread" command prints a list of threads + wherein it numbers each thread with its reference number followed + by the thread's pid. Note that the per-CPU idle threads actually + have pids of 0 (yes, there is more than one pid 0 in an SMP system). + To avoid confusion, kgdb numbers these threads with numbers beyond + the MAX_PID. That is why you see 32768 and above. + +6.) A subtle change, we now provide the complete register set for tasks + that are active on the other CPUs. 
This allows better trace back on + those tasks. + + And, let's mention what we could not fix. Back-trace from all but the + thread that we trapped will, most likely, have a bogus entry in it. + The problem is that gdb does not recognize the entry code for + functions that use "current" near (at all?) the entry. The compiler + is putting the "current" decode as the first two instructions of the + function where gdb expects to find %ebp changing code. Back trace + also has trouble with interrupt frames. I am talking with Daniel + Jacobowitz about some way to fix this, but don't hold your breath. + +20011220.0050.35 +Major enhancement with this version is the ability to hold one or more +CPUs in an SMP system while allowing the others to continue. Also, by +default only the current CPU is enabled on single-step commands (please +note that gdb issues single-step commands at times other than when you +use the si command). + +Another change is to collect some useful information in +a global structure called "kgdb_info". You should be able to just: + +p kgdb_info + +although I have seen cases where the first time this is done gdb just +prints the first member but prints the whole structure if you then enter +CR (carriage return or enter). This also works: + +p *&kgdb_info + +Here is a sample: +(gdb) p kgdb_info +$4 = {called_from = 0xc010732c, entry_tsc = 32804123790856, errcode = 0, + vector = 3, print_debug_info = 0} + +"Called_from" is the return address from the current entry into kgdb. +Sometimes it is useful to know why you are in kgdb, for example, was +it an NMI or a real breakpoint? The simple way to interrogate this +return address is: + +l *0xc010732c + +which will print the surrounding few lines of source code. + +"Entry_tsc" is the CPU TSC on entry to kgdb (useful to compare to the +kgdb_ts entries). + +"errcode" and "vector" are other entry parameters which may be helpful on +some traps. + +"print_debug_info" is the internal debugging kgdb print enable flag. Yes, +you can modify it. + +In SMP systems kgdb_info also includes the "cpus_waiting" structure and +"hold_on_step": + +(gdb) p kgdb_info +$7 = {called_from = 0xc0112739, entry_tsc = 1034936624074, errcode = 0, + vector = 2, print_debug_info = 0, hold_on_sstep = 1, cpus_waiting = {{ + task = 0x0, hold = 0, regs = 0x0}, {task = 0xc71b8000, hold = 0, + regs = 0xc71b9f70}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, + hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, + hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, + hold = 0, regs = 0x0}}} + +"Cpus_waiting" has an entry for each CPU other than the current one that +has been stopped. Each entry contains the task_struct address for that +CPU, the address of the regs for that task and a hold flag. All these +have the proper typing so that, for example: + +p *kgdb_info.cpus_waiting[1].regs + +will print the registers for CPU 1. + +"Hold_on_sstep" is a new feature with this version and comes up set or +true. What this means is that whenever kgdb is asked to single-step all +other CPUs are held (i.e. not allowed to execute). The flag applies to +all but the current CPU and, again, can be changed: + +p kgdb_info.hold_on_sstep=0 + +restores the old behavior of letting all CPUs run during single-stepping. + +Likewise, each CPU has a "hold" flag, which if set, locks that CPU out +of execution. Note that this has some risk in cases where the CPUs need +to communicate with each other. 
If kgdb finds no CPU available on exit, +it will push a message thru gdb and stay in kgdb. Note that it is legal +to hold the current CPU as long as at least one CPU can execute. + +20010621.1117.09 +This version implements an event queue. Events are signaled by calling +a function in the kgdb stub and may be examined from gdb. See EVENTS +below for details. This version also tightens up the interrupt and SMP +handling to not allow interrupts on the way to kgdb from a breakpoint +trap. It is fine to allow these interrupts for user code, but not +system debugging. + +Version +======= + +This version of the kgdb package was developed and tested on +kernel version 2.4.16. It will not install on any earlier kernels. +It is possible that it will continue to work on later versions +of 2.4 and then versions of 2.5 (I hope). + + +Debugging Setup +=============== + +Designate one machine as the "development" machine. This is the +machine on which you run your compiles and which has your source +code for the kernel. Designate a second machine as the "target" +machine. This is the machine that will run your experimental +kernel. + +The two machines will be connected together via a serial line out +one or the other of the COM ports of the PC. You will need the +appropriate modem eliminator (null modem) cable(s) for this. + +Decide on which tty port you want the machines to communicate, then +connect them up back-to-back using the null modem cable. COM1 is +/dev/ttyS0 and COM2 is /dev/ttyS1. You should test this connection +with the two machines prior to trying to debug a kernel. Once you +have it working, on the TARGET machine, enter: + +setserial /dev/ttyS0 (or what ever tty you are using) + +and record the port address and the IRQ number. + +On the DEVELOPMENT machine you need to apply the patch for the kgdb +hooks. You have probably already done that if you are reading this +file. + +On your DEVELOPMENT machine, go to your kernel source directory and do +"make Xconfig" where X is one of "x", "menu", or "". If you are +configuring in the standard serial driver, it must not be a module. +Either yes or no is ok, but making the serial driver a module means it +will initialize after kgdb has set up the UART interrupt code and may +cause a failure of the control-C option discussed below. The configure +question for the serial driver is under the "Character devices" heading +and is: + +"Standard/generic (8250/16550 and compatible UARTs) serial support" + +Go down to the kernel debugging menu item and open it up. Enable the +kernel kgdb stub code by selecting that item. You can also choose to +turn on the "-ggdb -O1" compile options. The -ggdb causes the compiler +to put more debug info (like local symbols) in the object file. On the +i386 -g and -ggdb are the same so this option just reduces to "O1". The +-O1 reduces the optimization level. This may be helpful in some cases, +be aware, however, that this may also mask the problem you are looking +for. + +The baud rate. Default is 115200. What ever you choose be sure that +the host machine is set to the same speed. I recommend the default. + +The port. This is the I/O address of the serial UART that you should +have gotten using setserial as described above. The standard COM1 port +(3f8) using IRQ 4 is default. COM2 is 2f8 which by convention uses IRQ +3. + +The port IRQ (see above). + +Stack overflow test. This option makes a minor change in the trap, +system call and interrupt code to detect stack overflow and transfer +control to kgdb if it happens. 
(Some platforms have this in the +baseline code, but the i386 does not.) + +You can also configure the system to recognize the boot option +"console=kgdb" which if given will cause all console output during +booting to be put thru gdb as well as other consoles. This option +requires that gdb and kgdb be connected prior to sending console output +so, if they are not, a breakpoint is executed to force the connection. +This will happen before any kernel output (it is going thru gdb, right), +and will stall the boot until the connection is made. + +You can also configure in a patch to SysRq to enable the kGdb SysRq. +This request generates a breakpoint. Since the serial port IRQ line is +set up after any serial drivers, it is possible that this command will +work when the control-C will not. + +Save and exit the Xconfig program. Then do "make clean" , "make dep" +and "make bzImage" (or whatever target you want to make). This gets the +kernel compiled with the "-g" option set -- necessary for debugging. + +You have just built the kernel on your DEVELOPMENT machine that you +intend to run on your TARGET machine. + +To install this new kernel, use the following installation procedure. +Remember, you are on the DEVELOPMENT machine patching the kernel source +for the kernel that you intend to run on the TARGET machine. + +Copy this kernel to your target machine using your usual procedures. I +usually arrange to copy development: +/usr/src/linux/arch/i386/boot/bzImage to /vmlinuz on the TARGET machine +via a LAN based NFS access. That is, I run the cp command on the target +and copy from the development machine via the LAN. Run Lilo (see "man +lilo" for details on how to set this up) on the new kernel on the target +machine so that it will boot! Then boot the kernel on the target +machine. + +On the DEVELOPMENT machine, create a file called .gdbinit in the +directory /usr/src/linux. An example .gdbinit file looks like this: + +shell echo -e "\003" >/dev/ttyS0 +set remotebaud 38400 (or what ever speed you have chosen) +target remote /dev/ttyS0 + + +Change the "echo" and "target" definition so that it specifies the tty +port that you intend to use. Change the "remotebaud" definition to +match the data rate that you are going to use for the com line. + +You are now ready to try it out. + +Boot your target machine with "kgdb" in the boot command i.e. something +like: + +lilo> test kgdb + +or if you also want console output thru gdb: + +lilo> test kgdb console=kgdb + +You should see the lilo message saying it has loaded the kernel and then +all output stops. The kgdb stub is trying to connect with gdb. Start +gdb something like this: + + +On your DEVELOPMENT machine, cd /usr/src/linux and enter "gdb vmlinux". +When gdb gets the symbols loaded it will read your .gdbinit file and, if +everything is working correctly, you should see gdb print out a few +lines indicating that a breakpoint has been taken. It will actually +show a line of code in the target kernel inside the kgdb activation +code. + +The gdb interaction should look something like this: + + linux-dev:/usr/src/linux# gdb vmlinux + GDB is free software and you are welcome to distribute copies of it + under certain conditions; type "show copying" to see the conditions. + There is absolutely no warranty for GDB; type "show warranty" for details. + GDB 4.15.1 (i486-slackware-linux), + Copyright 1995 Free Software Foundation, Inc... 
+ breakpoint () at i386-stub.c:750 + 750 } + (gdb) + +You can now use whatever gdb commands you like to set breakpoints. +Enter "continue" to start your target machine executing again. At this +point the target system will run at full speed until it encounters +your breakpoint or gets a segment violation in the kernel, or whatever. + +If you have the kgdb console enabled when you continue, gdb will print +out all the console messages. + +The above example caused a breakpoint relatively early in the boot +process. For the i386 kgdb it is possible to code a break instruction +as the first C-language point in init/main.c, i.e. as the first instruction +in start_kernel(). This could be done as follows: + +#include <asm/kgdb.h> + breakpoint(); + +This breakpoint() is really a function that sets up the breakpoint and +single-step hardware trap cells and then executes a breakpoint. Any +early hard coded breakpoint will need to use this function. Once the +trap cells are set up they need not be set again, but doing it again +does not hurt anything, so you don't need to be concerned about which +breakpoint is hit first. Once the trap cells are set up (and the kernel +sets them up in due course even if breakpoint() is never called) the +macro: + +BREAKPOINT; + +will generate an inline breakpoint. This may be more useful as it stops +the processor at the instruction instead of in a function a step removed +from the location of interest. In either case <asm/kgdb.h> must be +included to define both breakpoint() and BREAKPOINT. + +Triggering kgdbstub at other times +================================== + +Often you don't need to enter the debugger until much later in the boot +or even after the machine has been running for some time. Once the +kernel is booted and interrupts are on, you can force the system to +enter the debugger by sending a control-C to the debug port. This is +what the first line of the recommended .gdbinit file does. This allows +you to start gdb any time after the system is up as well as when the +system is already at a breakpoint. (In the case where the system is +already at a breakpoint the control-C is not needed, however, it will +be ignored by the target so no harm is done. Also note the echo +command assumes that the port speed is already set. This will be true +once gdb has connected, but it is best to set the port speed before you +run gdb.) + +Another simple way to do this is to put the following file in your ~/bin +directory: + +#!/bin/bash +echo -e "\003" > /dev/ttyS0 + +Here, the ttyS0 should be replaced with whatever port you are using. +The "\003" is control-C. Once you are connected with gdb, you can enter +control-C at the command prompt. + +An alternative way to get control to the debugger is to enable the kGdb +SysRq command. Then you would enter Alt-SysRq-g (all three keys at the +same time, but push them down in the order given). To refresh your +memory of the available SysRq commands try Alt-SysRq-=. Actually any +undefined command could replace the "=", but I like to KNOW that what I +am pushing will never be defined. + +Debugging hints +=============== + +You can break into the target machine at any time from the development +machine by typing ^C (see above paragraph). If the target machine has +interrupts enabled this will stop it in the kernel and enter the +debugger. + +There is unfortunately no way of breaking into the kernel if it is +in a loop with interrupts disabled, so if this happens to you then +you need to place exploratory breakpoints or printk's into the kernel +to find out where it is looping. The exploratory breakpoints can be +entered either thru gdb or hard coded into the source. This is very +handy if you do something like: + +if (<condition>) BREAKPOINT;
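+ +As a concrete sketch (the device structure and threshold here are made +up, and asm/kgdb.h is assumed to provide BREAKPOINT as described above): + +#include <asm/kgdb.h> + + /* exploratory breakpoint: fires only when the suspect state shows up */ + if (dev->retry_count > 5) + BREAKPOINT; + +When it triggers you land in gdb at exactly the interesting spot, with +the full state still in place.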
+ +There is unfortunately no way of breaking into the kernel if it is +in a loop with interrupts disabled, so if this happens to you then +you need to place exploratory breakpoints or printk's into the kernel +to find out where it is looping. The exploratory breakpoints can be +entered either thru gdb or hard coded into the source. This is very +handy if you do something like: + +if (<error condition>) BREAKPOINT; + + +There is a copy of an e-mail in the Documentation/i386/kgdb/ directory +(debug-nmi.txt) which describes how to create an NMI on an ISA bus +machine using a paper clip. I have a sophisticated version of this made +by wiring a push button switch into a PC104/ISA bus adapter card. The +adapter card nicely furnishes wire wrap pins for all the ISA bus +signals. + +When you are done debugging the kernel on the target machine it is a +good idea to leave it in a running state. This makes reboots faster, +bypassing the fsck. So do a gdb "continue" as the last gdb command if +this is possible. To terminate gdb itself on the development machine +and leave the target machine running, first clear all breakpoints and +continue, then type ^Z to suspend gdb and then kill it with "kill %1" or +something similar. + +If gdbstub Does Not Work +======================== + +If it doesn't work, you will have to troubleshoot it. Do the easy +things first like double checking your cabling and data rates. You +might try some non-kernel based programs to see if the back-to-back +connection works properly. Just something simple like cat /etc/hosts +>/dev/ttyS0 on one machine and cat /dev/ttyS0 on the other will tell you +if you can send data from one machine to the other. Make sure it works +in both directions. There is no point in tearing out your hair in the +kernel if the line doesn't work. + +All of the real action takes place in the file +/usr/src/linux/arch/i386/kernel/kgdb_stub.c. That is the code on the target +machine that interacts with gdb on the development machine. In gdb you can +turn on a debug switch with the following command: + + set remotedebug + +This will print out the protocol messages that gdb is exchanging with +the target machine. + +Another place to look is /usr/src/linux/arch/i386/lib/kgdb_serial.c. This is +the code that talks to the serial port on the target side. There might +be a problem there. In particular there is a section of this code that +tests the UART which will tell you what UART you have if you define +"PRNT" (just remove "_off" from the #define PRNT_off). To view this +report you will need to boot the system without any breakpoints. This +allows the kernel to run to the point where it calls kgdb to set up +interrupts. At this time kgdb will test the UART and print out the type +it finds. (You need to wait so that the printks are actually being +printed. Early in the boot they are cached, waiting for the console to +be enabled. Also, if kgdb is entered thru a breakpoint it is possible +to cause a deadlock by calling printk when the console is locked. The +stub thus avoids doing printks from breakpoints, especially in the +serial code.) At this time, if the UART fails to do the expected thing, +kgdb will print out (using printk) information on what failed. (These +messages will be buried in all the other boot up messages. Look for +lines that start with "gdb_hook_interrupt:". You may want to use dmesg +once the system is up to view the log.) If this fails or if you still +don't connect, review your answers for the port address.
Use: + +setserial /dev/ttyS0 + +to get the current port and IRQ information. This command will also +tell you what the system found for the UART type. The stub recognizes +the following UART types: + +16450, 16550, and 16550A + +If you are really desperate you can use printk debugging in the +kgdbstub code in the target kernel until you get it working. In particular, +there is a global variable in /usr/src/linux/arch/i386/kernel/kgdb_stub.c +named "remote_debug". Compile your kernel with this set to 1, rather +than 0, and the debug stub will print out lots of stuff as it does +what it does. Likewise there are debug printks in the kgdb_serial.c +code that can be turned on with simple changes in the macro defines. + + +Debugging Loadable Modules +========================== + +This technique comes courtesy of Edouard Parmelan + + +When you run gdb, enter the command + +source gdbinit-modules + +This will read in a file of gdb macros that was installed in your +kernel source directory when kgdb was installed. This file implements +the following commands: + +mod-list + Lists the loaded modules in the form <module-name> <address> + +mod-print-symbols <address> + Prints all the symbols in the indicated module. + +mod-add-symbols <object-file> <address> + Loads the symbols from the object file and associates them + with the indicated module. + +After you have loaded the module that you want to debug, use the command +mod-list to find the <address> of your module. Then use that +address in the mod-add-symbols command to load your module's symbols. +From that point onward you can debug your module as if it were a part +of the kernel. + +The file gdbinit-modules also contains a command named mod-add-lis as +an example of how to construct a command of your own to load your +favorite module. The idea is to "can" the pathname of the module +in the command so you don't have to type so much. + +Threads +======= + +Each process in a target machine is seen as a gdb thread. gdb thread +related commands (info threads, thread n) can be used. + +ia-32 hardware breakpoints +========================== + +The kgdb stub contains support for hardware breakpoints using the debugging +features of ia-32 (x86) processors. These breakpoints do not need code +modification; they use debugging registers. Four hardware breakpoints are +available in ia-32 processors. + +Each hardware breakpoint can be of one of the following three types. + +1. Execution breakpoint - An execution breakpoint is triggered when code + at the breakpoint address is executed. + + As only a limited number of hardware breakpoints are available, it is + advisable to use software breakpoints (break command) instead + of execution hardware breakpoints, unless modification of code + is to be avoided. + +2. Write breakpoint - A write breakpoint is triggered when the memory + location at the breakpoint address is written. + + A write breakpoint can be placed for data of variable length. The length + of a write breakpoint indicates the length of the datatype to be + watched. The length is 1 for 1-byte data, 2 for 2-byte data, 3 for + 4-byte data. + +3. Access breakpoint - An access breakpoint is triggered when the memory + location at the breakpoint address is either read or written. + + Access breakpoints also have lengths similar to write breakpoints. + +IO breakpoints in ia-32 are not supported. + +Since the gdb stub at present does not use the protocol used by gdb for +hardware breakpoints, hardware breakpoints are accessed through gdb macros. +The gdb macros for hardware breakpoints are described below.
+ +hwebrk - Places an execution breakpoint + hwebrk breakpointno address +hwwbrk - Places a write breakpoint + hwwbrk breakpointno length address +hwabrk - Places an access breakpoint + hwabrk breakpointno length address +hwrmbrk - Removes a breakpoint + hwrmbrk breakpointno +exinfo - Tells whether a software or hardware breakpoint has occurred. + Prints the number of the hardware breakpoint if a hardware breakpoint has + occurred. + +The arguments required by these commands are as follows: +breakpointno - 0 to 3 +length - 1 to 3 +address - Memory location in hex digits (without 0x), e.g. c015e9bc + +SMP support +=========== + +When a breakpoint occurs or the user issues a break (Ctrl-C) to the gdb +client, all the processors are forced to enter the debugger. The current +thread corresponds to the thread running on the processor where the +breakpoint occurred. Threads running on other processor(s) appear +similar to other non-running threads in the 'info threads' output. +Within the kgdb stub there is a structure "waiting_cpus" in which kgdb +records the values of "current" and "regs" for each CPU other than the +one that hit the breakpoint. "current" is a pointer to the task +structure for the task that CPU is running, while "regs" points to the +saved registers for the task. This structure can be examined with the +gdb "p" command. + +The ia-32 hardware debugging registers on all processors are set to the +same values. Hence any hardware breakpoint may occur on any processor. + +gdb troubleshooting +=================== + +1. gdb hangs +Kill it, restart gdb, and connect to the target machine again. + +2. gdb cannot connect to the target machine (after killing a gdb and +restarting another). If the target machine was not inside the debugger when +you killed gdb, gdb cannot connect because the target machine won't +respond. In this case echo a Ctrl-C (ASCII 3) to the serial line, +e.g. echo -e "\003" > /dev/ttyS1 +This forces the target machine into the debugger, after which you +can connect. + +3. gdb cannot connect even after echoing Ctrl-C into the serial line. +Try changing the serial line settings min to 1 and time to 0, +e.g. stty min 1 time 0 < /dev/ttyS1 +Try echoing again. + +Check the serial line speed and set it to the correct value if required, +e.g. stty ispeed 115200 ospeed 115200 < /dev/ttyS1 + +EVENTS +====== + +Ever want to know the order of things happening? Which CPU did what and +when? How did the spinlock get the way it is? Then events are for +you. Events are defined by calls to an event collection interface and +saved for later examination. In this case, kgdb events are saved by a +very fast bit of code in kgdb which is fully SMP and interrupt protected +and they are examined by using gdb to display them. Kgdb keeps only +the last N events, where N must be a power of two and is defined at +configure time. + + +Events are signaled to kgdb by calling: + +kgdb_ts(data0, data1) + +kgdb records each call in an array along with other information. +Here is the array definition: + +struct kgdb_and_then_struct { +#ifdef CONFIG_SMP + int on_cpu; +#endif + long long at_time; + int from_ln; + char * in_src; + void *from; + int with_if; + int data0; + int data1; +}; + +For SMP machines the CPU is recorded; for all machines the TSC is +recorded (gets a time stamp) as well as the line number and source file +the call was made from. The address of the caller (from), the "if" (interrupt +flag) and the two data items are also recorded. The macro kgdb_ts casts +the types to int, so you can put any 32-bit values here.
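+
+For example, to see the order in which CPUs run a driver's interrupt
+handler you might instrument it like this (a hypothetical sketch;
+mydev_intr and the two values logged are only illustrations, and the
+usual 2.6 interrupt-handler boilerplate is assumed):
+
+static irqreturn_t mydev_intr(int irq, void *dev_id, struct pt_regs *regs)
+{
+	kgdb_ts(irq, (int)(long)dev_id); /* CPU, TSC, file and line come free */
+	/* ... real handler work goes here ... */
+	return IRQ_HANDLED;
+}
+
+Each call adds one entry to the event array, so at a later breakpoint
+you can read back which CPU took the interrupt and in what order, down
+to TSC resolution.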
There is a +configure option to select the number of events you want to keep. A +nice number might be 128, but you can keep up to 1024 if you want. The +number must be a power of two. An "andthen" macro library is provided +for gdb to help you look at these events. It is also possible to define +a different structure for the event storage and cast the data to this +structure. For example the following structure is defined in kgdb: + +struct kgdb_and_then_struct2 { +#ifdef CONFIG_SMP + int on_cpu; +#endif + long long at_time; + int from_ln; + char * in_src; + void *from; + int with_if; + struct task_struct *t1; + struct task_struct *t2; +}; + +If you use this for display, the data elements will be displayed as +pointers to task_struct entries. You may want to define your own +structure to use in casting. You should only change the last two items +and you must keep the structure size the same. Kgdb will handle these +as 32-bit ints, but within that constraint you can define a structure to +cast to any 32-bit quantity. This need only be available to gdb and is +only used for casting in the display code. + +Final Items +=========== + +I picked up this code from Amit S. Kale and enhanced it. + +If you make some really cool modification to this stuff, or if you +fix a bug, please let me know. + +George Anzinger + + +Amit S. Kale + + +(First kgdb by David Grothe ) + +(modified by Tigran Aivazian ) + Putting gdbstub into the kernel config menu. + +(modified by Scott Foehner ) + Hooks for entering gdbstub at boot time. + +(modified by Amit S. Kale ) + Threads, ia-32 hw debugging, mp support, console support, + nmi watchdog handling. + +(modified by George Anzinger ) + Extended threads to include the idle threads. + Enhancements to allow breakpoint() at first C code. + Use of module_init() and __setup() to automate the configure. + Enhanced the cpu "collection" code to work in early bring-up. + Added ability to call functions from gdb. + Print info thread stuff without going back to schedule(). + Now collect the "other" cpus with an IPI/NMI. diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/Documentation/i386/kgdb/loadmodule.sh xx/Documentation/i386/kgdb/loadmodule.sh --- 2.6.5-rc2/Documentation/i386/kgdb/loadmodule.sh 1970-01-01 01:00:00.000000000 +0100 +++ xx/Documentation/i386/kgdb/loadmodule.sh 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,78 @@ +#!/bin/sh +# This script loads a module on a target machine and generates a gdb script. +# Source the generated gdb script to load the module file at the appropriate +# addresses in gdb. +# +# Usage: +# Loading the module on the target machine and generating the gdb script: +# [foo]$ loadmodule.sh <modulefile> +# +# Loading the module file into gdb: +# (gdb) source <gdbscript> +# +# Modify the following variables according to your setup. +# TESTMACHINE - Name of the target machine +# GDBSCRIPTS - The directory where a gdb script will be generated +# +# Author: Amit S. Kale (akale@veritas.com). +# +# If you run into problems, please check the files pointed to by the following +# variables. +# ERRFILE - /tmp/<modulefile>.errs contains the stderr output of insmod +# MAPFILE - /tmp/<modulefile>.map contains the stdout output of insmod +# GDBSCRIPT - the $GDBSCRIPTS/load<modulefile> gdb script.
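+#
+# A hypothetical example session, with a module file mydrv.o and the
+# GDBSCRIPTS default of /home/bar used below:
+# [foo]$ loadmodule.sh mydrv.o
+# (gdb) source /home/bar/loadmydrv.o
+#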
+ +TESTMACHINE=foo +GDBSCRIPTS=/home/bar + +if [ $# -lt 1 ] ; then { + echo Usage: $0 modulefile + exit +} ; fi + +MODULEFILE=$1 +MODULEFILEBASENAME=`basename $1` + +if [ $MODULEFILE = $MODULEFILEBASENAME ] ; then { + MODULEFILE=`pwd`/$MODULEFILE +} fi + +ERRFILE=/tmp/$MODULEFILEBASENAME.errs +MAPFILE=/tmp/$MODULEFILEBASENAME.map +GDBSCRIPT=$GDBSCRIPTS/load$MODULEFILEBASENAME + +function findaddr() { + local ADDR=0x$(echo "$SEGMENTS" | \ + grep "$1" | sed 's/^[^ ]*[ ]*[^ ]*[ ]*//' | \ + sed 's/[ ]*[^ ]*$//') + echo $ADDR +} + +function checkerrs() { + if [ "`cat $ERRFILE`" != "" ] ; then { + cat $ERRFILE + exit + } fi +} + +#load the module +echo Copying $MODULEFILE to $TESTMACHINE +rcp $MODULEFILE root@${TESTMACHINE}: + +echo Loading module $MODULEFILE +rsh -l root $TESTMACHINE /sbin/insmod -m ./`basename $MODULEFILE` \ + > $MAPFILE 2> $ERRFILE +checkerrs + +SEGMENTS=`head -n 11 $MAPFILE | tail -n 10` +TEXTADDR=$(findaddr "\\.text[^.]") +LOADSTRING="add-symbol-file $MODULEFILE $TEXTADDR" +SEGADDRS=`echo "$SEGMENTS" | awk '//{ + if ($1 != ".text" && $1 != ".this" && + $1 != ".kstrtab" && $1 != ".kmodtab") { + print " -s " $1 " 0x" $3 " " + } +}'` +LOADSTRING="$LOADSTRING $SEGADDRS" +echo Generating script $GDBSCRIPT +echo $LOADSTRING > $GDBSCRIPT diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/drivers/block/Kconfig xx/drivers/block/Kconfig --- 2.6.5-rc2/drivers/block/Kconfig 2004-03-21 15:09:24.000000000 +0100 +++ xx/drivers/block/Kconfig 2004-03-21 15:21:40.000000000 +0100 @@ -346,6 +346,12 @@ config LBD your machine, or if you want to have a raid or loopback device bigger than 2TB. Otherwise say N. +config CIPHER_TWOFISH + tristate "Twofish encryption for loop device for old S.u.S.E. crypto partitions" + depends on BLK_DEV_LOOP + help + Say Y here if you want to support old S.u.S.E. crypto partitions. + source "drivers/s390/block/Kconfig" endmenu diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/drivers/block/loop_fish2.c xx/drivers/block/loop_fish2.c --- 2.6.5-rc2/drivers/block/loop_fish2.c 1970-01-01 01:00:00.000000000 +0100 +++ xx/drivers/block/loop_fish2.c 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,625 @@ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/highmem.h> +#include <linux/loop.h> +#include <asm/byteorder.h> + +#define ROL(x,c) (((x) << (c)) | ((x) >> (32-(c)))) +#define ROR(x,c) (((x) >> (c)) | ((x) << (32-(c)))) +#define Bswap(x) __le32_to_cpu(x) + +#define DWORD __u32 +#define BYTE unsigned char + +typedef struct fish2_key +{ int keyLen; /* Key length in bits */ + DWORD sboxKeys[4]; + DWORD subKeys[40]; + BYTE key[32]; + DWORD sbox_full[1024]; /* This has to be 1024 DWORDs */ +} fish2_key; + + +/* Mul_5B[i] is 0x5B * i in GF(256), whatever that means... 
*/ + +static unsigned char Mul_5B[256] = { + 0x00,0x5B,0xB6,0xED,0x05,0x5E,0xB3,0xE8, + 0x0A,0x51,0xBC,0xE7,0x0F,0x54,0xB9,0xE2, + 0x14,0x4F,0xA2,0xF9,0x11,0x4A,0xA7,0xFC, + 0x1E,0x45,0xA8,0xF3,0x1B,0x40,0xAD,0xF6, + 0x28,0x73,0x9E,0xC5,0x2D,0x76,0x9B,0xC0, + 0x22,0x79,0x94,0xCF,0x27,0x7C,0x91,0xCA, + 0x3C,0x67,0x8A,0xD1,0x39,0x62,0x8F,0xD4, + 0x36,0x6D,0x80,0xDB,0x33,0x68,0x85,0xDE, + 0x50,0x0B,0xE6,0xBD,0x55,0x0E,0xE3,0xB8, + 0x5A,0x01,0xEC,0xB7,0x5F,0x04,0xE9,0xB2, + 0x44,0x1F,0xF2,0xA9,0x41,0x1A,0xF7,0xAC, + 0x4E,0x15,0xF8,0xA3,0x4B,0x10,0xFD,0xA6, + 0x78,0x23,0xCE,0x95,0x7D,0x26,0xCB,0x90, + 0x72,0x29,0xC4,0x9F,0x77,0x2C,0xC1,0x9A, + 0x6C,0x37,0xDA,0x81,0x69,0x32,0xDF,0x84, + 0x66,0x3D,0xD0,0x8B,0x63,0x38,0xD5,0x8E, + 0xA0,0xFB,0x16,0x4D,0xA5,0xFE,0x13,0x48, + 0xAA,0xF1,0x1C,0x47,0xAF,0xF4,0x19,0x42, + 0xB4,0xEF,0x02,0x59,0xB1,0xEA,0x07,0x5C, + 0xBE,0xE5,0x08,0x53,0xBB,0xE0,0x0D,0x56, + 0x88,0xD3,0x3E,0x65,0x8D,0xD6,0x3B,0x60, + 0x82,0xD9,0x34,0x6F,0x87,0xDC,0x31,0x6A, + 0x9C,0xC7,0x2A,0x71,0x99,0xC2,0x2F,0x74, + 0x96,0xCD,0x20,0x7B,0x93,0xC8,0x25,0x7E, + 0xF0,0xAB,0x46,0x1D,0xF5,0xAE,0x43,0x18, + 0xFA,0xA1,0x4C,0x17,0xFF,0xA4,0x49,0x12, + 0xE4,0xBF,0x52,0x09,0xE1,0xBA,0x57,0x0C, + 0xEE,0xB5,0x58,0x03,0xEB,0xB0,0x5D,0x06, + 0xD8,0x83,0x6E,0x35,0xDD,0x86,0x6B,0x30, + 0xD2,0x89,0x64,0x3F,0xD7,0x8C,0x61,0x3A, + 0xCC,0x97,0x7A,0x21,0xC9,0x92,0x7F,0x24, + 0xC6,0x9D,0x70,0x2B,0xC3,0x98,0x75,0x2E }; + + +/* Mul_EF[i] is 0xEF * i in GF(256), whatever that means... */ + +static unsigned char Mul_EF[256] = { + 0x00,0xEF,0xB7,0x58,0x07,0xE8,0xB0,0x5F, + 0x0E,0xE1,0xB9,0x56,0x09,0xE6,0xBE,0x51, + 0x1C,0xF3,0xAB,0x44,0x1B,0xF4,0xAC,0x43, + 0x12,0xFD,0xA5,0x4A,0x15,0xFA,0xA2,0x4D, + 0x38,0xD7,0x8F,0x60,0x3F,0xD0,0x88,0x67, + 0x36,0xD9,0x81,0x6E,0x31,0xDE,0x86,0x69, + 0x24,0xCB,0x93,0x7C,0x23,0xCC,0x94,0x7B, + 0x2A,0xC5,0x9D,0x72,0x2D,0xC2,0x9A,0x75, + 0x70,0x9F,0xC7,0x28,0x77,0x98,0xC0,0x2F, + 0x7E,0x91,0xC9,0x26,0x79,0x96,0xCE,0x21, + 0x6C,0x83,0xDB,0x34,0x6B,0x84,0xDC,0x33, + 0x62,0x8D,0xD5,0x3A,0x65,0x8A,0xD2,0x3D, + 0x48,0xA7,0xFF,0x10,0x4F,0xA0,0xF8,0x17, + 0x46,0xA9,0xF1,0x1E,0x41,0xAE,0xF6,0x19, + 0x54,0xBB,0xE3,0x0C,0x53,0xBC,0xE4,0x0B, + 0x5A,0xB5,0xED,0x02,0x5D,0xB2,0xEA,0x05, + 0xE0,0x0F,0x57,0xB8,0xE7,0x08,0x50,0xBF, + 0xEE,0x01,0x59,0xB6,0xE9,0x06,0x5E,0xB1, + 0xFC,0x13,0x4B,0xA4,0xFB,0x14,0x4C,0xA3, + 0xF2,0x1D,0x45,0xAA,0xF5,0x1A,0x42,0xAD, + 0xD8,0x37,0x6F,0x80,0xDF,0x30,0x68,0x87, + 0xD6,0x39,0x61,0x8E,0xD1,0x3E,0x66,0x89, + 0xC4,0x2B,0x73,0x9C,0xC3,0x2C,0x74,0x9B, + 0xCA,0x25,0x7D,0x92,0xCD,0x22,0x7A,0x95, + 0x90,0x7F,0x27,0xC8,0x97,0x78,0x20,0xCF, + 0x9E,0x71,0x29,0xC6,0x99,0x76,0x2E,0xC1, + 0x8C,0x63,0x3B,0xD4,0x8B,0x64,0x3C,0xD3, + 0x82,0x6D,0x35,0xDA,0x85,0x6A,0x32,0xDD, + 0xA8,0x47,0x1F,0xF0,0xAF,0x40,0x18,0xF7, + 0xA6,0x49,0x11,0xFE,0xA1,0x4E,0x16,0xF9, + 0xB4,0x5B,0x03,0xEC,0xB3,0x5C,0x04,0xEB, + 0xBA,0x55,0x0D,0xE2,0xBD,0x52,0x0A,0xE5 }; + +static inline DWORD mds_mul(BYTE *y) +{ DWORD z; + + z=Mul_EF[y[0]] ^ y[1] ^ Mul_EF[y[2]] ^ Mul_5B[y[3]]; + z<<=8; + z|=Mul_EF[y[0]] ^ Mul_5B[y[1]] ^ y[2] ^ Mul_EF[y[3]]; + z<<=8; + z|=Mul_5B[y[0]] ^ Mul_EF[y[1]] ^ Mul_EF[y[2]] ^ y[3]; + z<<=8; + z|=y[0] ^ Mul_EF[y[1]] ^ Mul_5B[y[2]] ^ Mul_5B[y[3]]; + + return z; +} + +/* q0 and q1 are the lookup substitutions done in twofish */ + +static unsigned char q0[256] = +{ 0xA9, 0x67, 0xB3, 0xE8, 0x04, 0xFD, 0xA3, 0x76, + 0x9A, 0x92, 0x80, 0x78, 0xE4, 0xDD, 0xD1, 0x38, + 0x0D, 0xC6, 0x35, 0x98, 0x18, 0xF7, 0xEC, 0x6C, + 0x43, 0x75, 0x37, 0x26, 0xFA, 0x13, 0x94, 0x48, + 0xF2, 0xD0, 0x8B, 0x30, 
0x84, 0x54, 0xDF, 0x23, + 0x19, 0x5B, 0x3D, 0x59, 0xF3, 0xAE, 0xA2, 0x82, + 0x63, 0x01, 0x83, 0x2E, 0xD9, 0x51, 0x9B, 0x7C, + 0xA6, 0xEB, 0xA5, 0xBE, 0x16, 0x0C, 0xE3, 0x61, + 0xC0, 0x8C, 0x3A, 0xF5, 0x73, 0x2C, 0x25, 0x0B, + 0xBB, 0x4E, 0x89, 0x6B, 0x53, 0x6A, 0xB4, 0xF1, + 0xE1, 0xE6, 0xBD, 0x45, 0xE2, 0xF4, 0xB6, 0x66, + 0xCC, 0x95, 0x03, 0x56, 0xD4, 0x1C, 0x1E, 0xD7, + 0xFB, 0xC3, 0x8E, 0xB5, 0xE9, 0xCF, 0xBF, 0xBA, + 0xEA, 0x77, 0x39, 0xAF, 0x33, 0xC9, 0x62, 0x71, + 0x81, 0x79, 0x09, 0xAD, 0x24, 0xCD, 0xF9, 0xD8, + 0xE5, 0xC5, 0xB9, 0x4D, 0x44, 0x08, 0x86, 0xE7, + 0xA1, 0x1D, 0xAA, 0xED, 0x06, 0x70, 0xB2, 0xD2, + 0x41, 0x7B, 0xA0, 0x11, 0x31, 0xC2, 0x27, 0x90, + 0x20, 0xF6, 0x60, 0xFF, 0x96, 0x5C, 0xB1, 0xAB, + 0x9E, 0x9C, 0x52, 0x1B, 0x5F, 0x93, 0x0A, 0xEF, + 0x91, 0x85, 0x49, 0xEE, 0x2D, 0x4F, 0x8F, 0x3B, + 0x47, 0x87, 0x6D, 0x46, 0xD6, 0x3E, 0x69, 0x64, + 0x2A, 0xCE, 0xCB, 0x2F, 0xFC, 0x97, 0x05, 0x7A, + 0xAC, 0x7F, 0xD5, 0x1A, 0x4B, 0x0E, 0xA7, 0x5A, + 0x28, 0x14, 0x3F, 0x29, 0x88, 0x3C, 0x4C, 0x02, + 0xB8, 0xDA, 0xB0, 0x17, 0x55, 0x1F, 0x8A, 0x7D, + 0x57, 0xC7, 0x8D, 0x74, 0xB7, 0xC4, 0x9F, 0x72, + 0x7E, 0x15, 0x22, 0x12, 0x58, 0x07, 0x99, 0x34, + 0x6E, 0x50, 0xDE, 0x68, 0x65, 0xBC, 0xDB, 0xF8, + 0xC8, 0xA8, 0x2B, 0x40, 0xDC, 0xFE, 0x32, 0xA4, + 0xCA, 0x10, 0x21, 0xF0, 0xD3, 0x5D, 0x0F, 0x00, + 0x6F, 0x9D, 0x36, 0x42, 0x4A, 0x5E, 0xC1, 0xE0}; + +static unsigned char q1[256] = +{ 0x75, 0xF3, 0xC6, 0xF4, 0xDB, 0x7B, 0xFB, 0xC8, + 0x4A, 0xD3, 0xE6, 0x6B, 0x45, 0x7D, 0xE8, 0x4B, + 0xD6, 0x32, 0xD8, 0xFD, 0x37, 0x71, 0xF1, 0xE1, + 0x30, 0x0F, 0xF8, 0x1B, 0x87, 0xFA, 0x06, 0x3F, + 0x5E, 0xBA, 0xAE, 0x5B, 0x8A, 0x00, 0xBC, 0x9D, + 0x6D, 0xC1, 0xB1, 0x0E, 0x80, 0x5D, 0xD2, 0xD5, + 0xA0, 0x84, 0x07, 0x14, 0xB5, 0x90, 0x2C, 0xA3, + 0xB2, 0x73, 0x4C, 0x54, 0x92, 0x74, 0x36, 0x51, + 0x38, 0xB0, 0xBD, 0x5A, 0xFC, 0x60, 0x62, 0x96, + 0x6C, 0x42, 0xF7, 0x10, 0x7C, 0x28, 0x27, 0x8C, + 0x13, 0x95, 0x9C, 0xC7, 0x24, 0x46, 0x3B, 0x70, + 0xCA, 0xE3, 0x85, 0xCB, 0x11, 0xD0, 0x93, 0xB8, + 0xA6, 0x83, 0x20, 0xFF, 0x9F, 0x77, 0xC3, 0xCC, + 0x03, 0x6F, 0x08, 0xBF, 0x40, 0xE7, 0x2B, 0xE2, + 0x79, 0x0C, 0xAA, 0x82, 0x41, 0x3A, 0xEA, 0xB9, + 0xE4, 0x9A, 0xA4, 0x97, 0x7E, 0xDA, 0x7A, 0x17, + 0x66, 0x94, 0xA1, 0x1D, 0x3D, 0xF0, 0xDE, 0xB3, + 0x0B, 0x72, 0xA7, 0x1C, 0xEF, 0xD1, 0x53, 0x3E, + 0x8F, 0x33, 0x26, 0x5F, 0xEC, 0x76, 0x2A, 0x49, + 0x81, 0x88, 0xEE, 0x21, 0xC4, 0x1A, 0xEB, 0xD9, + 0xC5, 0x39, 0x99, 0xCD, 0xAD, 0x31, 0x8B, 0x01, + 0x18, 0x23, 0xDD, 0x1F, 0x4E, 0x2D, 0xF9, 0x48, + 0x4F, 0xF2, 0x65, 0x8E, 0x78, 0x5C, 0x58, 0x19, + 0x8D, 0xE5, 0x98, 0x57, 0x67, 0x7F, 0x05, 0x64, + 0xAF, 0x63, 0xB6, 0xFE, 0xF5, 0xB7, 0x3C, 0xA5, + 0xCE, 0xE9, 0x68, 0x44, 0xE0, 0x4D, 0x43, 0x69, + 0x29, 0x2E, 0xAC, 0x15, 0x59, 0xA8, 0x0A, 0x9E, + 0x6E, 0x47, 0xDF, 0x34, 0x35, 0x6A, 0xCF, 0xDC, + 0x22, 0xC9, 0xC0, 0x9B, 0x89, 0xD4, 0xED, 0xAB, + 0x12, 0xA2, 0x0D, 0x52, 0xBB, 0x02, 0x2F, 0xA9, + 0xD7, 0x61, 0x1E, 0xB4, 0x50, 0x04, 0xF6, 0xC2, + 0x16, 0x25, 0x86, 0x56, 0x55, 0x09, 0xBE, 0x91 + }; + + +static DWORD f32(DWORD x, const DWORD * k32, int keyLen) +{ + BYTE b[4]; + + /* Run each byte thru 8x8 S-boxes, xoring with key byte at each stage. */ + /* Note that each byte goes through a different combination of S-boxes. 
*/ + + *((DWORD *) b) = Bswap(x); /* make b[0] = LSB, b[3] = MSB */ + + switch (((keyLen + 63) / 64) & 3) + { + case 0: /* 256 bits of key */ + b[0] = q1[b[0]]; + b[1] = q0[b[1]]; + b[2] = q0[b[2]]; + b[3] = q1[b[3]]; + + *((DWORD *) b) ^= k32[3]; + + /* fall thru, having pre-processed b[0]..b[3] with k32[3] */ + case 3: /* 192 bits of key */ + b[0] = q1[b[0]]; + b[1] = q1[b[1]]; + b[2] = q0[b[2]]; + b[3] = q0[b[3]]; + + *((DWORD *) b) ^= k32[2]; + + /* fall thru, having pre-processed b[0]..b[3] with k32[2] */ + case 2: /* 128 bits of key */ + b[0] = q0[b[0]]; + b[1] = q1[b[1]]; + b[2] = q0[b[2]]; + b[3] = q1[b[3]]; + + *((DWORD *) b) ^= k32[1]; + + b[0] = q0[b[0]]; + b[1] = q0[b[1]]; + b[2] = q1[b[2]]; + b[3] = q1[b[3]]; + + *((DWORD *) b) ^= k32[0]; + + b[0] = q1[b[0]]; + b[1] = q0[b[1]]; + b[2] = q1[b[2]]; + b[3] = q0[b[3]]; + } + + + /* Now perform the MDS matrix multiply inline. */ + return mds_mul(b); +} + + +static void init_sbox(fish2_key *key) +{ DWORD x,*sbox,z,*k32; + int i,keyLen; + BYTE b[4]; + + k32=key->sboxKeys; + keyLen=key->keyLen; + sbox=key->sbox_full; + + x=0; + for (i=0;i<256;i++,x+=0x01010101) + { + *((DWORD *) b) = Bswap(x); /* make b[0] = LSB, b[3] = MSB */ + + switch (((keyLen + 63) / 64) & 3) + { + case 0: /* 256 bits of key */ + b[0] = q1[b[0]]; + b[1] = q0[b[1]]; + b[2] = q0[b[2]]; + b[3] = q1[b[3]]; + + *((DWORD *) b) ^= k32[3]; + + /* fall thru, having pre-processed b[0]..b[3] with k32[3] */ + case 3: /* 192 bits of key */ + b[0] = q1[b[0]]; + b[1] = q1[b[1]]; + b[2] = q0[b[2]]; + b[3] = q0[b[3]]; + + *((DWORD *) b) ^= k32[2]; + + /* fall thru, having pre-processed b[0]..b[3] with k32[2] */ + case 2: /* 128 bits of key */ + b[0] = q0[b[0]]; + b[1] = q1[b[1]]; + b[2] = q0[b[2]]; + b[3] = q1[b[3]]; + + *((DWORD *) b) ^= k32[1]; + + b[0] = q0[b[0]]; + b[1] = q0[b[1]]; + b[2] = q1[b[2]]; + b[3] = q1[b[3]]; + + *((DWORD *) b) ^= k32[0]; + + b[0] = q1[b[0]]; + b[1] = q0[b[1]]; + b[2] = q1[b[2]]; + b[3] = q0[b[3]]; + } + + z=Mul_EF[b[0]]; + z<<=8; + z|=Mul_EF[b[0]]; + z<<=8; + z|=Mul_5B[b[0]]; + z<<=8; + z|=b[0]; + + sbox[i]=z; + + z=b[1]; + z<<=8; + z|=Mul_5B[b[1]]; + z<<=8; + z|=Mul_EF[b[1]]; + z<<=8; + z|=Mul_EF[b[1]]; + + sbox[i+256]=z; + + z=Mul_EF[b[2]]; + z<<=8; + z|=b[2]; + z<<=8; + z|=Mul_EF[b[2]]; + z<<=8; + z|=Mul_5B[b[2]]; + + sbox[i+512]=z; + + z=Mul_5B[b[3]]; + z<<=8; + z|=Mul_EF[b[3]]; + z<<=8; + z|=b[3]; + z<<=8; + z|=Mul_5B[b[3]]; + + sbox[i+768]=z; + } +} + + +/* Reed-Solomon code parameters: (12,8) reversible code + g(x) = x**4 + (a + 1/a) x**3 + a x**2 + (a + 1/a) x + 1 + where a = primitive root of field generator 0x14D */ +#define RS_GF_FDBK 0x14D /* field generator */ +#define RS_rem(x) \ + { BYTE b = x >> 24; \ + DWORD g2 = ((b << 1) ^ ((b & 0x80) ? RS_GF_FDBK : 0 )) & 0xFF; \ + DWORD g3 = ((b >> 1) & 0x7F) ^ ((b & 1) ? RS_GF_FDBK >> 1 : 0 ) ^ g2 ; \ + x = (x << 8) ^ (g3 << 24) ^ (g2 << 16) ^ (g3 << 8) ^ b; \ + } + +static DWORD rs_mds(DWORD k0, DWORD k1) +{ + int i, j; + DWORD r; + + for (i = r = 0; i < 2; i++) + { + r ^= (i) ? 
k0 : k1; /* merge in 32 more key bits */ + for (j = 0; j < 4; j++) /* shift one byte at a time */ + RS_rem(r); + } + return r; +} + + +#define INPUT_WHITEN 0 /* subkey array indices */ +#define OUTPUT_WHITEN 4 +#define ROUND_SUBKEYS 8 /* use 2 * (# rounds) */ +#define TOTAL_SUBKEYS 40 + +static void init_key(fish2_key * key) +{ + int i, k64Cnt; + int keyLen = key->keyLen; + int subkeyCnt = TOTAL_SUBKEYS; + DWORD A, B; + DWORD k32e[4], k32o[4]; /* even/odd key dwords */ + + k64Cnt = (keyLen + 63) / 64; /* round up to next multiple of 64 bits */ + for (i = 0; i < k64Cnt; i++) + { /* split into even/odd key dwords */ + k32e[i] = ((DWORD *)key->key)[2 * i]; + k32o[i] = ((DWORD *)key->key)[2 * i + 1]; + /* compute S-box keys using (12,8) Reed-Solomon code over GF(256) */ + /* store in reverse order */ + key->sboxKeys[k64Cnt - 1 - i] = + Bswap(rs_mds(Bswap(k32e[i]), Bswap(k32o[i]))); + + } + + for (i = 0; i < subkeyCnt / 2; i++) /* compute round subkeys for PHT */ + { + A = f32(i * 0x02020202, k32e, keyLen); /* A uses even key dwords */ + B = f32(i * 0x02020202 + 0x01010101, k32o, keyLen); /* B uses odd key + dwords */ + B = ROL(B, 8); + key->subKeys[2 * i] = A + B; /* combine with a PHT */ + key->subKeys[2 * i + 1] = ROL(A + 2 * B, 9); + } + + init_sbox(key); +} + + +static inline DWORD f32_sbox(DWORD x,DWORD *sbox) +{ + /* Run each byte thru 8x8 S-boxes, xoring with key byte at each stage. */ + /* Note that each byte goes through a different combination of S-boxes. */ + + return (sbox[ (x) &0xff]^ + sbox[256 + (((x)>> 8)&0xff)]^ + sbox[512 + (((x)>>16)&0xff)]^ + sbox[768 + (((x)>>24)&0xff)]); +} + +#define roundE_m(x0,x1,x2,x3,rnd) \ + t0 = f32_sbox( x0, key->sbox_full ) ; \ + t1 = f32_sbox( ROL(x1,8), key->sbox_full ); \ + x2 ^= t0 + t1 + key->subKeys[2*rnd+8]; \ + x3 = ROL(x3,1); \ + x3 ^= t0 + 2*t1 + key->subKeys[2*rnd+9]; \ + x2 = ROR(x2,1); + + +static int blockEncrypt_CBC(fish2_key *key,BYTE *src,BYTE *dst,int len) +{ DWORD xx0,xx1,xx2,xx3,t0,t1,iv0,iv1,iv2,iv3; + + if (len & 0xF) return -1; + + iv0=0; + iv1=0; + iv2=0; + iv3=0; + for (;len>=16;len-=16) + { + /* Restart the CBC chain with a zero IV whenever the remaining + length reaches a 512-byte boundary, so that each sector is + encrypted independently and the device stays randomly + accessible. */ + if ( ( len & 0x1FF) == 0) + { iv0=0; + iv1=0; + iv2=0; + iv3=0; + } + + xx0=Bswap(((DWORD *)src)[0]) ^ key->subKeys[0] ^ iv0; + xx1=Bswap(((DWORD *)src)[1]) ^ key->subKeys[1] ^ iv1; + xx2=Bswap(((DWORD *)src)[2]) ^ key->subKeys[2] ^ iv2; + xx3=Bswap(((DWORD *)src)[3]) ^ key->subKeys[3] ^ iv3; + + src+=16; + + roundE_m(xx0,xx1,xx2,xx3,0); + roundE_m(xx2,xx3,xx0,xx1,1); + roundE_m(xx0,xx1,xx2,xx3,2); + roundE_m(xx2,xx3,xx0,xx1,3); + roundE_m(xx0,xx1,xx2,xx3,4); + roundE_m(xx2,xx3,xx0,xx1,5); + roundE_m(xx0,xx1,xx2,xx3,6); + roundE_m(xx2,xx3,xx0,xx1,7); + roundE_m(xx0,xx1,xx2,xx3,8); + roundE_m(xx2,xx3,xx0,xx1,9); + roundE_m(xx0,xx1,xx2,xx3,10); + roundE_m(xx2,xx3,xx0,xx1,11); + roundE_m(xx0,xx1,xx2,xx3,12); + roundE_m(xx2,xx3,xx0,xx1,13); + roundE_m(xx0,xx1,xx2,xx3,14); + roundE_m(xx2,xx3,xx0,xx1,15); + + iv0=xx2 ^ key->subKeys[4]; + iv1=xx3 ^ key->subKeys[5]; + iv2=xx0 ^ key->subKeys[6]; + iv3=xx1 ^ key->subKeys[7]; + + ((DWORD *)dst)[0] = Bswap(iv0); + ((DWORD *)dst)[1] = Bswap(iv1); + ((DWORD *)dst)[2] = Bswap(iv2); + ((DWORD *)dst)[3] = Bswap(iv3); + dst+=16; + } + return len; +} + +#define roundD_m(x0,x1,x2,x3,rnd) \ + t0 = f32_sbox( x0, key->sbox_full); \ + t1 = f32_sbox( ROL(x1,8),key->sbox_full); \ + x2 = ROL(x2,1); \ + x3 ^= t0 + 2*t1 + key->subKeys[rnd*2+9]; \ + x3 = ROR(x3,1); \ + x2 ^= t0 + t1 + key->subKeys[rnd*2+8]; + + +static int blockDecrypt_CBC(fish2_key *key,BYTE *src,BYTE *dst,int len) +{ DWORD 
xx0,xx1,xx2,xx3,t0,t1,lx0,lx1,lx2,lx3,iv0,iv1,iv2,iv3; + + if (len & 0xF) return -1; + + iv0=0; + iv1=0; + iv2=0; + iv3=0; + + for (;len>=16;len-=16) + { + if ( ( len & 0x1FF) == 0) + { iv0=0; + iv1=0; + iv2=0; + iv3=0; + } + + lx0=iv0;iv0=Bswap(((DWORD *)src)[0]);xx0=iv0 ^ key->subKeys[4]; + lx1=iv1;iv1=Bswap(((DWORD *)src)[1]);xx1=iv1 ^ key->subKeys[5]; + lx2=iv2;iv2=Bswap(((DWORD *)src)[2]);xx2=iv2 ^ key->subKeys[6]; + lx3=iv3;iv3=Bswap(((DWORD *)src)[3]);xx3=iv3 ^ key->subKeys[7]; + src+=16; + + roundD_m(xx0,xx1,xx2,xx3,15); + roundD_m(xx2,xx3,xx0,xx1,14); + roundD_m(xx0,xx1,xx2,xx3,13); + roundD_m(xx2,xx3,xx0,xx1,12); + roundD_m(xx0,xx1,xx2,xx3,11); + roundD_m(xx2,xx3,xx0,xx1,10); + roundD_m(xx0,xx1,xx2,xx3,9); + roundD_m(xx2,xx3,xx0,xx1,8); + roundD_m(xx0,xx1,xx2,xx3,7); + roundD_m(xx2,xx3,xx0,xx1,6); + roundD_m(xx0,xx1,xx2,xx3,5); + roundD_m(xx2,xx3,xx0,xx1,4); + roundD_m(xx0,xx1,xx2,xx3,3); + roundD_m(xx2,xx3,xx0,xx1,2); + roundD_m(xx0,xx1,xx2,xx3,1); + roundD_m(xx2,xx3,xx0,xx1,0); + + ((DWORD *)dst)[0] = Bswap(xx2 ^ key->subKeys[0] ^ lx0); + ((DWORD *)dst)[1] = Bswap(xx3 ^ key->subKeys[1] ^ lx1); + ((DWORD *)dst)[2] = Bswap(xx0 ^ key->subKeys[2] ^ lx2); + ((DWORD *)dst)[3] = Bswap(xx1 ^ key->subKeys[3] ^ lx3); + dst+=16; + } + return len; +} + + +int transfer_fish2(struct loop_device *lo, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *loop_page, unsigned loop_off, + int size, sector_t IV) +{ + char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off; + char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off; + + if (cmd == READ) + blockDecrypt_CBC((fish2_key *)lo->key_data,raw_buf,loop_buf,size); + else + blockEncrypt_CBC((fish2_key *)lo->key_data,loop_buf,raw_buf,size); + + kunmap_atomic(raw_buf, KM_USER0); + kunmap_atomic(loop_buf, KM_USER1); + cond_resched(); + + return 0; +} + +int fish2_init(struct loop_device *lo,const struct loop_info64 *info) +{ fish2_key *key; + + if (info->lo_encrypt_key_size<16 || info->lo_encrypt_key_size>32) + return -EINVAL; + + key=(fish2_key *)kmalloc(sizeof(fish2_key),GFP_KERNEL); + + if (key==NULL) + return -ENOMEM; + + lo->key_data=key; + + memset(key->key,0,32); + + key->keyLen=info->lo_encrypt_key_size << 3; + memcpy(key->key,info->lo_encrypt_key,info->lo_encrypt_key_size); + + init_key(key); + + return 0; +} + +static int fish2_release(struct loop_device *lo) +{ if (lo->key_data!=NULL) + { + kfree(lo->key_data); + lo->key_data=NULL; + } + return(0); +} + +static struct loop_func_table fish2_funcs = +{ .number = LO_CRYPT_FISH2, + .transfer = transfer_fish2, + .init = fish2_init, + .release = fish2_release, + .owner = THIS_MODULE +}; + +int __init loop_fish2_init(void) +{ + int err; + + if ((err=loop_register_transfer(&fish2_funcs))) + { + printk(KERN_WARNING "Couldn't register Twofish encryption\n"); + return err; + } + printk(KERN_INFO "loop: registered Twofish encryption \n"); + return 0; +} + +void __exit loop_fish2_exit(void) +{ + if (loop_unregister_transfer(LO_CRYPT_FISH2)) + printk(KERN_WARNING "Couldn't unregister Twofish encryption\n"); + printk(KERN_INFO "loop: unregistered Twofish encryption \n"); +} + +module_init(loop_fish2_init); +module_exit(loop_fish2_exit); +MODULE_LICENSE("GPL"); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/drivers/block/Makefile xx/drivers/block/Makefile --- 2.6.5-rc2/drivers/block/Makefile 2004-03-21 15:09:24.000000000 +0100 +++ xx/drivers/block/Makefile 2004-03-21 15:21:40.000000000 +0100 @@ -42,3 +42,4 @@ 
obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryp obj-$(CONFIG_VIODASD) += viodasd.o obj-$(CONFIG_BLK_DEV_CARMEL) += carmel.o +obj-$(CONFIG_CIPHER_TWOFISH) += loop_fish2.o diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/drivers/char/keyboard.c xx/drivers/char/keyboard.c --- 2.6.5-rc2/drivers/char/keyboard.c 2004-02-20 17:26:37.000000000 +0100 +++ xx/drivers/char/keyboard.c 2004-03-21 15:21:40.000000000 +0100 @@ -1066,6 +1066,9 @@ void kbd_keycode(unsigned int keycode, i } if (sysrq_down && down && !rep) { handle_sysrq(kbd_sysrq_xlate[keycode], regs, tty); +#ifdef CONFIG_KGDB_SYSRQ + sysrq_down = 0; /* in case we miss the "up" event */ +#endif return; } #endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/drivers/char/sysrq.c xx/drivers/char/sysrq.c --- 2.6.5-rc2/drivers/char/sysrq.c 2004-02-04 16:06:46.000000000 +0100 +++ xx/drivers/char/sysrq.c 2004-03-21 15:21:40.000000000 +0100 @@ -35,6 +35,25 @@ #include #include +#ifdef CONFIG_KGDB_SYSRQ + +#define GDB_OP &kgdb_op +static void kgdb_sysrq(int key, struct pt_regs *pt_regs, struct tty_struct *tty) +{ + printk("kgdb sysrq\n"); + breakpoint(); +} + +static struct sysrq_key_op kgdb_op = { + .handler = kgdb_sysrq, + .help_msg = "kGdb|Fgdb", + .action_msg = "Debug breakpoint\n", +}; + +#else +#define GDB_OP NULL +#endif + extern void reset_vc(unsigned int); @@ -238,8 +257,8 @@ static struct sysrq_key_op *sysrq_key_ta /* c */ NULL, /* d */ NULL, /* e */ &sysrq_term_op, -/* f */ NULL, -/* g */ NULL, +/* f */ GDB_OP, +/* g */ GDB_OP, /* h */ NULL, /* i */ &sysrq_kill_op, /* j */ NULL, diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/drivers/media/video/video-buf.c xx/drivers/media/video/video-buf.c --- 2.6.5-rc2/drivers/media/video/video-buf.c 2004-03-21 15:09:24.000000000 +0100 +++ xx/drivers/media/video/video-buf.c 2004-03-21 15:21:42.000000000 +0100 @@ -1176,7 +1176,7 @@ int videobuf_mmap_mapper(struct vm_area_ map->end = vma->vm_end; map->q = q; vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; vma->vm_flags &= ~VM_IO; /* using shared anonymous pages */ vma->vm_private_data = map; dprintk(1,"mmap %p: %08lx-%08lx pgoff %08lx bufs %d-%d\n", diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/drivers/serial/8250.c xx/drivers/serial/8250.c --- 2.6.5-rc2/drivers/serial/8250.c 2004-03-21 15:09:25.000000000 +0100 +++ xx/drivers/serial/8250.c 2004-03-21 15:21:40.000000000 +0100 @@ -834,7 +834,7 @@ receive_chars(struct uart_8250_port *up, if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) { tty->flip.work.func((void *)tty); if (tty->flip.count >= TTY_FLIPBUF_SIZE) - return; // if TTY_DONT_FLIP is set + return; /* if TTY_DONT_FLIP is set */ } ch = serial_inp(up, UART_RX); *tty->flip.char_buf_ptr = ch; @@ -1195,12 +1195,21 @@ static void serial8250_break_ctl(struct spin_unlock_irqrestore(&up->port.lock, flags); } +#ifdef CONFIG_KGDB +static int kgdb_irq = -1; +#endif + static int serial8250_startup(struct uart_port *port) { struct uart_8250_port *up = (struct uart_8250_port *)port; unsigned long flags; int retval; +#ifdef CONFIG_KGDB + if (up->port.irq == kgdb_irq) + return -EBUSY; +#endif + up->capabilities = uart_config[up->port.type].flags; if (up->port.type == PORT_16C950) { @@ -1866,6 +1875,10 @@ static void __init serial8250_register_p for (i = 0; i < UART_NR; i++) { struct uart_8250_port *up = &serial8250_ports[i]; 
+#ifdef CONFIG_KGDB + if (up->port.irq == kgdb_irq) + up->port.kgdb = 1; +#endif up->port.line = i; up->port.ops = &serial8250_pops; init_timer(&up->timer); @@ -2145,6 +2158,31 @@ void serial8250_resume_port(int line) uart_resume_port(&serial8250_reg, &serial8250_ports[line].port); } +#ifdef CONFIG_KGDB +/* + * Find all the ports using the given irq and shut them down. + * Result should be that the irq will be released. + */ +void shutdown_for_kgdb(struct async_struct * info) +{ + int irq = info->state->irq; + struct uart_8250_port *up; + int ttyS; + + kgdb_irq = irq; /* save for later init */ + for (ttyS = 0; ttyS < UART_NR; ttyS++){ + up = &serial8250_ports[ttyS]; + if (up->port.irq == irq && (irq_lists + irq)->head) { +#ifdef CONFIG_DEBUG_SPINLOCK /* ugly business... */ + if(up->port.lock.magic != SPINLOCK_MAGIC) + spin_lock_init(&up->port.lock); +#endif + serial8250_shutdown(&up->port); + } + } +} +#endif /* CONFIG_KGDB */ + static int __init serial8250_init(void) { int ret, i; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/drivers/serial/serial_core.c xx/drivers/serial/serial_core.c --- 2.6.5-rc2/drivers/serial/serial_core.c 2004-03-11 08:27:39.000000000 +0100 +++ xx/drivers/serial/serial_core.c 2004-03-21 15:21:40.000000000 +0100 @@ -1985,6 +1985,11 @@ uart_configure_port(struct uart_driver * { unsigned int flags; +#ifdef CONFIG_KGDB + if (port->kgdb) + return; +#endif + /* * If there isn't a port here, don't do anything further. */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/fs/buffer.c xx/fs/buffer.c --- 2.6.5-rc2/fs/buffer.c 2004-03-21 15:09:25.000000000 +0100 +++ xx/fs/buffer.c 2004-03-21 15:21:42.000000000 +0100 @@ -396,7 +396,7 @@ out: * Hack idea: for the blockdev mapping, i_bufferlist_lock contention * may be quite high. This code could TryLock the page, and if that * succeeds, there is no need to take private_lock. (But if - * private_lock is contended then so is mapping->page_lock). + * private_lock is contended then so is mapping->tree_lock). */ static struct buffer_head * __find_get_block_slow(struct block_device *bdev, sector_t block, int unused) @@ -825,12 +825,6 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean * page on the dirty page list. * - * There is also a small window where the page is dirty, and not on dirty_pages. - * Also a possibility that by the time the page is added to dirty_pages, it has - * been set clean. The page lists are somewhat approximate in this regard. - * It's better to have clean pages accidentally attached to dirty_pages than to - * leave dirty pages attached to clean_pages. - * * We use private_lock to lock against try_to_free_buffers while using the * page's buffer list. Also use this to protect against clean buffers being * added to the page after it was set dirty. @@ -843,7 +837,7 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); */ int __set_page_dirty_buffers(struct page *page) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = page_mapping(page); int ret = 0; if (mapping == NULL) { @@ -867,14 +861,14 @@ int __set_page_dirty_buffers(struct page spin_unlock(&mapping->private_lock); if (!TestSetPageDirty(page)) { - spin_lock(&mapping->page_lock); - if (page->mapping) { /* Race with truncate? */ + spin_lock_irq(&mapping->tree_lock); + if (page_mapping(page)) { /* Race with truncate? 
*/ if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); - list_del(&page->list); - list_add(&page->list, &mapping->dirty_pages); + radix_tree_tag_set(&mapping->page_tree, !PageSwapCache(page) ? page->index : page->private, + PAGECACHE_TAG_DIRTY); } - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } @@ -1226,7 +1220,7 @@ __getblk_slow(struct block_device *bdev, * The relationship between dirty buffers and dirty pages: * * Whenever a page has any dirty buffers, the page's dirty bit is set, and - * the page appears on its address_space.dirty_pages list. + * the page is tagged dirty in its radix tree. * * At all times, the dirtiness of the buffers represents the dirtiness of * subsections of the page. If the page has buffers, the page dirty bit is @@ -1248,13 +1242,13 @@ __getblk_slow(struct block_device *bdev, /** * mark_buffer_dirty - mark a buffer_head as needing writeout * - * mark_buffer_dirty() will set the dirty bit against the buffer, - * then set its backing page dirty, then attach the page to its - * address_space's dirty_pages list and then attach the address_space's - * inode to its superblock's dirty inode list. + * mark_buffer_dirty() will set the dirty bit against the buffer, then set its + * backing page dirty, then tag the page as dirty in its address_space's radix + * tree and then attach the address_space's inode to its superblock's dirty + * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, - * mapping->page_lock and the global inode_lock. + * mapping->tree_lock and the global inode_lock. */ void fastcall mark_buffer_dirty(struct buffer_head *bh) { @@ -1578,8 +1572,7 @@ int try_to_release_page(struct page *pag { struct address_space * const mapping = page->mapping; - if (!PageLocked(page)) - BUG(); + BUG_ON(!PageLocked(page)); if (PageWriteback(page)) return 0; @@ -1826,7 +1819,7 @@ static int __block_write_full_page(struc } while ((bh = bh->b_this_page) != head); BUG_ON(PageWriteback(page)); - SetPageWriteback(page); /* Keeps try_to_free_buffers() away */ + set_page_writeback(page); /* Keeps try_to_free_buffers() away */ unlock_page(page); /* @@ -1889,7 +1882,7 @@ recover: } while ((bh = bh->b_this_page) != head); SetPageError(page); BUG_ON(PageWriteback(page)); - SetPageWriteback(page); + set_page_writeback(page); unlock_page(page); do { struct buffer_head *next = bh->b_this_page; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/fs/cifs/file.c xx/fs/cifs/file.c --- 2.6.5-rc2/fs/cifs/file.c 2004-02-20 17:26:48.000000000 +0100 +++ xx/fs/cifs/file.c 2004-03-21 15:21:42.000000000 +0100 @@ -898,11 +898,9 @@ static void cifs_copy_cache_pages(struct if(list_empty(pages)) break; - spin_lock(&mapping->page_lock); - page = list_entry(pages->prev, struct page, list); + page = list_entry(pages->prev, struct page, lru); - list_del(&page->list); - spin_unlock(&mapping->page_lock); + list_del(&page->lru); if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { page_cache_release(page); @@ -962,14 +960,10 @@ cifs_readpages(struct file *file, struct pagevec_init(&lru_pvec, 0); for(i = 0;i<num_pages;) { - spin_lock(&mapping->page_lock); - if(list_empty(page_list)) { - spin_unlock(&mapping->page_lock); + if(list_empty(page_list)) break; - } - page = list_entry(page_list->prev, struct page, list); + page = list_entry(page_list->prev, struct page, lru); offset = (loff_t)page->index << PAGE_CACHE_SHIFT; - spin_unlock(&mapping->page_lock); /* 
for reads over a certain size could initiate async read ahead */ @@ -989,12 +983,11 @@ cifs_readpages(struct file *file, struct cFYI(1,("Read error in readpages: %d",rc)); /* clean up remaing pages off list */ - spin_lock(&mapping->page_lock); while (!list_empty(page_list) && (i < num_pages)) { - page = list_entry(page_list->prev, struct page, list); - list_del(&page->list); + page = list_entry(page_list->prev, + struct page, lru); + list_del(&page->lru); } - spin_unlock(&mapping->page_lock); break; } else if (bytes_read > 0) { pSMBr = (struct smb_com_read_rsp *)smb_read_data; @@ -1010,8 +1003,9 @@ cifs_readpages(struct file *file, struct cFYI(1,("No bytes read cleaning remaining pages off readahead list")); /* BB turn off caching and do new lookup on file size at server? */ while (!list_empty(page_list) && (i < num_pages)) { - page = list_entry(page_list->prev, struct page, list); - list_del(&page->list); + page = list_entry(page_list->prev, + struct page, lru); + list_del(&page->lru); } break; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/fs/exec.c xx/fs/exec.c --- 2.6.5-rc2/fs/exec.c 2004-03-11 08:27:40.000000000 +0100 +++ xx/fs/exec.c 2004-03-21 15:21:42.000000000 +0100 @@ -45,7 +45,7 @@ #include #include #include -#include +#include #include #include @@ -296,21 +296,21 @@ EXPORT_SYMBOL(copy_strings_kernel); * tsk->mmap_sem is held for writing. */ void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot) + unsigned long address, pgprot_t prot, + struct vm_area_struct *vma) { pgd_t * pgd; pmd_t * pmd; pte_t * pte; - struct pte_chain *pte_chain; if (page_count(page) != 1) printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address); - pgd = pgd_offset(tsk->mm, address); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) + if (unlikely(anon_vma_prepare(vma))) goto out_sig; + + pgd = pgd_offset(tsk->mm, address); spin_lock(&tsk->mm->page_table_lock); pmd = pmd_alloc(tsk->mm, pgd, address); if (!pmd) @@ -325,20 +325,18 @@ void put_dirty_page(struct task_struct * lru_cache_add_active(page); flush_dcache_page(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); - pte_chain = page_add_rmap(page, pte, pte_chain); + page_add_rmap(page, vma, address, 1); pte_unmap(pte); tsk->mm->rss++; spin_unlock(&tsk->mm->page_table_lock); /* no need for flush_tlb */ - pte_chain_free(pte_chain); return; out: spin_unlock(&tsk->mm->page_table_lock); out_sig: __free_page(page); force_sig(SIGKILL, tsk); - pte_chain_free(pte_chain); return; } @@ -428,9 +426,11 @@ int setup_arg_pages(struct linux_binprm mpnt->vm_page_prot = protection_map[VM_STACK_FLAGS & 0x7]; mpnt->vm_flags = VM_STACK_FLAGS; mpnt->vm_ops = NULL; - mpnt->vm_pgoff = 0; + mpnt->vm_pgoff = mpnt->vm_start >> PAGE_SHIFT; mpnt->vm_file = NULL; INIT_LIST_HEAD(&mpnt->shared); + /* insert_vm_struct takes care of anon_vma_node */ + mpnt->anon_vma = NULL; mpnt->vm_private_data = (void *) 0; insert_vm_struct(mm, mpnt); mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; @@ -441,7 +441,7 @@ int setup_arg_pages(struct linux_binprm if (page) { bprm->page[i] = NULL; put_dirty_page(current, page, stack_base, - mpnt->vm_page_prot); + mpnt->vm_page_prot, mpnt); } stack_base += PAGE_SIZE; } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/fs/fs-writeback.c xx/fs/fs-writeback.c --- 2.6.5-rc2/fs/fs-writeback.c 2004-03-11 08:27:41.000000000 +0100 +++ xx/fs/fs-writeback.c 2004-03-21 
15:21:40.000000000 +0100 @@ -123,12 +123,6 @@ static void write_inode(struct inode *in * starvation of particular inodes when others are being redirtied, prevent * livelocks, etc. * - * So what we do is to move all pages which are to be written from dirty_pages - * onto io_pages. And keep on writing io_pages until it's empty. Refusing to - * move more pages onto io_pages until io_pages is empty. Once that point has - * been reached, we are ready to take another pass across the inode's dirty - * pages. - * * Called under inode_lock. */ static void @@ -152,10 +146,6 @@ __sync_single_inode(struct inode *inode, * read speculatively by this cpu before &= ~I_DIRTY -- mikulas */ - spin_lock(&mapping->page_lock); - if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages)) - list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); spin_unlock(&inode_lock); do_writepages(mapping, wbc); @@ -170,10 +160,7 @@ __sync_single_inode(struct inode *inode, spin_lock(&inode_lock); inode->i_state &= ~I_LOCK; if (!(inode->i_state & I_FREEING)) { - if (!list_empty(&mapping->io_pages)) { - /* Needs more writeback */ - inode->i_state |= I_DIRTY_PAGES; - } else if (!list_empty(&mapping->dirty_pages)) { + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* Redirtied */ inode->i_state |= I_DIRTY_PAGES; mapping->dirtied_when = jiffies; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/fs/inode.c xx/fs/inode.c --- 2.6.5-rc2/fs/inode.c 2004-03-21 15:09:25.000000000 +0100 +++ xx/fs/inode.c 2004-03-21 15:21:41.000000000 +0100 @@ -176,15 +176,11 @@ void inode_init_once(struct inode *inode { memset(inode, 0, sizeof(*inode)); INIT_HLIST_NODE(&inode->i_hash); - INIT_LIST_HEAD(&inode->i_data.clean_pages); - INIT_LIST_HEAD(&inode->i_data.dirty_pages); - INIT_LIST_HEAD(&inode->i_data.locked_pages); - INIT_LIST_HEAD(&inode->i_data.io_pages); INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - spin_lock_init(&inode->i_data.page_lock); + spin_lock_init(&inode->i_data.tree_lock); init_MUTEX(&inode->i_data.i_shared_sem); atomic_set(&inode->i_data.truncate_count, 0); INIT_LIST_HEAD(&inode->i_data.private_list); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/fs/mpage.c xx/fs/mpage.c --- 2.6.5-rc2/fs/mpage.c 2003-08-31 02:37:22.000000000 +0200 +++ xx/fs/mpage.c 2004-03-21 15:21:42.000000000 +0100 @@ -329,10 +329,10 @@ mpage_readpages(struct address_space *ma pagevec_init(&lru_pvec, 0); for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_entry(pages->prev, struct page, list); + struct page *page = list_entry(pages->prev, struct page, lru); prefetchw(&page->flags); - list_del(&page->list); + list_del(&page->lru); if (!add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { bio = do_mpage_readpage(bio, page, @@ -546,7 +546,7 @@ alloc_new: } BUG_ON(PageWriteback(page)); - SetPageWriteback(page); + set_page_writeback(page); unlock_page(page); if (boundary || (first_unmapped != blocks_per_page)) { bio = mpage_bio_submit(WRITE, bio); @@ -589,31 +589,13 @@ out: * This is a library function, which implements the writepages() * address_space_operation. 
* - * (The next two paragraphs refer to code which isn't here yet, but they - * explain the presence of address_space.io_pages) - * - * Pages can be moved from clean_pages or locked_pages onto dirty_pages - * at any time - it's not possible to lock against that. So pages which - * have already been added to a BIO may magically reappear on the dirty_pages - * list. And mpage_writepages() will again try to lock those pages. - * But I/O has not yet been started against the page. Thus deadlock. - * - * To avoid this, mpage_writepages() will only write pages from io_pages. The - * caller must place them there. We walk io_pages, locking the pages and - * submitting them for I/O, moving them to locked_pages. - * - * This has the added benefit of preventing a livelock which would otherwise - * occur if pages are being dirtied faster than we can write them out. - * * If a page is already under I/O, generic_writepages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() * and msync() need to guarantee that all the data which was dirty at the time - * the call was made get new I/O started against them. So if called_for_sync() - * is true, we must wait for existing IO to complete. - * - * It's fairly rare for PageWriteback pages to be on ->dirty_pages. It - * means that someone redirtied the page while it was under I/O. + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. */ int mpage_writepages(struct address_space *mapping, @@ -625,6 +607,9 @@ mpage_writepages(struct address_space *m int ret = 0; int done = 0; int (*writepage)(struct page *page, struct writeback_control *wbc); + struct pagevec pvec; + int nr_pages; + pgoff_t index; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; @@ -635,72 +620,58 @@ mpage_writepages(struct address_space *m if (get_block == NULL) writepage = mapping->a_ops->writepage; - spin_lock(&mapping->page_lock); - while (!list_empty(&mapping->io_pages) && !done) { - struct page *page = list_entry(mapping->io_pages.prev, - struct page, list); - list_del(&page->list); - if (PageWriteback(page) && wbc->sync_mode == WB_SYNC_NONE) { - if (PageDirty(page)) { - list_add(&page->list, &mapping->dirty_pages); - continue; - } - list_add(&page->list, &mapping->locked_pages); - continue; - } - if (!PageDirty(page)) { - list_add(&page->list, &mapping->clean_pages); - continue; - } - list_add(&page->list, &mapping->locked_pages); - - page_cache_get(page); - spin_unlock(&mapping->page_lock); - - /* - * At this point we hold neither mapping->page_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file mapping. 
- */ - - lock_page(page); - - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (page->mapping == mapping && !PageWriteback(page) && - test_clear_page_dirty(page)) { - if (writepage) { - ret = (*writepage)(page, wbc); - if (ret) { - if (ret == -ENOSPC) - set_bit(AS_ENOSPC, - &mapping->flags); - else - set_bit(AS_EIO, - &mapping->flags); + pagevec_init(&pvec, 0); + index = 0; + while (!done && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + unsigned i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + + lock_page(page); + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (page->mapping == mapping && !PageWriteback(page) && + clear_page_dirty_for_io(page)) { + if (writepage) { + ret = (*writepage)(page, wbc); + if (ret) { + if (ret == -ENOSPC) + set_bit(AS_ENOSPC, + &mapping->flags); + else + set_bit(AS_EIO, + &mapping->flags); + } + } else { + bio = mpage_writepage(bio, page, + get_block, &last_block_in_bio, + &ret, wbc); + } + if (ret || (--(wbc->nr_to_write) <= 0)) + done = 1; + if (wbc->nonblocking && + bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; } } else { - bio = mpage_writepage(bio, page, get_block, - &last_block_in_bio, &ret, wbc); - } - if (ret || (--(wbc->nr_to_write) <= 0)) - done = 1; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - done = 1; + unlock_page(page); } - } else { - unlock_page(page); } - page_cache_release(page); - spin_lock(&mapping->page_lock); + pagevec_release(&pvec); } - /* - * Leave any remaining dirty pages on ->io_pages - */ - spin_unlock(&mapping->page_lock); if (bio) mpage_bio_submit(WRITE, bio); return ret; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/fs/nfs/write.c xx/fs/nfs/write.c --- 2.6.5-rc2/fs/nfs/write.c 2004-03-21 15:09:25.000000000 +0100 +++ xx/fs/nfs/write.c 2004-03-21 15:21:40.000000000 +0100 @@ -768,7 +768,7 @@ nfs_write_rpcsetup(struct list_head *hea req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_list_add_request(req, &data->pages); - SetPageWriteback(req->wb_page); + set_page_writeback(req->wb_page); *pages++ = req->wb_page; count += req->wb_bytes; } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/fs/ntfs/aops.c xx/fs/ntfs/aops.c --- 2.6.5-rc2/fs/ntfs/aops.c 2003-08-31 02:37:48.000000000 +0200 +++ xx/fs/ntfs/aops.c 2004-03-21 15:21:40.000000000 +0100 @@ -743,7 +743,7 @@ lock_retry_remap: } BUG_ON(PageWriteback(page)); - SetPageWriteback(page); /* Keeps try_to_free_buffers() away. */ + set_page_writeback(page); /* Keeps try_to_free_buffers() away. */ unlock_page(page); /* @@ -885,7 +885,7 @@ static int ntfs_writepage(struct page *p // FIXME: Make sure it is ok to SetPageError() on unlocked page under // writeback before doing the change! 
#if 0 - SetPageWriteback(page); + set_page_writeback(page); unlock_page(page); #endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/fs/reiserfs/inode.c xx/fs/reiserfs/inode.c --- 2.6.5-rc2/fs/reiserfs/inode.c 2004-03-21 15:09:25.000000000 +0100 +++ xx/fs/reiserfs/inode.c 2004-03-21 15:21:40.000000000 +0100 @@ -2134,7 +2134,7 @@ static int reiserfs_write_full_page(stru } while(bh != head) ; BUG_ON(PageWriteback(page)); - SetPageWriteback(page); + set_page_writeback(page); unlock_page(page); /* @@ -2198,7 +2198,7 @@ fail: } while(bh != head); SetPageError(page); BUG_ON(PageWriteback(page)); - SetPageWriteback(page); + set_page_writeback(page); unlock_page(page); do { struct buffer_head *next = bh->b_this_page; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/fs/xfs/linux/xfs_aops.c xx/fs/xfs/linux/xfs_aops.c --- 2.6.5-rc2/fs/xfs/linux/xfs_aops.c 2004-03-11 08:27:42.000000000 +0100 +++ xx/fs/xfs/linux/xfs_aops.c 2004-03-21 15:21:40.000000000 +0100 @@ -566,7 +566,7 @@ xfs_submit_page( int i; BUG_ON(PageWriteback(page)); - SetPageWriteback(page); + set_page_writeback(page); clear_page_dirty(page); unlock_page(page); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/fs/xfs/linux/xfs_vnode.h xx/fs/xfs/linux/xfs_vnode.h --- 2.6.5-rc2/fs/xfs/linux/xfs_vnode.h 2004-02-04 16:07:00.000000000 +0100 +++ xx/fs/xfs/linux/xfs_vnode.h 2004-03-21 15:21:40.000000000 +0100 @@ -600,7 +600,8 @@ static __inline__ void vn_flagclr(struct (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \ (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared)))) #define VN_CACHED(vp) (LINVFS_GET_IP(vp)->i_mapping->nrpages) -#define VN_DIRTY(vp) (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->dirty_pages))) +#define VN_DIRTY(vp) mapping_tagged(LINVFS_GET_IP(vp)->i_mapping, \ + PAGECACHE_TAG_DIRTY) #define VMODIFY(vp) VN_FLAGSET(vp, VMODIFIED) #define VUNMODIFY(vp) VN_FLAGCLR(vp, VMODIFIED) diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-alpha/rmap.h xx/include/asm-alpha/rmap.h --- 2.6.5-rc2/include/asm-alpha/rmap.h 2002-07-19 20:08:35.000000000 +0200 +++ xx/include/asm-alpha/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _ALPHA_RMAP_H -#define _ALPHA_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-arm/rmap.h xx/include/asm-arm/rmap.h --- 2.6.5-rc2/include/asm-arm/rmap.h 2002-07-27 22:32:02.000000000 +0200 +++ xx/include/asm-arm/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,6 +0,0 @@ -#ifndef _ARM_RMAP_H -#define _ARM_RMAP_H - -#include - -#endif /* _ARM_RMAP_H */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-arm26/rmap.h xx/include/asm-arm26/rmap.h --- 2.6.5-rc2/include/asm-arm26/rmap.h 2003-06-08 18:21:42.000000000 +0200 +++ xx/include/asm-arm26/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,66 +0,0 @@ -#ifndef _ARM_RMAP_H -#define _ARM_RMAP_H - -/* - * linux/include/asm-arm26/proc-armv/rmap.h - * - * Architecture dependant parts of the reverse mapping code, - * - * ARM is different since hardware page tables are smaller than - * the page size and Linux uses a "duplicate" one with extra info. - * For rmap this means that the first 2 kB of a page are the hardware - * page tables and the last 2 kB are the software page tables. 
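[Editorial sketch] The conversions above from SetPageWriteback() to set_page_writeback() go hand in hand with the tagged radix tree: the new helper is expected not only to set the page flag but also to move the mapping's radix-tree tags, which is what lets the reworked VN_DIRTY() ask mapping_tagged() instead of testing a dirty list. A rough userspace model of that bookkeeping (all types and helpers are simplified stand-ins, not the kernel's):

#include <stdio.h>

/* Stand-ins for the two radix-tree tags added by this patch. */
#define TAG_DIRTY     0
#define TAG_WRITEBACK 1

struct mapping_model {
	unsigned tags[2];	/* one bit per page, per tag */
};

struct page_model {
	struct mapping_model *mapping;
	unsigned long index;
	int writeback;		/* the PG_writeback page flag */
};

/* Sketch of what distinguishes set_page_writeback() from a bare
 * SetPageWriteback(): besides setting the page flag it updates the
 * radix-tree tags, so writeback state is findable per-mapping. */
static int set_page_writeback(struct page_model *p)
{
	int was = p->writeback;

	p->writeback = 1;
	p->mapping->tags[TAG_WRITEBACK] |= 1u << p->index;
	p->mapping->tags[TAG_DIRTY] &= ~(1u << p->index);
	return was;
}

/* mapping_tagged(), as used by the new VN_DIRTY() above. */
static int mapping_tagged(struct mapping_model *m, int tag)
{
	return m->tags[tag] != 0;
}

int main(void)
{
	struct mapping_model m = { { 1u << 5, 0 } };	/* page 5 dirty */
	struct page_model p = { &m, 5, 0 };

	set_page_writeback(&p);
	printf("dirty pages left: %s\n",
	       mapping_tagged(&m, TAG_DIRTY) ? "yes" : "no");
	printf("writeback pages:  %s\n",
	       mapping_tagged(&m, TAG_WRITEBACK) ? "yes" : "no");
	return 0;
}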
- */ - -static inline void pgtable_add_rmap(struct page *page, struct mm_struct * mm, unsigned long address) -{ - page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); - inc_page_state(nr_page_table_pages); -} - -static inline void pgtable_remove_rmap(struct page *page) -{ - page->mapping = NULL; - page->index = 0; - dec_page_state(nr_page_table_pages); -} - -static inline struct mm_struct * ptep_to_mm(pte_t * ptep) -{ - struct page * page = virt_to_page(ptep); - return (struct mm_struct *)page->mapping; -} - -/* The page table takes half of the page */ -#define PTE_MASK ((PAGE_SIZE / 2) - 1) - -static inline unsigned long ptep_to_address(pte_t * ptep) -{ - struct page * page = virt_to_page(ptep); - unsigned long low_bits; - - low_bits = ((unsigned long)ptep & PTE_MASK) * PTRS_PER_PTE; - return page->index + low_bits; -} - -//FIXME!!! IS these correct? -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - return (pte_addr_t)ptep; -} - -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - return (pte_t *)pte_paddr; -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - return; -} - - -//#include - -#endif /* _ARM_RMAP_H */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-cris/rmap.h xx/include/asm-cris/rmap.h --- 2.6.5-rc2/include/asm-cris/rmap.h 2002-07-19 20:08:35.000000000 +0200 +++ xx/include/asm-cris/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _CRIS_RMAP_H -#define _CRIS_RMAP_H - -/* nothing to see, move along :) */ -#include - -#endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-generic/rmap.h xx/include/asm-generic/rmap.h --- 2.6.5-rc2/include/asm-generic/rmap.h 2003-05-14 01:56:46.000000000 +0200 +++ xx/include/asm-generic/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,90 +0,0 @@ -#ifndef _GENERIC_RMAP_H -#define _GENERIC_RMAP_H -/* - * linux/include/asm-generic/rmap.h - * - * Architecture dependent parts of the reverse mapping code, - * this version should work for most architectures with a - * 'normal' page table layout. - * - * We use the struct page of the page table page to find out - * the process and full address of a page table entry: - * - page->mapping points to the process' mm_struct - * - page->index has the high bits of the address - * - the lower bits of the address are calculated from the - * offset of the page table entry within the page table page - * - * For CONFIG_HIGHPTE, we need to represent the address of a pte in a - * scalar pte_addr_t. The pfn of the pte's page is shifted left by PAGE_SIZE - * bits and is then ORed with the byte offset of the pte within its page. - * - * For CONFIG_HIGHMEM4G, the pte_addr_t is 32 bits. 20 for the pfn, 12 for - * the offset. - * - * For CONFIG_HIGHMEM64G, the pte_addr_t is 64 bits. 52 for the pfn, 12 for - * the offset. - */ -#include - -static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address) -{ -#ifdef BROKEN_PPC_PTE_ALLOC_ONE - /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... 
;( */ - extern int mem_init_done; - - if (!mem_init_done) - return; -#endif - page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); - inc_page_state(nr_page_table_pages); -} - -static inline void pgtable_remove_rmap(struct page * page) -{ - page->mapping = NULL; - page->index = 0; - dec_page_state(nr_page_table_pages); -} - -static inline struct mm_struct * ptep_to_mm(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - return (struct mm_struct *) page->mapping; -} - -static inline unsigned long ptep_to_address(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - unsigned long low_bits; - low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; - return page->index + low_bits; -} - -#ifdef CONFIG_HIGHPTE -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - pte_addr_t paddr; - paddr = ((pte_addr_t)page_to_pfn(kmap_atomic_to_page(ptep))) << PAGE_SHIFT; - return paddr + (pte_addr_t)((unsigned long)ptep & ~PAGE_MASK); -} -#else -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - return (pte_addr_t)ptep; -} -#endif - -#ifndef CONFIG_HIGHPTE -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - return (pte_t *)pte_paddr; -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - return; -} -#endif - -#endif /* _GENERIC_RMAP_H */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-i386/bugs.h xx/include/asm-i386/bugs.h --- 2.6.5-rc2/include/asm-i386/bugs.h 2003-09-01 20:15:27.000000000 +0200 +++ xx/include/asm-i386/bugs.h 2004-03-21 15:21:40.000000000 +0100 @@ -1,11 +1,11 @@ /* * include/asm-i386/bugs.h * - * Copyright (C) 1994 Linus Torvalds + * Copyright (C) 1994 Linus Torvalds * * Cyrix stuff, June 1998 by: * - Rafael R. Reilova (moved everything from head.S), - * + * * - Channing Corn (tests & fixes), * - Andrew D. Balsa (code cleanup). * @@ -25,7 +25,20 @@ #include #include #include - +#ifdef CONFIG_KGDB +/* + * Provide the command line "gdb" initial break + */ +int __init kgdb_initial_break(char * str) +{ + if (*str == '\0'){ + breakpoint(); + return 1; + } + return 0; +} +__setup("gdb",kgdb_initial_break); +#endif static int __init no_halt(char *s) { boot_cpu_data.hlt_works_ok = 0; @@ -140,7 +153,7 @@ static void __init check_popad(void) : "ecx", "edi" ); /* If this fails, it means that any user program may lock the CPU hard. Too bad. */ if (res != 12345678) printk( "Buggy.\n" ); - else printk( "OK.\n" ); + else printk( "OK.\n" ); #endif } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-i386/kgdb.h xx/include/asm-i386/kgdb.h --- 2.6.5-rc2/include/asm-i386/kgdb.h 1970-01-01 01:00:00.000000000 +0100 +++ xx/include/asm-i386/kgdb.h 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,59 @@ +#ifndef __KGDB +#define __KGDB + +/* + * This file should not include ANY others. This makes it usable + * most anywhere without the fear of include order or inclusion. + * Make it so! + * + * This file may be included all the time. It is only active if + * CONFIG_KGDB is defined, otherwise it stubs out all the macros + * and entry points. + */ +#if defined(CONFIG_KGDB) && !defined(__ASSEMBLY__) + +extern void breakpoint(void); +#define INIT_KGDB_INTS kgdb_enable_ints() + +#ifndef BREAKPOINT +#define BREAKPOINT asm(" int $3") +#endif +/* + * GDB debug stub (or any debug stub) can point the 'linux_debug_hook' + * pointer to its routine and it will be entered as the first thing + * when a trap occurs.
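[Editorial sketch] The ptep_to_address() helpers deleted above recover the virtual address a pte maps: page->index carries the high bits, and the pte's byte offset inside the page-table page, scaled by PTRS_PER_PTE, supplies the low bits. The scaling works because PAGE_SIZE == PTRS_PER_PTE * sizeof(pte_t), so byte_offset * PTRS_PER_PTE == pte_index * PAGE_SIZE. A quick standalone check of that identity, assuming i386-style constants (4 KB pages, 4-byte ptes):

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE    4096UL
#define PTE_SIZE     4UL			/* sizeof(pte_t), i386 w/o PAE */
#define PTRS_PER_PTE (PAGE_SIZE / PTE_SIZE)	/* 1024 */

int main(void)
{
	unsigned long off;

	/* For every pte slot in a page-table page, the deleted helper's
	 * "byte_offset * PTRS_PER_PTE" equals "pte_index * PAGE_SIZE". */
	for (off = 0; off < PAGE_SIZE; off += PTE_SIZE) {
		unsigned long low_bits = off * PTRS_PER_PTE;
		unsigned long expected = (off / PTE_SIZE) * PAGE_SIZE;

		assert(low_bits == expected);
	}
	printf("identity holds for all %lu pte slots\n", PTRS_PER_PTE);
	return 0;
}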
+ * + * Return values are, at present, undefined. + * + * The debug hook routine does not necessarily return to its caller. + * It has the register image and thus may choose to resume execution + * anywhere it pleases. + */ +struct pt_regs; + +extern int kgdb_handle_exception(int trapno, + int signo, int err_code, struct pt_regs *regs); +extern int in_kgdb(struct pt_regs *regs); + +#ifdef CONFIG_KGDB_TS +void kgdb_tstamp(int line, char *source, int data0, int data1); +/* + * This is the time stamp function. The macro adds the source info and + * does a cast on the data to allow most any 32-bit value. + */ + +#define kgdb_ts(data0,data1) kgdb_tstamp(__LINE__,__FILE__,(int)data0,(int)data1) +#else +#define kgdb_ts(data0,data1) +#endif +#else /* CONFIG_KGDB && ! __ASSEMBLY__ ,stubs follow... */ +#ifndef BREAKPOINT +#define BREAKPOINT +#endif +#define kgdb_ts(data0,data1) +#define in_kgdb +#define kgdb_handle_exception +#define breakpoint +#define INIT_KGDB_INTS +#endif +#endif /* __KGDB */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-i386/kgdb_local.h xx/include/asm-i386/kgdb_local.h --- 2.6.5-rc2/include/asm-i386/kgdb_local.h 1970-01-01 01:00:00.000000000 +0100 +++ xx/include/asm-i386/kgdb_local.h 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,102 @@ +#ifndef __KGDB_LOCAL +#define ___KGDB_LOCAL +#include +#include +#include +#include +#include +#include +#include +#include + +#define PORT 0x3f8 +#ifdef CONFIG_KGDB_PORT +#undef PORT +#define PORT CONFIG_KGDB_PORT +#endif +#define IRQ 4 +#ifdef CONFIG_KGDB_IRQ +#undef IRQ +#define IRQ CONFIG_KGDB_IRQ +#endif +#define SB_CLOCK 1843200 +#define SB_BASE (SB_CLOCK/16) +#define SB_BAUD9600 SB_BASE/9600 +#define SB_BAUD192 SB_BASE/19200 +#define SB_BAUD384 SB_BASE/38400 +#define SB_BAUD576 SB_BASE/57600 +#define SB_BAUD1152 SB_BASE/115200 +#ifdef CONFIG_KGDB_9600BAUD +#define SB_BAUD SB_BAUD9600 +#endif +#ifdef CONFIG_KGDB_19200BAUD +#define SB_BAUD SB_BAUD192 +#endif +#ifdef CONFIG_KGDB_38400BAUD +#define SB_BAUD SB_BAUD384 +#endif +#ifdef CONFIG_KGDB_57600BAUD +#define SB_BAUD SB_BAUD576 +#endif +#ifdef CONFIG_KGDB_115200BAUD +#define SB_BAUD SB_BAUD1152 +#endif +#ifndef SB_BAUD +#define SB_BAUD SB_BAUD1152 /* Start with this if not given */ +#endif + +#ifndef CONFIG_X86_TSC +#undef rdtsc +#define rdtsc(a,b) if (a++ > 10000){a = 0; b++;} +#undef rdtscll +#define rdtscll(s) s++ +#endif + +#ifdef _raw_read_unlock /* must use a name that is "define"ed, not an inline */ +#undef spin_lock +#undef spin_trylock +#undef spin_unlock +#define spin_lock _raw_spin_lock +#define spin_trylock _raw_spin_trylock +#define spin_unlock _raw_spin_unlock +#else +#endif +#undef spin_unlock_wait +#define spin_unlock_wait(x) do { cpu_relax(); barrier();} \ + while(spin_is_locked(x)) + +#define SB_IER 1 +#define SB_MCR UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS + +#define FLAGS 0 +#define SB_STATE { \ + magic: SSTATE_MAGIC, \ + baud_base: SB_BASE, \ + port: PORT, \ + irq: IRQ, \ + flags: FLAGS, \ + custom_divisor:SB_BAUD} +#define SB_INFO { \ + magic: SERIAL_MAGIC, \ + port: PORT,0,FLAGS, \ + state: &state, \ + tty: (struct tty_struct *)&state, \ + IER: SB_IER, \ + MCR: SB_MCR} +extern void putDebugChar(int); +/* RTAI support needs us to really stop/start interrupts */ + +#define kgdb_sti() __asm__ __volatile__("sti": : :"memory") +#define kgdb_cli() __asm__ __volatile__("cli": : :"memory") +#define kgdb_local_save_flags(x) __asm__ __volatile__(\ + "pushfl ; popl %0":"=g" (x): /* no input */) +#define 
kgdb_local_irq_restore(x) __asm__ __volatile__(\ + "pushl %0 ; popfl": \ + /* no output */ :"g" (x):"memory", "cc") +#define kgdb_local_irq_save(x) kgdb_local_save_flags(x); kgdb_cli() + +#ifdef CONFIG_SERIAL +extern void shutdown_for_kgdb(struct async_struct *info); +#endif +#define INIT_KDEBUG putDebugChar("+"); +#endif /* __KGDB_LOCAL */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-i386/pgtable-3level.h xx/include/asm-i386/pgtable-3level.h --- 2.6.5-rc2/include/asm-i386/pgtable-3level.h 2003-08-16 14:59:52.000000000 +0200 +++ xx/include/asm-i386/pgtable-3level.h 2004-03-21 15:21:41.000000000 +0100 @@ -123,4 +123,6 @@ static inline pmd_t pfn_pmd(unsigned lon #define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) #define PTE_FILE_MAX_BITS 32 +extern struct kmem_cache_s *pae_pgd_cachep; + #endif /* _I386_PGTABLE_3LEVEL_H */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-i386/pgtable.h xx/include/asm-i386/pgtable.h --- 2.6.5-rc2/include/asm-i386/pgtable.h 2004-03-21 15:09:25.000000000 +0100 +++ xx/include/asm-i386/pgtable.h 2004-03-21 15:21:41.000000000 +0100 @@ -21,27 +21,15 @@ #include #endif -#include -#include -#include +extern pgd_t swapper_pg_dir[1024]; +extern void paging_init(void); /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) extern unsigned long empty_zero_page[1024]; -extern pgd_t swapper_pg_dir[1024]; -extern kmem_cache_t *pgd_cache; -extern kmem_cache_t *pmd_cache; -extern spinlock_t pgd_lock; -extern struct list_head pgd_list; - -void pmd_ctor(void *, kmem_cache_t *, unsigned long); -void pgd_ctor(void *, kmem_cache_t *, unsigned long); -void pgd_dtor(void *, kmem_cache_t *, unsigned long); -void pgtable_cache_init(void); -void paging_init(void); +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) #endif /* !__ASSEMBLY__ */ @@ -53,8 +41,20 @@ void paging_init(void); #ifndef __ASSEMBLY__ #ifdef CONFIG_X86_PAE # include + +/* + * Need to initialise the X86 PAE caches + */ +extern void pgtable_cache_init(void); + #else # include + +/* + * No page table caches to initialise + */ +#define pgtable_cache_init() do { } while (0) + #endif #endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-i386/rmap.h xx/include/asm-i386/rmap.h --- 2.6.5-rc2/include/asm-i386/rmap.h 2002-09-10 20:09:43.000000000 +0200 +++ xx/include/asm-i386/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,21 +0,0 @@ -#ifndef _I386_RMAP_H -#define _I386_RMAP_H - -/* nothing to see, move along */ -#include - -#ifdef CONFIG_HIGHPTE -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT); - unsigned long off = ((unsigned long)pte_paddr) & ~PAGE_MASK; - return (pte_t *)((char *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + off); -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - kunmap_atomic(pte, KM_PTE2); -} -#endif - -#endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-ia64/rmap.h xx/include/asm-ia64/rmap.h --- 2.6.5-rc2/include/asm-ia64/rmap.h 2002-08-13 06:26:30.000000000 +0200 +++ xx/include/asm-ia64/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _ASM_IA64_RMAP_H -#define _ASM_IA64_RMAP_H - -/* nothing to see, move along */ -#include - -#endif /* _ASM_IA64_RMAP_H */ diff 
-urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-m68k/rmap.h xx/include/asm-m68k/rmap.h --- 2.6.5-rc2/include/asm-m68k/rmap.h 2002-07-19 20:08:35.000000000 +0200 +++ xx/include/asm-m68k/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _M68K_RMAP_H -#define _M68K_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-m68knommu/rmap.h xx/include/asm-m68knommu/rmap.h --- 2.6.5-rc2/include/asm-m68knommu/rmap.h 2002-11-01 20:44:49.000000000 +0100 +++ xx/include/asm-m68knommu/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,2 +0,0 @@ -/* Do not need anything here */ - diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-mips/rmap.h xx/include/asm-mips/rmap.h --- 2.6.5-rc2/include/asm-mips/rmap.h 2003-07-17 01:54:49.000000000 +0200 +++ xx/include/asm-mips/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef __ASM_RMAP_H -#define __ASM_RMAP_H - -/* nothing to see, move along */ -#include - -#endif /* __ASM_RMAP_H */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-parisc/rmap.h xx/include/asm-parisc/rmap.h --- 2.6.5-rc2/include/asm-parisc/rmap.h 2002-07-19 20:08:35.000000000 +0200 +++ xx/include/asm-parisc/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _PARISC_RMAP_H -#define _PARISC_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-ppc/rmap.h xx/include/asm-ppc/rmap.h --- 2.6.5-rc2/include/asm-ppc/rmap.h 2002-07-19 20:08:35.000000000 +0200 +++ xx/include/asm-ppc/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,9 +0,0 @@ -#ifndef _PPC_RMAP_H -#define _PPC_RMAP_H - -/* PPC calls pte_alloc() before mem_map[] is setup ... */ -#define BROKEN_PPC_PTE_ALLOC_ONE - -#include - -#endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-ppc64/rmap.h xx/include/asm-ppc64/rmap.h --- 2.6.5-rc2/include/asm-ppc64/rmap.h 2002-07-24 05:28:36.000000000 +0200 +++ xx/include/asm-ppc64/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,9 +0,0 @@ -#ifndef _PPC64_RMAP_H -#define _PPC64_RMAP_H - -/* PPC64 calls pte_alloc() before mem_map[] is setup ... 
*/ -#define BROKEN_PPC_PTE_ALLOC_ONE - -#include - -#endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-s390/rmap.h xx/include/asm-s390/rmap.h --- 2.6.5-rc2/include/asm-s390/rmap.h 2002-07-19 20:08:35.000000000 +0200 +++ xx/include/asm-s390/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _S390_RMAP_H -#define _S390_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-sh/rmap.h xx/include/asm-sh/rmap.h --- 2.6.5-rc2/include/asm-sh/rmap.h 2002-07-19 20:08:35.000000000 +0200 +++ xx/include/asm-sh/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _SH_RMAP_H -#define _SH_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-sparc/rmap.h xx/include/asm-sparc/rmap.h --- 2.6.5-rc2/include/asm-sparc/rmap.h 2002-07-19 20:08:35.000000000 +0200 +++ xx/include/asm-sparc/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _SPARC_RMAP_H -#define _SPARC_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-sparc64/rmap.h xx/include/asm-sparc64/rmap.h --- 2.6.5-rc2/include/asm-sparc64/rmap.h 2002-07-19 20:08:35.000000000 +0200 +++ xx/include/asm-sparc64/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _SPARC64_RMAP_H -#define _SPARC64_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-um/rmap.h xx/include/asm-um/rmap.h --- 2.6.5-rc2/include/asm-um/rmap.h 2002-09-12 19:42:39.000000000 +0200 +++ xx/include/asm-um/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,6 +0,0 @@ -#ifndef __UM_RMAP_H -#define __UM_RMAP_H - -#include "asm/arch/rmap.h" - -#endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-v850/rmap.h xx/include/asm-v850/rmap.h --- 2.6.5-rc2/include/asm-v850/rmap.h 2002-11-01 20:43:49.000000000 +0100 +++ xx/include/asm-v850/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1 +0,0 @@ -/* Do not need anything here */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-x86_64/kgdb.h xx/include/asm-x86_64/kgdb.h --- 2.6.5-rc2/include/asm-x86_64/kgdb.h 1970-01-01 01:00:00.000000000 +0100 +++ xx/include/asm-x86_64/kgdb.h 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,71 @@ +#ifndef __KGDB +#define __KGDB + +/* + * This file should not include ANY others. This makes it usable + * most anywhere without the fear of include order or inclusion. + * Make it so! + * + * This file may be included all the time. It is only active if + * CONFIG_KGDB is defined, otherwise it stubs out all the macros + * and entry points. 
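[Editorial sketch] The header comment above states the kgdb convention: the file is safe to include unconditionally, and with CONFIG_KGDB off every entry point collapses into an empty stub so call sites need no #ifdefs. One wrinkle worth noting is that this x86-64 copy stubs in_kgdb as (0), which still works inside an expression, whereas the bare i386 stub does not. A self-contained illustration of the pattern (all names are stand-ins):

#include <stdio.h>

/* Flip this on to exercise the other half, as CONFIG_KGDB would. */
/* #define CONFIG_KGDB */

#ifdef CONFIG_KGDB
static void kgdb_ts(int d0, int d1) { printf("ts %d %d\n", d0, d1); }
static int in_kgdb(void) { return 0; /* real stub queries trap state */ }
#define IN_KGDB() in_kgdb()
#else
/* Stubs: calls vanish at compile time, callers stay #ifdef-free. */
#define kgdb_ts(d0, d1)
#define IN_KGDB() (0)	/* x86-64 style: still usable in expressions */
#endif

int main(void)
{
	kgdb_ts(1, 2);		/* compiles either way */
	if (IN_KGDB())
		printf("inside the debugger\n");
	else
		printf("running normally\n");
	return 0;
}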
+ */ +#if defined(CONFIG_KGDB) && !defined(__ASSEMBLY__) + +extern void breakpoint(void); +#define INIT_KGDB_INTS kgdb_enable_ints() + +#ifndef BREAKPOINT +#define BREAKPOINT asm(" int $3") +#endif + +extern void kgdb_schedule_breakpoint(void); +extern void kgdb_process_breakpoint(void); + +extern int kgdb_tty_hook(void); +extern int kgdb_eth_hook(void); +extern int kgdboe; + +/* + * GDB debug stub (or any debug stub) can point the 'linux_debug_hook' + * pointer to its routine and it will be entered as the first thing + * when a trap occurs. + * + * Return values are, at present, undefined. + * + * The debug hook routine does not necessarily return to its caller. + * It has the register image and thus may choose to resume execution + * anywhere it pleases. + */ +struct pt_regs; + +extern int kgdb_handle_exception(int trapno, + int signo, int err_code, struct pt_regs *regs); +extern int in_kgdb(struct pt_regs *regs); + +extern void set_debug_traps(void); + +#ifdef CONFIG_KGDB_TS +void kgdb_tstamp(int line, char *source, int data0, int data1); +/* + * This is the time stamp function. The macro adds the source info and + * does a cast on the data to allow most any 32-bit value. + */ + +#define kgdb_ts(data0,data1) kgdb_tstamp(__LINE__,__FILE__,(int)data0,(int)data1) +#else +#define kgdb_ts(data0,data1) +#endif +#else /* CONFIG_KGDB && ! __ASSEMBLY__ ,stubs follow... */ +#ifndef BREAKPOINT +#define BREAKPOINT +#endif +#define kgdb_ts(data0,data1) +#define in_kgdb (0) +#define kgdb_handle_exception +#define breakpoint +#define INIT_KGDB_INTS +#define kgdb_process_breakpoint() do {} while(0) + +#endif +#endif /* __KGDB */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-x86_64/kgdb_local.h xx/include/asm-x86_64/kgdb_local.h --- 2.6.5-rc2/include/asm-x86_64/kgdb_local.h 1970-01-01 01:00:00.000000000 +0100 +++ xx/include/asm-x86_64/kgdb_local.h 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,102 @@ +#ifndef __KGDB_LOCAL +#define ___KGDB_LOCAL +#include +#include +#include +#include +#include +#include +#include +#include + +#define PORT 0x3f8 +#ifdef CONFIG_KGDB_PORT +#undef PORT +#define PORT CONFIG_KGDB_PORT +#endif +#define IRQ 4 +#ifdef CONFIG_KGDB_IRQ +#undef IRQ +#define IRQ CONFIG_KGDB_IRQ +#endif +#define SB_CLOCK 1843200 +#define SB_BASE (SB_CLOCK/16) +#define SB_BAUD9600 SB_BASE/9600 +#define SB_BAUD192 SB_BASE/19200 +#define SB_BAUD384 SB_BASE/38400 +#define SB_BAUD576 SB_BASE/57600 +#define SB_BAUD1152 SB_BASE/115200 +#ifdef CONFIG_KGDB_9600BAUD +#define SB_BAUD SB_BAUD9600 +#endif +#ifdef CONFIG_KGDB_19200BAUD +#define SB_BAUD SB_BAUD192 +#endif +#ifdef CONFIG_KGDB_38400BAUD +#define SB_BAUD SB_BAUD384 +#endif +#ifdef CONFIG_KGDB_57600BAUD +#define SB_BAUD SB_BAUD576 +#endif +#ifdef CONFIG_KGDB_115200BAUD +#define SB_BAUD SB_BAUD1152 +#endif +#ifndef SB_BAUD +#define SB_BAUD SB_BAUD1152 /* Start with this if not given */ +#endif + +#ifndef CONFIG_X86_TSC +#undef rdtsc +#define rdtsc(a,b) if (a++ > 10000){a = 0; b++;} +#undef rdtscll +#define rdtscll(s) s++ +#endif + +#ifdef _raw_read_unlock /* must use a name that is "define"ed, not an inline */ +#undef spin_lock +#undef spin_trylock +#undef spin_unlock +#define spin_lock _raw_spin_lock +#define spin_trylock _raw_spin_trylock +#define spin_unlock _raw_spin_unlock +#else +#endif +#undef spin_unlock_wait +#define spin_unlock_wait(x) do { cpu_relax(); barrier();} \ + while(spin_is_locked(x)) + +#define SB_IER 1 +#define SB_MCR UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS + +#define 
FLAGS 0 +#define SB_STATE { \ + magic: SSTATE_MAGIC, \ + baud_base: SB_BASE, \ + port: PORT, \ + irq: IRQ, \ + flags: FLAGS, \ + custom_divisor:SB_BAUD} +#define SB_INFO { \ + magic: SERIAL_MAGIC, \ + port: PORT,0,FLAGS, \ + state: &state, \ + tty: (struct tty_struct *)&state, \ + IER: SB_IER, \ + MCR: SB_MCR} +extern void putDebugChar(int); +/* RTAI support needs us to really stop/start interrupts */ + +#define kgdb_sti() __asm__ __volatile__("sti": : :"memory") +#define kgdb_cli() __asm__ __volatile__("cli": : :"memory") +#define kgdb_local_save_flags(x) __asm__ __volatile__(\ + "pushfl ; popl %0":"=g" (x): /* no input */) +#define kgdb_local_irq_restore(x) __asm__ __volatile__(\ + "pushl %0 ; popfl": \ + /* no output */ :"g" (x):"memory", "cc") +#define kgdb_local_irq_save(x) kgdb_local_save_flags(x); kgdb_cli() + +#ifdef CONFIG_SERIAL +extern void shutdown_for_kgdb(struct async_struct *info); +#endif +#define INIT_KDEBUG putDebugChar("+"); +#endif /* __KGDB_LOCAL */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/asm-x86_64/rmap.h xx/include/asm-x86_64/rmap.h --- 2.6.5-rc2/include/asm-x86_64/rmap.h 2002-10-13 02:07:58.000000000 +0200 +++ xx/include/asm-x86_64/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _X8664_RMAP_H -#define _X8664_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/linux/config.h xx/include/linux/config.h --- 2.6.5-rc2/include/linux/config.h 2002-02-05 18:40:40.000000000 +0100 +++ xx/include/linux/config.h 2004-03-21 15:21:40.000000000 +0100 @@ -2,5 +2,8 @@ #define _LINUX_CONFIG_H #include +#ifdef CONFIG_X86 +#include +#endif #endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/linux/dwarf2.h xx/include/linux/dwarf2.h --- 2.6.5-rc2/include/linux/dwarf2.h 1970-01-01 01:00:00.000000000 +0100 +++ xx/include/linux/dwarf2.h 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,738 @@ +/* Declarations and definitions of codes relating to the DWARF2 symbolic + debugging information format. + Copyright (C) 1992, 1993, 1995, 1996, 1997, 1999, 2000, 2001, 2002 + Free Software Foundation, Inc. + + Written by Gary Funck (gary@intrepid.com) The Ada Joint Program + Office (AJPO), Florida State Unviversity and Silicon Graphics Inc. + provided support for this effort -- June 21, 1995. + + Derived from the DWARF 1 implementation written by Ron Guilmette + (rfg@netcom.com), November 1990. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2, or (at your option) any later + version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING. If not, write to the Free + Software Foundation, 59 Temple Place - Suite 330, Boston, MA + 02111-1307, USA. */ + +/* This file is derived from the DWARF specification (a public document) + Revision 2.0.0 (July 27, 1993) developed by the UNIX International + Programming Languages Special Interest Group (UI/PLSIG) and distributed + by UNIX International. 
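[Editorial sketch] The SB_* macros in the kgdb_local.h copies above encode the standard 16550 divisor arithmetic: with the usual 1.8432 MHz UART input clock, divisor = SB_CLOCK / 16 / baud, so 115200 baud programs a divisor of 1. A quick standalone check of the table those macros produce:

#include <stdio.h>

#define SB_CLOCK 1843200	/* standard PC UART input clock, Hz */
#define SB_BASE  (SB_CLOCK / 16)

int main(void)
{
	static const int bauds[] = { 9600, 19200, 38400, 57600, 115200 };
	unsigned i;

	/* Mirrors SB_BAUD9600..SB_BAUD1152: the value programmed into
	 * the 16550 divisor latch for each supported console speed. */
	for (i = 0; i < sizeof(bauds) / sizeof(bauds[0]); i++)
		printf("%6d baud -> divisor %d\n",
		       bauds[i], SB_BASE / bauds[i]);
	return 0;
}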
Copies of this specification are available from + UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054. + + This file also now contains definitions from the DWARF 3 specification. */ + +/* This file is shared between GCC and GDB, and should not contain + prototypes. */ + +#ifndef _ELF_DWARF2_H +#define _ELF_DWARF2_H + +/* Structure found in the .debug_line section. */ +#ifndef __ASSEMBLY__ +typedef struct +{ + unsigned char li_length [4]; + unsigned char li_version [2]; + unsigned char li_prologue_length [4]; + unsigned char li_min_insn_length [1]; + unsigned char li_default_is_stmt [1]; + unsigned char li_line_base [1]; + unsigned char li_line_range [1]; + unsigned char li_opcode_base [1]; +} +DWARF2_External_LineInfo; + +typedef struct +{ + unsigned long li_length; + unsigned short li_version; + unsigned int li_prologue_length; + unsigned char li_min_insn_length; + unsigned char li_default_is_stmt; + int li_line_base; + unsigned char li_line_range; + unsigned char li_opcode_base; +} +DWARF2_Internal_LineInfo; + +/* Structure found in .debug_pubnames section. */ +typedef struct +{ + unsigned char pn_length [4]; + unsigned char pn_version [2]; + unsigned char pn_offset [4]; + unsigned char pn_size [4]; +} +DWARF2_External_PubNames; + +typedef struct +{ + unsigned long pn_length; + unsigned short pn_version; + unsigned long pn_offset; + unsigned long pn_size; +} +DWARF2_Internal_PubNames; + +/* Structure found in .debug_info section. */ +typedef struct +{ + unsigned char cu_length [4]; + unsigned char cu_version [2]; + unsigned char cu_abbrev_offset [4]; + unsigned char cu_pointer_size [1]; +} +DWARF2_External_CompUnit; + +typedef struct +{ + unsigned long cu_length; + unsigned short cu_version; + unsigned long cu_abbrev_offset; + unsigned char cu_pointer_size; +} +DWARF2_Internal_CompUnit; + +typedef struct +{ + unsigned char ar_length [4]; + unsigned char ar_version [2]; + unsigned char ar_info_offset [4]; + unsigned char ar_pointer_size [1]; + unsigned char ar_segment_size [1]; +} +DWARF2_External_ARange; + +typedef struct +{ + unsigned long ar_length; + unsigned short ar_version; + unsigned long ar_info_offset; + unsigned char ar_pointer_size; + unsigned char ar_segment_size; +} +DWARF2_Internal_ARange; + +#define ENUM(name) enum name { +#define IF_NOT_ASM(a) a +#define COMMA , +#else +#define ENUM(name) +#define IF_NOT_ASM(a) +#define COMMA + +#endif + +/* Tag names and codes. 
*/ +ENUM(dwarf_tag) + + DW_TAG_padding = 0x00 COMMA + DW_TAG_array_type = 0x01 COMMA + DW_TAG_class_type = 0x02 COMMA + DW_TAG_entry_point = 0x03 COMMA + DW_TAG_enumeration_type = 0x04 COMMA + DW_TAG_formal_parameter = 0x05 COMMA + DW_TAG_imported_declaration = 0x08 COMMA + DW_TAG_label = 0x0a COMMA + DW_TAG_lexical_block = 0x0b COMMA + DW_TAG_member = 0x0d COMMA + DW_TAG_pointer_type = 0x0f COMMA + DW_TAG_reference_type = 0x10 COMMA + DW_TAG_compile_unit = 0x11 COMMA + DW_TAG_string_type = 0x12 COMMA + DW_TAG_structure_type = 0x13 COMMA + DW_TAG_subroutine_type = 0x15 COMMA + DW_TAG_typedef = 0x16 COMMA + DW_TAG_union_type = 0x17 COMMA + DW_TAG_unspecified_parameters = 0x18 COMMA + DW_TAG_variant = 0x19 COMMA + DW_TAG_common_block = 0x1a COMMA + DW_TAG_common_inclusion = 0x1b COMMA + DW_TAG_inheritance = 0x1c COMMA + DW_TAG_inlined_subroutine = 0x1d COMMA + DW_TAG_module = 0x1e COMMA + DW_TAG_ptr_to_member_type = 0x1f COMMA + DW_TAG_set_type = 0x20 COMMA + DW_TAG_subrange_type = 0x21 COMMA + DW_TAG_with_stmt = 0x22 COMMA + DW_TAG_access_declaration = 0x23 COMMA + DW_TAG_base_type = 0x24 COMMA + DW_TAG_catch_block = 0x25 COMMA + DW_TAG_const_type = 0x26 COMMA + DW_TAG_constant = 0x27 COMMA + DW_TAG_enumerator = 0x28 COMMA + DW_TAG_file_type = 0x29 COMMA + DW_TAG_friend = 0x2a COMMA + DW_TAG_namelist = 0x2b COMMA + DW_TAG_namelist_item = 0x2c COMMA + DW_TAG_packed_type = 0x2d COMMA + DW_TAG_subprogram = 0x2e COMMA + DW_TAG_template_type_param = 0x2f COMMA + DW_TAG_template_value_param = 0x30 COMMA + DW_TAG_thrown_type = 0x31 COMMA + DW_TAG_try_block = 0x32 COMMA + DW_TAG_variant_part = 0x33 COMMA + DW_TAG_variable = 0x34 COMMA + DW_TAG_volatile_type = 0x35 COMMA + /* DWARF 3. */ + DW_TAG_dwarf_procedure = 0x36 COMMA + DW_TAG_restrict_type = 0x37 COMMA + DW_TAG_interface_type = 0x38 COMMA + DW_TAG_namespace = 0x39 COMMA + DW_TAG_imported_module = 0x3a COMMA + DW_TAG_unspecified_type = 0x3b COMMA + DW_TAG_partial_unit = 0x3c COMMA + DW_TAG_imported_unit = 0x3d COMMA + /* SGI/MIPS Extensions. */ + DW_TAG_MIPS_loop = 0x4081 COMMA + /* GNU extensions. */ + DW_TAG_format_label = 0x4101 COMMA /* For FORTRAN 77 and Fortran 90. */ + DW_TAG_function_template = 0x4102 COMMA /* For C++. */ + DW_TAG_class_template = 0x4103 COMMA /* For C++. */ + DW_TAG_GNU_BINCL = 0x4104 COMMA + DW_TAG_GNU_EINCL = 0x4105 COMMA + /* Extensions for UPC. See: http://upc.gwu.edu/~upc. */ + DW_TAG_upc_shared_type = 0x8765 COMMA + DW_TAG_upc_strict_type = 0x8766 COMMA + DW_TAG_upc_relaxed_type = 0x8767 +IF_NOT_ASM(};) + +#define DW_TAG_lo_user 0x4080 +#define DW_TAG_hi_user 0xffff + +/* Flag that tells whether entry has a child or not. */ +#define DW_children_no 0 +#define DW_children_yes 1 + +/* Form names and codes. */ +ENUM(dwarf_form) + + DW_FORM_addr = 0x01 COMMA + DW_FORM_block2 = 0x03 COMMA + DW_FORM_block4 = 0x04 COMMA + DW_FORM_data2 = 0x05 COMMA + DW_FORM_data4 = 0x06 COMMA + DW_FORM_data8 = 0x07 COMMA + DW_FORM_string = 0x08 COMMA + DW_FORM_block = 0x09 COMMA + DW_FORM_block1 = 0x0a COMMA + DW_FORM_data1 = 0x0b COMMA + DW_FORM_flag = 0x0c COMMA + DW_FORM_sdata = 0x0d COMMA + DW_FORM_strp = 0x0e COMMA + DW_FORM_udata = 0x0f COMMA + DW_FORM_ref_addr = 0x10 COMMA + DW_FORM_ref1 = 0x11 COMMA + DW_FORM_ref2 = 0x12 COMMA + DW_FORM_ref4 = 0x13 COMMA + DW_FORM_ref8 = 0x14 COMMA + DW_FORM_ref_udata = 0x15 COMMA + DW_FORM_indirect = 0x16 +IF_NOT_ASM(};) + +/* Attribute names and codes. 
*/ + +ENUM(dwarf_attribute) + + DW_AT_sibling = 0x01 COMMA + DW_AT_location = 0x02 COMMA + DW_AT_name = 0x03 COMMA + DW_AT_ordering = 0x09 COMMA + DW_AT_subscr_data = 0x0a COMMA + DW_AT_byte_size = 0x0b COMMA + DW_AT_bit_offset = 0x0c COMMA + DW_AT_bit_size = 0x0d COMMA + DW_AT_element_list = 0x0f COMMA + DW_AT_stmt_list = 0x10 COMMA + DW_AT_low_pc = 0x11 COMMA + DW_AT_high_pc = 0x12 COMMA + DW_AT_language = 0x13 COMMA + DW_AT_member = 0x14 COMMA + DW_AT_discr = 0x15 COMMA + DW_AT_discr_value = 0x16 COMMA + DW_AT_visibility = 0x17 COMMA + DW_AT_import = 0x18 COMMA + DW_AT_string_length = 0x19 COMMA + DW_AT_common_reference = 0x1a COMMA + DW_AT_comp_dir = 0x1b COMMA + DW_AT_const_value = 0x1c COMMA + DW_AT_containing_type = 0x1d COMMA + DW_AT_default_value = 0x1e COMMA + DW_AT_inline = 0x20 COMMA + DW_AT_is_optional = 0x21 COMMA + DW_AT_lower_bound = 0x22 COMMA + DW_AT_producer = 0x25 COMMA + DW_AT_prototyped = 0x27 COMMA + DW_AT_return_addr = 0x2a COMMA + DW_AT_start_scope = 0x2c COMMA + DW_AT_stride_size = 0x2e COMMA + DW_AT_upper_bound = 0x2f COMMA + DW_AT_abstract_origin = 0x31 COMMA + DW_AT_accessibility = 0x32 COMMA + DW_AT_address_class = 0x33 COMMA + DW_AT_artificial = 0x34 COMMA + DW_AT_base_types = 0x35 COMMA + DW_AT_calling_convention = 0x36 COMMA + DW_AT_count = 0x37 COMMA + DW_AT_data_member_location = 0x38 COMMA + DW_AT_decl_column = 0x39 COMMA + DW_AT_decl_file = 0x3a COMMA + DW_AT_decl_line = 0x3b COMMA + DW_AT_declaration = 0x3c COMMA + DW_AT_discr_list = 0x3d COMMA + DW_AT_encoding = 0x3e COMMA + DW_AT_external = 0x3f COMMA + DW_AT_frame_base = 0x40 COMMA + DW_AT_friend = 0x41 COMMA + DW_AT_identifier_case = 0x42 COMMA + DW_AT_macro_info = 0x43 COMMA + DW_AT_namelist_items = 0x44 COMMA + DW_AT_priority = 0x45 COMMA + DW_AT_segment = 0x46 COMMA + DW_AT_specification = 0x47 COMMA + DW_AT_static_link = 0x48 COMMA + DW_AT_type = 0x49 COMMA + DW_AT_use_location = 0x4a COMMA + DW_AT_variable_parameter = 0x4b COMMA + DW_AT_virtuality = 0x4c COMMA + DW_AT_vtable_elem_location = 0x4d COMMA + /* DWARF 3 values. */ + DW_AT_allocated = 0x4e COMMA + DW_AT_associated = 0x4f COMMA + DW_AT_data_location = 0x50 COMMA + DW_AT_stride = 0x51 COMMA + DW_AT_entry_pc = 0x52 COMMA + DW_AT_use_UTF8 = 0x53 COMMA + DW_AT_extension = 0x54 COMMA + DW_AT_ranges = 0x55 COMMA + DW_AT_trampoline = 0x56 COMMA + DW_AT_call_column = 0x57 COMMA + DW_AT_call_file = 0x58 COMMA + DW_AT_call_line = 0x59 COMMA + /* SGI/MIPS extensions. */ + DW_AT_MIPS_fde = 0x2001 COMMA + DW_AT_MIPS_loop_begin = 0x2002 COMMA + DW_AT_MIPS_tail_loop_begin = 0x2003 COMMA + DW_AT_MIPS_epilog_begin = 0x2004 COMMA + DW_AT_MIPS_loop_unroll_factor = 0x2005 COMMA + DW_AT_MIPS_software_pipeline_depth = 0x2006 COMMA + DW_AT_MIPS_linkage_name = 0x2007 COMMA + DW_AT_MIPS_stride = 0x2008 COMMA + DW_AT_MIPS_abstract_name = 0x2009 COMMA + DW_AT_MIPS_clone_origin = 0x200a COMMA + DW_AT_MIPS_has_inlines = 0x200b COMMA + /* GNU extensions. */ + DW_AT_sf_names = 0x2101 COMMA + DW_AT_src_info = 0x2102 COMMA + DW_AT_mac_info = 0x2103 COMMA + DW_AT_src_coords = 0x2104 COMMA + DW_AT_body_begin = 0x2105 COMMA + DW_AT_body_end = 0x2106 COMMA + DW_AT_GNU_vector = 0x2107 COMMA + /* VMS extensions. */ + DW_AT_VMS_rtnbeg_pd_address = 0x2201 COMMA + /* UPC extension. */ + DW_AT_upc_threads_scaled = 0x3210 +IF_NOT_ASM(};) + +#define DW_AT_lo_user 0x2000 /* Implementation-defined range start. */ +#define DW_AT_hi_user 0x3ff0 /* Implementation-defined range end. */ + +/* Location atom names and codes. 
*/ +ENUM(dwarf_location_atom) + + DW_OP_addr = 0x03 COMMA + DW_OP_deref = 0x06 COMMA + DW_OP_const1u = 0x08 COMMA + DW_OP_const1s = 0x09 COMMA + DW_OP_const2u = 0x0a COMMA + DW_OP_const2s = 0x0b COMMA + DW_OP_const4u = 0x0c COMMA + DW_OP_const4s = 0x0d COMMA + DW_OP_const8u = 0x0e COMMA + DW_OP_const8s = 0x0f COMMA + DW_OP_constu = 0x10 COMMA + DW_OP_consts = 0x11 COMMA + DW_OP_dup = 0x12 COMMA + DW_OP_drop = 0x13 COMMA + DW_OP_over = 0x14 COMMA + DW_OP_pick = 0x15 COMMA + DW_OP_swap = 0x16 COMMA + DW_OP_rot = 0x17 COMMA + DW_OP_xderef = 0x18 COMMA + DW_OP_abs = 0x19 COMMA + DW_OP_and = 0x1a COMMA + DW_OP_div = 0x1b COMMA + DW_OP_minus = 0x1c COMMA + DW_OP_mod = 0x1d COMMA + DW_OP_mul = 0x1e COMMA + DW_OP_neg = 0x1f COMMA + DW_OP_not = 0x20 COMMA + DW_OP_or = 0x21 COMMA + DW_OP_plus = 0x22 COMMA + DW_OP_plus_uconst = 0x23 COMMA + DW_OP_shl = 0x24 COMMA + DW_OP_shr = 0x25 COMMA + DW_OP_shra = 0x26 COMMA + DW_OP_xor = 0x27 COMMA + DW_OP_bra = 0x28 COMMA + DW_OP_eq = 0x29 COMMA + DW_OP_ge = 0x2a COMMA + DW_OP_gt = 0x2b COMMA + DW_OP_le = 0x2c COMMA + DW_OP_lt = 0x2d COMMA + DW_OP_ne = 0x2e COMMA + DW_OP_skip = 0x2f COMMA + DW_OP_lit0 = 0x30 COMMA + DW_OP_lit1 = 0x31 COMMA + DW_OP_lit2 = 0x32 COMMA + DW_OP_lit3 = 0x33 COMMA + DW_OP_lit4 = 0x34 COMMA + DW_OP_lit5 = 0x35 COMMA + DW_OP_lit6 = 0x36 COMMA + DW_OP_lit7 = 0x37 COMMA + DW_OP_lit8 = 0x38 COMMA + DW_OP_lit9 = 0x39 COMMA + DW_OP_lit10 = 0x3a COMMA + DW_OP_lit11 = 0x3b COMMA + DW_OP_lit12 = 0x3c COMMA + DW_OP_lit13 = 0x3d COMMA + DW_OP_lit14 = 0x3e COMMA + DW_OP_lit15 = 0x3f COMMA + DW_OP_lit16 = 0x40 COMMA + DW_OP_lit17 = 0x41 COMMA + DW_OP_lit18 = 0x42 COMMA + DW_OP_lit19 = 0x43 COMMA + DW_OP_lit20 = 0x44 COMMA + DW_OP_lit21 = 0x45 COMMA + DW_OP_lit22 = 0x46 COMMA + DW_OP_lit23 = 0x47 COMMA + DW_OP_lit24 = 0x48 COMMA + DW_OP_lit25 = 0x49 COMMA + DW_OP_lit26 = 0x4a COMMA + DW_OP_lit27 = 0x4b COMMA + DW_OP_lit28 = 0x4c COMMA + DW_OP_lit29 = 0x4d COMMA + DW_OP_lit30 = 0x4e COMMA + DW_OP_lit31 = 0x4f COMMA + DW_OP_reg0 = 0x50 COMMA + DW_OP_reg1 = 0x51 COMMA + DW_OP_reg2 = 0x52 COMMA + DW_OP_reg3 = 0x53 COMMA + DW_OP_reg4 = 0x54 COMMA + DW_OP_reg5 = 0x55 COMMA + DW_OP_reg6 = 0x56 COMMA + DW_OP_reg7 = 0x57 COMMA + DW_OP_reg8 = 0x58 COMMA + DW_OP_reg9 = 0x59 COMMA + DW_OP_reg10 = 0x5a COMMA + DW_OP_reg11 = 0x5b COMMA + DW_OP_reg12 = 0x5c COMMA + DW_OP_reg13 = 0x5d COMMA + DW_OP_reg14 = 0x5e COMMA + DW_OP_reg15 = 0x5f COMMA + DW_OP_reg16 = 0x60 COMMA + DW_OP_reg17 = 0x61 COMMA + DW_OP_reg18 = 0x62 COMMA + DW_OP_reg19 = 0x63 COMMA + DW_OP_reg20 = 0x64 COMMA + DW_OP_reg21 = 0x65 COMMA + DW_OP_reg22 = 0x66 COMMA + DW_OP_reg23 = 0x67 COMMA + DW_OP_reg24 = 0x68 COMMA + DW_OP_reg25 = 0x69 COMMA + DW_OP_reg26 = 0x6a COMMA + DW_OP_reg27 = 0x6b COMMA + DW_OP_reg28 = 0x6c COMMA + DW_OP_reg29 = 0x6d COMMA + DW_OP_reg30 = 0x6e COMMA + DW_OP_reg31 = 0x6f COMMA + DW_OP_breg0 = 0x70 COMMA + DW_OP_breg1 = 0x71 COMMA + DW_OP_breg2 = 0x72 COMMA + DW_OP_breg3 = 0x73 COMMA + DW_OP_breg4 = 0x74 COMMA + DW_OP_breg5 = 0x75 COMMA + DW_OP_breg6 = 0x76 COMMA + DW_OP_breg7 = 0x77 COMMA + DW_OP_breg8 = 0x78 COMMA + DW_OP_breg9 = 0x79 COMMA + DW_OP_breg10 = 0x7a COMMA + DW_OP_breg11 = 0x7b COMMA + DW_OP_breg12 = 0x7c COMMA + DW_OP_breg13 = 0x7d COMMA + DW_OP_breg14 = 0x7e COMMA + DW_OP_breg15 = 0x7f COMMA + DW_OP_breg16 = 0x80 COMMA + DW_OP_breg17 = 0x81 COMMA + DW_OP_breg18 = 0x82 COMMA + DW_OP_breg19 = 0x83 COMMA + DW_OP_breg20 = 0x84 COMMA + DW_OP_breg21 = 0x85 COMMA + DW_OP_breg22 = 0x86 COMMA + DW_OP_breg23 = 0x87 COMMA + DW_OP_breg24 = 0x88 COMMA + DW_OP_breg25 = 
0x89 COMMA + DW_OP_breg26 = 0x8a COMMA + DW_OP_breg27 = 0x8b COMMA + DW_OP_breg28 = 0x8c COMMA + DW_OP_breg29 = 0x8d COMMA + DW_OP_breg30 = 0x8e COMMA + DW_OP_breg31 = 0x8f COMMA + DW_OP_regx = 0x90 COMMA + DW_OP_fbreg = 0x91 COMMA + DW_OP_bregx = 0x92 COMMA + DW_OP_piece = 0x93 COMMA + DW_OP_deref_size = 0x94 COMMA + DW_OP_xderef_size = 0x95 COMMA + DW_OP_nop = 0x96 COMMA + /* DWARF 3 extensions. */ + DW_OP_push_object_address = 0x97 COMMA + DW_OP_call2 = 0x98 COMMA + DW_OP_call4 = 0x99 COMMA + DW_OP_call_ref = 0x9a COMMA + /* GNU extensions. */ + DW_OP_GNU_push_tls_address = 0xe0 +IF_NOT_ASM(};) + +#define DW_OP_lo_user 0xe0 /* Implementation-defined range start. */ +#define DW_OP_hi_user 0xff /* Implementation-defined range end. */ + +/* Type encodings. */ +ENUM(dwarf_type) + + DW_ATE_void = 0x0 COMMA + DW_ATE_address = 0x1 COMMA + DW_ATE_boolean = 0x2 COMMA + DW_ATE_complex_float = 0x3 COMMA + DW_ATE_float = 0x4 COMMA + DW_ATE_signed = 0x5 COMMA + DW_ATE_signed_char = 0x6 COMMA + DW_ATE_unsigned = 0x7 COMMA + DW_ATE_unsigned_char = 0x8 COMMA + /* DWARF 3. */ + DW_ATE_imaginary_float = 0x9 +IF_NOT_ASM(};) + +#define DW_ATE_lo_user 0x80 +#define DW_ATE_hi_user 0xff + +/* Array ordering names and codes. */ +ENUM(dwarf_array_dim_ordering) + + DW_ORD_row_major = 0 COMMA + DW_ORD_col_major = 1 +IF_NOT_ASM(};) + +/* Access attribute. */ +ENUM(dwarf_access_attribute) + + DW_ACCESS_public = 1 COMMA + DW_ACCESS_protected = 2 COMMA + DW_ACCESS_private = 3 +IF_NOT_ASM(};) + +/* Visibility. */ +ENUM(dwarf_visibility_attribute) + + DW_VIS_local = 1 COMMA + DW_VIS_exported = 2 COMMA + DW_VIS_qualified = 3 +IF_NOT_ASM(};) + +/* Virtuality. */ +ENUM(dwarf_virtuality_attribute) + + DW_VIRTUALITY_none = 0 COMMA + DW_VIRTUALITY_virtual = 1 COMMA + DW_VIRTUALITY_pure_virtual = 2 +IF_NOT_ASM(};) + +/* Case sensitivity. */ +ENUM(dwarf_id_case) + + DW_ID_case_sensitive = 0 COMMA + DW_ID_up_case = 1 COMMA + DW_ID_down_case = 2 COMMA + DW_ID_case_insensitive = 3 +IF_NOT_ASM(};) + +/* Calling convention. */ +ENUM(dwarf_calling_convention) + + DW_CC_normal = 0x1 COMMA + DW_CC_program = 0x2 COMMA + DW_CC_nocall = 0x3 +IF_NOT_ASM(};) + +#define DW_CC_lo_user 0x40 +#define DW_CC_hi_user 0xff + +/* Inline attribute. */ +ENUM(dwarf_inline_attribute) + + DW_INL_not_inlined = 0 COMMA + DW_INL_inlined = 1 COMMA + DW_INL_declared_not_inlined = 2 COMMA + DW_INL_declared_inlined = 3 +IF_NOT_ASM(};) + +/* Discriminant lists. */ +ENUM(dwarf_discrim_list) + + DW_DSC_label = 0 COMMA + DW_DSC_range = 1 +IF_NOT_ASM(};) + +/* Line number opcodes. */ +ENUM(dwarf_line_number_ops) + + DW_LNS_extended_op = 0 COMMA + DW_LNS_copy = 1 COMMA + DW_LNS_advance_pc = 2 COMMA + DW_LNS_advance_line = 3 COMMA + DW_LNS_set_file = 4 COMMA + DW_LNS_set_column = 5 COMMA + DW_LNS_negate_stmt = 6 COMMA + DW_LNS_set_basic_block = 7 COMMA + DW_LNS_const_add_pc = 8 COMMA + DW_LNS_fixed_advance_pc = 9 COMMA + /* DWARF 3. */ + DW_LNS_set_prologue_end = 10 COMMA + DW_LNS_set_epilogue_begin = 11 COMMA + DW_LNS_set_isa = 12 +IF_NOT_ASM(};) + +/* Line number extended opcodes. */ +ENUM(dwarf_line_number_x_ops) + + DW_LNE_end_sequence = 1 COMMA + DW_LNE_set_address = 2 COMMA + DW_LNE_define_file = 3 +IF_NOT_ASM(};) + +/* Call frame information. 
*/ +ENUM(dwarf_call_frame_info) + + DW_CFA_advance_loc = 0x40 COMMA + DW_CFA_offset = 0x80 COMMA + DW_CFA_restore = 0xc0 COMMA + DW_CFA_nop = 0x00 COMMA + DW_CFA_set_loc = 0x01 COMMA + DW_CFA_advance_loc1 = 0x02 COMMA + DW_CFA_advance_loc2 = 0x03 COMMA + DW_CFA_advance_loc4 = 0x04 COMMA + DW_CFA_offset_extended = 0x05 COMMA + DW_CFA_restore_extended = 0x06 COMMA + DW_CFA_undefined = 0x07 COMMA + DW_CFA_same_value = 0x08 COMMA + DW_CFA_register = 0x09 COMMA + DW_CFA_remember_state = 0x0a COMMA + DW_CFA_restore_state = 0x0b COMMA + DW_CFA_def_cfa = 0x0c COMMA + DW_CFA_def_cfa_register = 0x0d COMMA + DW_CFA_def_cfa_offset = 0x0e COMMA + + /* DWARF 3. */ + DW_CFA_def_cfa_expression = 0x0f COMMA + DW_CFA_expression = 0x10 COMMA + DW_CFA_offset_extended_sf = 0x11 COMMA + DW_CFA_def_cfa_sf = 0x12 COMMA + DW_CFA_def_cfa_offset_sf = 0x13 COMMA + + /* SGI/MIPS specific. */ + DW_CFA_MIPS_advance_loc8 = 0x1d COMMA + + /* GNU extensions. */ + DW_CFA_GNU_window_save = 0x2d COMMA + DW_CFA_GNU_args_size = 0x2e COMMA + DW_CFA_GNU_negative_offset_extended = 0x2f +IF_NOT_ASM(};) + +#define DW_CIE_ID 0xffffffff +#define DW_CIE_VERSION 1 + +#define DW_CFA_extended 0 +#define DW_CFA_lo_user 0x1c +#define DW_CFA_hi_user 0x3f + +#define DW_CHILDREN_no 0x00 +#define DW_CHILDREN_yes 0x01 + +#define DW_ADDR_none 0 + +/* Source language names and codes. */ +ENUM(dwarf_source_language) + + DW_LANG_C89 = 0x0001 COMMA + DW_LANG_C = 0x0002 COMMA + DW_LANG_Ada83 = 0x0003 COMMA + DW_LANG_C_plus_plus = 0x0004 COMMA + DW_LANG_Cobol74 = 0x0005 COMMA + DW_LANG_Cobol85 = 0x0006 COMMA + DW_LANG_Fortran77 = 0x0007 COMMA + DW_LANG_Fortran90 = 0x0008 COMMA + DW_LANG_Pascal83 = 0x0009 COMMA + DW_LANG_Modula2 = 0x000a COMMA + DW_LANG_Java = 0x000b COMMA + /* DWARF 3. */ + DW_LANG_C99 = 0x000c COMMA + DW_LANG_Ada95 = 0x000d COMMA + DW_LANG_Fortran95 = 0x000e COMMA + /* MIPS. */ + DW_LANG_Mips_Assembler = 0x8001 COMMA + /* UPC. */ + DW_LANG_Upc = 0x8765 +IF_NOT_ASM(};) + +#define DW_LANG_lo_user 0x8000 /* Implementation-defined range start. */ +#define DW_LANG_hi_user 0xffff /* Implementation-defined range start. */ + +/* Names and codes for macro information. */ +ENUM(dwarf_macinfo_record_type) + + DW_MACINFO_define = 1 COMMA + DW_MACINFO_undef = 2 COMMA + DW_MACINFO_start_file = 3 COMMA + DW_MACINFO_end_file = 4 COMMA + DW_MACINFO_vendor_ext = 255 +IF_NOT_ASM(};) + +/* @@@ For use with GNU frame unwind information. 
+ */ + +#define DW_EH_PE_absptr 0x00 +#define DW_EH_PE_omit 0xff + +#define DW_EH_PE_uleb128 0x01 +#define DW_EH_PE_udata2 0x02 +#define DW_EH_PE_udata4 0x03 +#define DW_EH_PE_udata8 0x04 +#define DW_EH_PE_sleb128 0x09 +#define DW_EH_PE_sdata2 0x0A +#define DW_EH_PE_sdata4 0x0B +#define DW_EH_PE_sdata8 0x0C +#define DW_EH_PE_signed 0x08 + +#define DW_EH_PE_pcrel 0x10 +#define DW_EH_PE_textrel 0x20 +#define DW_EH_PE_datarel 0x30 +#define DW_EH_PE_funcrel 0x40 +#define DW_EH_PE_aligned 0x50 + +#define DW_EH_PE_indirect 0x80 + +#endif /* _ELF_DWARF2_H */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/linux/dwarf2-lang.h xx/include/linux/dwarf2-lang.h --- 2.6.5-rc2/include/linux/dwarf2-lang.h 1970-01-01 01:00:00.000000000 +0100 +++ xx/include/linux/dwarf2-lang.h 2004-03-21 15:21:40.000000000 +0100 @@ -0,0 +1,132 @@ +#ifndef DWARF2_LANG +#define DWARF2_LANG +#include + +/* + * This is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2, or (at your option) any later + * version. + */ +/* + * This file defines macros that allow generation of DWARF debug records + * for asm files. This file is platform independent. Register numbers + * (which are about the only thing that is platform dependent) are to be + * supplied by a platform defined file. + */ +#define DWARF_preamble() .section .debug_frame,"",@progbits +/* + * This macro starts a debug frame section. The debug_frame describes + * where to find the registers that the enclosing function saved on + * entry. + * + * ORD is used by the label generator and should be the same as what is + * passed to CFI_postamble. + * + * pc, pc register gdb ordinal. + * + * code_align this is the factor used to define locations or regions + * where the given definitions apply. If you use labels to define these + * this should be 1. + * + * data_align this is the factor used to define register offsets. If + * you use struct offset, this should be the size of the register in + * bytes or the negative of that. This is how it is used: you will + * define a register as the reference register, say the stack pointer, + * then you will say where a register is located relative to this + * reference register's value, say 40 for register 3 (the gdb register + * number). The <40> will be multiplied by <data_align> to define the + * byte offset of the given register (3, in this example). So if your + * <40> is the byte offset and the reference register points at the + * beginning, you would want 1 for the data_align. If <40> was the 40th + * 4-byte element in that structure you would want 4. And if your + * reference register points at the end of the structure you would want + * a negative data_align value (and you would have to do other math as + * well). + */ + +#define CFI_preamble(ORD, pc, code_align, data_align) \ +.section .debug_frame,"",@progbits ; \ +frame/**/_/**/ORD: \ + .long end/**/_/**/ORD-start/**/_/**/ORD; \ +start/**/_/**/ORD: \ + .long DW_CIE_ID; \ + .byte DW_CIE_VERSION; \ + .byte 0 ; \ + .uleb128 code_align; \ + .sleb128 data_align; \ + .byte pc; + +/* + * After the above macro and prior to the CFI_postamble, you need to + * define the initial state. This starts with defining the reference + * register and, usually, the pc.
Here are some helper macros: + */ + +#define CFA_define_reference(reg, offset) \ + .byte DW_CFA_def_cfa; \ + .uleb128 reg; \ + .uleb128 (offset); + +#define CFA_define_offset(reg, offset) \ + .byte (DW_CFA_offset + reg); \ + .uleb128 (offset); + +#define CFI_postamble(ORD) \ + .align 4; \ +end/**/_/**/ORD: +/* + * So now your code pushes stuff on the stack, you need a new location + * and the rules for what to do. This starts a running description of + * the call frame. You need to describe what changes with respect to + * the call registers as the location of the pc moves through the code. + * The following builds an FDE (frame description entry). Like the + * above, it has a preamble and a postamble. It also is tied to the CFI + * above. + * The first entry after the preamble must be the location in the code + * that the call frame is being described for. + */ +#define FDE_preamble(ORD, fde_no, initial_address, length) \ + .long FDE_end/**/_/**/fde_no-FDE_start/**/_/**/fde_no; \ +FDE_start/**/_/**/fde_no: \ + .long frame/**/_/**/ORD; \ + .long initial_address; \ + .long length; + +#define FDE_postamble(fde_no) \ + .align 4; \ +FDE_end/**/_/**/fde_no: +/* + * That done, you can now add registers, subtract registers, move the + * reference and even change the reference. You can also define a new + * area of code the info applies to. For discontinuous bits you should + * start a new FDE. You may have as many as you like. + */ + +/* + * To advance the address by <bytes>. + */ + +#define FDE_advance(bytes) \ + .byte DW_CFA_advance_loc4; \ + .long bytes + + + +/* + * With the above you can define all the register locations. But + * suppose the reference register moves... Takes the new offset NOT an + * increment. This is how esp is tracked if it is not saved. + */ + +#define CFA_define_cfa_offset(offset) \ + .byte DW_CFA_def_cfa_offset; \ + .uleb128 (offset); +/* + * Or suppose you want to use a different reference register... + */ +#define CFA_define_cfa_register(reg) \ + .byte DW_CFA_def_cfa_register; \ + .uleb128 reg; + +#endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/linux/fs.h xx/include/linux/fs.h --- 2.6.5-rc2/include/linux/fs.h 2004-03-21 15:09:25.000000000 +0100 +++ xx/include/linux/fs.h 2004-03-21 15:21:41.000000000 +0100 @@ -322,11 +322,7 @@ struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ - spinlock_t page_lock; /* and spinlock protecting it */ - struct list_head clean_pages; /* list of clean pages */ - struct list_head dirty_pages; /* list of dirty pages */ - struct list_head locked_pages; /* list of locked pages */ - struct list_head io_pages; /* being prepared for I/O */ + spinlock_t tree_lock; /* and spinlock protecting it */ unsigned long nrpages; /* number of total pages */ struct address_space_operations *a_ops; /* methods */ struct list_head i_mmap; /* list of private mappings */ @@ -366,6 +362,15 @@ struct block_device { }; /* + * Radix-tree tags, for tagging dirty and writeback pages within the pagecache + * radix trees + */ +#define PAGECACHE_TAG_DIRTY 0 +#define PAGECACHE_TAG_WRITEBACK 1 + +int mapping_tagged(struct address_space *mapping, int tag); + +/* + * Use sequence counter to get consistent i_size on 32-bit processors.
*/ #if BITS_PER_LONG==32 && defined(CONFIG_SMP) diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/linux/mm.h xx/include/linux/mm.h --- 2.6.5-rc2/include/linux/mm.h 2004-03-21 15:09:25.000000000 +0100 +++ xx/include/linux/mm.h 2004-03-21 15:21:42.000000000 +0100 @@ -39,6 +39,22 @@ extern int page_cluster; * mmap() functions). */ +typedef struct anon_vma_s { + /* This serializes the accesses to the vma list. */ + spinlock_t anon_vma_lock; + + /* + * This is a list of anonymous "related" vmas, + * to scan if one of the pages pointing to this + * anon_vma needs to be unmapped. + * After we unlink the last vma we must garbage collect + * the object if the list is empty because we're + * guaranteed no page can be pointing to this anon_vma + * if there's no vma anymore. + */ + struct list_head anon_vma_head; +} anon_vma_t; + /* * This struct defines a memory VMM memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory @@ -69,6 +85,19 @@ struct vm_area_struct { */ struct list_head shared; + /* + * The same vma can be both queued into the i_mmap and in an + * anon_vma too, for example after a COW in + * a MAP_PRIVATE file mapping. However only the MAP_PRIVATE + * will go both in the i_mmap and anon_vma. A MAP_SHARED + * will only be in the i_mmap_shared and a MAP_ANONYMOUS (file = 0) + * will be queued only in the anon_vma. + * The list is serialized by the anon_vma->lock. + */ + struct list_head anon_vma_node; + /* Serialized by the mmap_sem */ + anon_vma_t * anon_vma; + /* Function pointers to deal with this struct. */ struct vm_operations_struct * vm_ops; @@ -177,16 +206,32 @@ struct page { page_flags_t flags; /* atomic flags, some possibly updated asynchronously */ atomic_t count; /* Usage count, see below. */ - struct list_head list; /* ->mapping has some page lists. */ - struct address_space *mapping; /* The inode (or ...) we belong to. */ - unsigned long index; /* Our offset within mapping. */ + /* + * Number of ptes mapping this page. + * It's serialized by PG_maplock. + * This is needed only to maintain the nr_mapped global info + * so it would be nice to drop it. + */ + unsigned int mapcount; + + pgoff_t index; /* Our offset within mapping. */ struct list_head lru; /* Pageout list, eg. active_list; protected by zone->lru_lock !! */ - union { - struct pte_chain *chain;/* Reverse pte mapping pointer. - * protected by PG_chainlock */ - pte_addr_t direct; - } pte; + + /* + * Address space of this page. + * A page can either be mapped to a file or be anonymous + * memory, so using the union is optimal here. The PG_anon + * bitflag tells if this is anonymous or a file-mapping. + * If PG_anon is clear we use the as.mapping otherwise we + * use the as.anon_vma. + * The inode address space if it's a file mapping. + * An anon_vma object if it's an anonymous mapping. + * The anon_vma can't go away under us if we hold the + * PG_maplock.
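[Editorial sketch] The struct page rework above trades the pte_chain for a bare mapcount, kept under PG_maplock mostly so the global nr_mapped statistic stays correct, and page_mapped() becomes a simple counter test. A minimal standalone model of that bookkeeping; the helpers below are illustrative stand-ins for the real rmap functions:

#include <assert.h>
#include <stdio.h>

/* Stand-in for the trimmed struct page: just the rmap bookkeeping. */
struct page_model {
	unsigned int mapcount;	/* number of ptes mapping the page */
};

static unsigned long nr_mapped;	/* the global the comment mentions */

/* What page_add_rmap()/page_remove_rmap()-style helpers must keep
 * consistent (the real ones do this under PG_maplock). */
static void map_page(struct page_model *p)
{
	if (p->mapcount++ == 0)
		nr_mapped++;	/* first mapping: page becomes "mapped" */
}

static void unmap_page(struct page_model *p)
{
	assert(p->mapcount > 0);
	if (--p->mapcount == 0)
		nr_mapped--;	/* last mapping gone */
}

static int page_mapped(struct page_model *p)
{
	return p->mapcount != 0;	/* the new, simpler test */
}

int main(void)
{
	struct page_model page = { 0 };

	map_page(&page);
	map_page(&page);	/* e.g. the same file page in two processes */
	unmap_page(&page);
	printf("mapped=%d nr_mapped=%lu\n", page_mapped(&page), nr_mapped);
	unmap_page(&page);
	printf("mapped=%d nr_mapped=%lu\n", page_mapped(&page), nr_mapped);
	return 0;
}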
+ */ + struct address_space * mapping; + unsigned long private; /* mapping-private opaque data */ /* @@ -239,24 +284,24 @@ extern void FASTCALL(__page_cache_releas static inline int page_count(struct page *p) { if (PageCompound(p)) - p = (struct page *)p->lru.next; + p = (struct page *)p->private; return atomic_read(&(p)->count); } static inline void get_page(struct page *page) { if (PageCompound(page)) - page = (struct page *)page->lru.next; + page = (struct page *)page->private; atomic_inc(&page->count); } static inline void put_page(struct page *page) { if (PageCompound(page)) { - page = (struct page *)page->lru.next; + page = (struct page *)page->private; if (put_page_testzero(page)) { - if (page->lru.prev) { /* destructor? */ - (*(void (*)(struct page *))page->lru.prev)(page); + if (page[1].mapping) { /* destructor? */ + (*(void (*)(struct page *))page[1].mapping)(page); } else { __page_cache_release(page); } @@ -401,13 +446,11 @@ void page_address_init(void); #endif /* - * Return true if this page is mapped into pagetables. Subtle: test pte.direct - * rather than pte.chain. Because sometimes pte.direct is 64-bit, and .chain - * is only 32-bit. + * Return true if this page is mapped into pagetables. */ static inline int page_mapped(struct page *page) { - return page->pte.direct != 0; + return page->mapcount; } /* @@ -462,7 +505,8 @@ extern int handle_mm_fault(struct mm_str extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot); + unsigned long address, pgprot_t prot, + struct vm_area_struct *vma); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); @@ -470,6 +514,8 @@ int get_user_pages(struct task_struct *t int __set_page_dirty_buffers(struct page *page); int __set_page_dirty_nobuffers(struct page *page); int set_page_dirty_lock(struct page *page); +int FASTCALL(set_page_dirty(struct page *page)); +int clear_page_dirty_for_io(struct page *page); /* * Prototype to add a shrinker callback for ageable caches. @@ -494,21 +540,16 @@ struct shrinker; extern struct shrinker *set_shrinker(int, shrinker_t); extern void remove_shrinker(struct shrinker *shrinker); -/* - * If the mapping doesn't provide a set_page_dirty a_op, then - * just fall through and assume that it wants buffer_heads. - * FIXME: make the method unconditional. 
- */ -static inline int set_page_dirty(struct page *page) -{ - if (page->mapping) { - int (*spd)(struct page *); - - spd = page->mapping->a_ops->set_page_dirty; - if (spd) - return (*spd)(page); - } - return __set_page_dirty_buffers(page); +static inline struct address_space * page_mapping(struct page * page) +{ + extern struct address_space swapper_space; + struct address_space * mapping = NULL; + + if (unlikely(PageSwapCache(page))) + mapping = &swapper_space; + else if (!PageAnon(page)) + mapping = page->mapping; + return mapping; } /* diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/linux/objrmap.h xx/include/linux/objrmap.h --- 2.6.5-rc2/include/linux/objrmap.h 1970-01-01 01:00:00.000000000 +0100 +++ xx/include/linux/objrmap.h 2004-03-21 15:21:42.000000000 +0100 @@ -0,0 +1,78 @@ +#ifndef _LINUX_RMAP_H +#define _LINUX_RMAP_H +/* + * Declarations for Object Reverse Mapping functions in mm/objrmap.c + */ +#include + +#ifdef CONFIG_MMU + +#include +#include +#include + +extern kmem_cache_t * anon_vma_cachep; + +#define page_map_lock(page) bit_spin_lock(PG_maplock, &page->flags) +#define page_map_unlock(page) bit_spin_unlock(PG_maplock, &page->flags) + +static inline void anon_vma_free(anon_vma_t * anon_vma) +{ + kmem_cache_free(anon_vma_cachep, anon_vma); +} + +static inline anon_vma_t * anon_vma_alloc(void) +{ + return kmem_cache_alloc(anon_vma_cachep, SLAB_KERNEL); +} + +static inline void anon_vma_lock(struct vm_area_struct * vma) +{ + anon_vma_t * anon_vma = vma->anon_vma; + if (anon_vma) + spin_lock(&anon_vma->anon_vma_lock); +} + +static inline void anon_vma_unlock(struct vm_area_struct * vma) +{ + anon_vma_t * anon_vma = vma->anon_vma; + if (anon_vma) + spin_unlock(&anon_vma->anon_vma_lock); +} + +/* + * anon_vma helper functions. The one starting with __ requires + * the caller to hold the anon_vma_lock, the other takes it + * internally. 
+ */ +extern int FASTCALL(anon_vma_prepare(struct vm_area_struct * vma)); +extern void FASTCALL(anon_vma_merge(struct vm_area_struct * vma, + struct vm_area_struct * vma_dying)); +extern void FASTCALL(anon_vma_unlink(struct vm_area_struct * vma)); +extern void FASTCALL(anon_vma_link(struct vm_area_struct * vma)); +extern void FASTCALL(__anon_vma_link(struct vm_area_struct * vma)); + +/* objrmap tracking functions */ +void FASTCALL(page_add_rmap(struct page *, struct vm_area_struct *, unsigned long, int)); +void FASTCALL(page_remove_rmap(struct page *)); + +/* + * Called from mm/vmscan.c to handle paging out + */ +int FASTCALL(try_to_unmap(struct page *)); +int FASTCALL(page_referenced(struct page *)); + +/* + * Return values of try_to_unmap + */ +#define SWAP_SUCCESS 0 +#define SWAP_AGAIN 1 +#define SWAP_FAIL 2 + +#else /* !CONFIG_MMU */ + +#define page_referenced(page) TestClearPageReferenced(page) + +#endif /* CONFIG_MMU */ + +#endif /* _LINUX_RMAP_H */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/linux/page-flags.h xx/include/linux/page-flags.h --- 2.6.5-rc2/include/linux/page-flags.h 2004-03-21 15:09:25.000000000 +0100 +++ xx/include/linux/page-flags.h 2004-03-21 15:21:42.000000000 +0100 @@ -69,12 +69,13 @@ #define PG_private 12 /* Has something at ->private */ #define PG_writeback 13 /* Page is under writeback */ #define PG_nosave 14 /* Used for system suspend/resume */ -#define PG_chainlock 15 /* lock bit for ->pte_chain */ +#define PG_maplock 15 /* lock bit for ->as.anon_vma and ->mapcount */ -#define PG_direct 16 /* ->pte_chain points directly at pte */ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ #define PG_compound 19 /* Part of a compound page */ +#define PG_anon 20 /* Anonymous page */ +#define PG_swapcache 21 /* SwapCache page */ /* @@ -279,12 +280,6 @@ extern void get_full_page_state(struct p #define ClearPageNosave(page) clear_bit(PG_nosave, &(page)->flags) #define TestClearPageNosave(page) test_and_clear_bit(PG_nosave, &(page)->flags) -#define PageDirect(page) test_bit(PG_direct, &(page)->flags) -#define SetPageDirect(page) set_bit(PG_direct, &(page)->flags) -#define TestSetPageDirect(page) test_and_set_bit(PG_direct, &(page)->flags) -#define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags) -#define TestClearPageDirect(page) test_and_clear_bit(PG_direct, &(page)->flags) - #define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags) #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags) #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags) @@ -298,13 +293,14 @@ extern void get_full_page_state(struct p #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) -/* - * The PageSwapCache predicate doesn't use a PG_flag at this time, - * but it may again do so one day. 
- */ +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) + #ifdef CONFIG_SWAP -extern struct address_space swapper_space; -#define PageSwapCache(page) ((page)->mapping == &swapper_space) +#define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags) +#define SetPageSwapCache(page) set_bit(PG_swapcache, &(page)->flags) +#define ClearPageSwapCache(page) clear_bit(PG_swapcache, &(page)->flags) #else #define PageSwapCache(page) 0 #endif @@ -312,10 +308,18 @@ extern struct address_space swapper_spac struct page; /* forward declaration */ int test_clear_page_dirty(struct page *page); +int __clear_page_dirty(struct page *page); +int test_clear_page_writeback(struct page *page); +int test_set_page_writeback(struct page *page); static inline void clear_page_dirty(struct page *page) { test_clear_page_dirty(page); } +static inline void set_page_writeback(struct page *page) +{ + test_set_page_writeback(page); +} + #endif /* PAGE_FLAGS_H */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/linux/pagemap.h xx/include/linux/pagemap.h --- 2.6.5-rc2/include/linux/pagemap.h 2004-01-15 18:36:24.000000000 +0100 +++ xx/include/linux/pagemap.h 2004-03-21 15:21:42.000000000 +0100 @@ -69,9 +69,10 @@ extern struct page * find_trylock_page(s unsigned long index); extern struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask); -extern unsigned int find_get_pages(struct address_space *mapping, - pgoff_t start, unsigned int nr_pages, - struct page **pages); +unsigned find_get_pages(struct address_space *mapping, pgoff_t start, + unsigned int nr_pages, struct page **pages); +unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, + int tag, unsigned int nr_pages, struct page **pages); /* * Returns locked page at given index in given cache, creating it if needed. 
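/*
 * Editor's sketch, not part of the patch: one way a caller might
 * drain every page tagged PAGECACHE_TAG_WRITEBACK using the
 * find_get_pages_tag() interface declared above.  The function
 * takes a reference on each page it returns and advances *index
 * past the last page found, so the loop ends when no tagged pages
 * remain.  "handle_page" is a hypothetical helper; locking and
 * error handling are elided.
 */
static void walk_writeback_pages(struct address_space *mapping)
{
	struct page *pages[16];
	pgoff_t index = 0;
	unsigned nr;

	while ((nr = find_get_pages_tag(mapping, &index,
				PAGECACHE_TAG_WRITEBACK, 16, pages))) {
		unsigned i;

		for (i = 0; i < nr; i++) {
			handle_page(pages[i]);		/* hypothetical */
			page_cache_release(pages[i]);	/* drop the ref */
		}
	}
}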
@@ -141,9 +142,16 @@ static inline unsigned long get_page_cac static inline void ___add_to_page_cache(struct page *page, struct address_space *mapping, unsigned long index) { - list_add(&page->list, &mapping->clean_pages); - page->mapping = mapping; - page->index = index; + extern struct address_space swapper_space; + + if (likely(mapping != &swapper_space)) { + BUG_ON(PageAnon(page)); + page->mapping = mapping; + page->index = index; + } else { + SetPageSwapCache(page); + page->private = index; + } mapping->nrpages++; pagecache_acct(1); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/linux/pagevec.h xx/include/linux/pagevec.h --- 2.6.5-rc2/include/linux/pagevec.h 2002-11-22 20:32:14.000000000 +0100 +++ xx/include/linux/pagevec.h 2004-03-21 15:21:40.000000000 +0100 @@ -22,8 +22,11 @@ void __pagevec_free(struct pagevec *pvec void __pagevec_lru_add(struct pagevec *pvec); void __pagevec_lru_add_active(struct pagevec *pvec); void pagevec_strip(struct pagevec *pvec); -unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, - pgoff_t start, unsigned int nr_pages); +unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, + pgoff_t start, unsigned nr_pages); +unsigned pagevec_lookup_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, int tag, + unsigned nr_pages); static inline void pagevec_init(struct pagevec *pvec, int cold) { diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/linux/radix-tree.h xx/include/linux/radix-tree.h --- 2.6.5-rc2/include/linux/radix-tree.h 2003-04-17 07:02:36.000000000 +0200 +++ xx/include/linux/radix-tree.h 2004-03-21 15:21:40.000000000 +0100 @@ -20,8 +20,7 @@ #define _LINUX_RADIX_TREE_H #include - -struct radix_tree_node; +#include struct radix_tree_root { unsigned int height; @@ -29,25 +28,40 @@ struct radix_tree_root { struct radix_tree_node *rnode; }; -#define RADIX_TREE_INIT(mask) {0, (mask), NULL} +#define RADIX_TREE_INIT(mask) { \ + .height = 0, \ + .gfp_mask = (mask), \ + .rnode = NULL, \ +} #define RADIX_TREE(name, mask) \ struct radix_tree_root name = RADIX_TREE_INIT(mask) -#define INIT_RADIX_TREE(root, mask) \ -do { \ - (root)->height = 0; \ - (root)->gfp_mask = (mask); \ - (root)->rnode = NULL; \ +#define INIT_RADIX_TREE(root, mask) \ +do { \ + (root)->height = 0; \ + (root)->gfp_mask = (mask); \ + (root)->rnode = NULL; \ } while (0) -extern int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); -extern void *radix_tree_lookup(struct radix_tree_root *, unsigned long); -extern void *radix_tree_delete(struct radix_tree_root *, unsigned long); -extern unsigned int +int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); +void *radix_tree_lookup(struct radix_tree_root *, unsigned long); +void *radix_tree_delete(struct radix_tree_root *, unsigned long); +unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items); int radix_tree_preload(int gfp_mask); +void radix_tree_init(void); +void *radix_tree_tag_set(struct radix_tree_root *root, + unsigned long index, int tag); +void *radix_tree_tag_clear(struct radix_tree_root *root, + unsigned long index, int tag); +int radix_tree_tag_get(struct radix_tree_root *root, + unsigned long index, int tag); +unsigned int +radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned int max_items, int tag); 
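/*
 * Editor's sketch, not part of the patch: a round trip through the
 * tag API declared in this header.  Tags are small integers below
 * RADIX_TREE_TAGS (0 and 1 in this patch); preloading and locking
 * are the caller's responsibility and are elided here.
 */
static void radix_tag_example(struct radix_tree_root *root, void *item)
{
	void *results[1];

	if (radix_tree_insert(root, 10, item))
		return;				/* -ENOMEM or -EEXIST */
	radix_tree_tag_set(root, 10, 0);	/* tag slot 10 with tag 0 */
	if (radix_tree_tagged(root, 0))		/* cheap root-level test */
		/* finds "item": the only slot carrying tag 0 */
		radix_tree_gang_lookup_tag(root, results, 0, 1, 0);
	radix_tree_tag_clear(root, 10, 0);
	radix_tree_delete(root, 10);
}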
+int radix_tree_tagged(struct radix_tree_root *root, int tag); static inline void radix_tree_preload_end(void) { diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/linux/rmap-locking.h xx/include/linux/rmap-locking.h --- 2.6.5-rc2/include/linux/rmap-locking.h 2004-03-21 15:09:25.000000000 +0100 +++ xx/include/linux/rmap-locking.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,23 +0,0 @@ -/* - * include/linux/rmap-locking.h - * - * Locking primitives for exclusive access to a page's reverse-mapping - * pte chain. - */ - -#include - -struct pte_chain; -extern kmem_cache_t *pte_chain_cache; - -#define pte_chain_lock(page) bit_spin_lock(PG_chainlock, (unsigned long *)&page->flags) -#define pte_chain_unlock(page) bit_spin_unlock(PG_chainlock, (unsigned long *)&page->flags) - -struct pte_chain *pte_chain_alloc(int gfp_flags); -void __pte_chain_free(struct pte_chain *pte_chain); - -static inline void pte_chain_free(struct pte_chain *pte_chain) -{ - if (pte_chain) - __pte_chain_free(pte_chain); -} diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/linux/serial_core.h xx/include/linux/serial_core.h --- 2.6.5-rc2/include/linux/serial_core.h 2004-03-11 08:27:46.000000000 +0100 +++ xx/include/linux/serial_core.h 2004-03-21 15:21:40.000000000 +0100 @@ -160,7 +160,9 @@ struct uart_port { unsigned char x_char; /* xon/xoff char */ unsigned char regshift; /* reg offset shift */ unsigned char iotype; /* io access style */ - +#ifdef CONFIG_KGDB + int kgdb; /* in use by kgdb */ +#endif #define UPIO_PORT (0) #define UPIO_HUB6 (1) #define UPIO_MEM (2) diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/linux/spinlock.h xx/include/linux/spinlock.h --- 2.6.5-rc2/include/linux/spinlock.h 2003-07-17 01:55:09.000000000 +0200 +++ xx/include/linux/spinlock.h 2004-03-21 15:21:40.000000000 +0100 @@ -15,6 +15,12 @@ #include /* for cpu relax */ #include +#ifdef CONFIG_KGDB +#include +#define SET_WHO(x, him) (x)->who = him; +#else +#define SET_WHO(x, him) +#endif /* * Must define these before including other files, inline functions need them @@ -55,6 +61,9 @@ typedef struct { const char *module; char *owner; int oline; +#ifdef CONFIG_KGDB + struct task_struct *who; +#endif } spinlock_t; #define SPIN_LOCK_UNLOCKED (spinlock_t) { SPINLOCK_MAGIC, 0, 10, __FILE__ , NULL, 0} @@ -66,6 +75,7 @@ typedef struct { (x)->module = __FILE__; \ (x)->owner = NULL; \ (x)->oline = 0; \ + SET_WHO(x, NULL) \ } while (0) #define CHECK_LOCK(x) \ @@ -88,6 +98,7 @@ typedef struct { (x)->lock = 1; \ (x)->owner = __FILE__; \ (x)->oline = __LINE__; \ + SET_WHO(x, current) \ } while (0) /* without debugging, spin_is_locked on UP always says @@ -118,6 +129,7 @@ typedef struct { (x)->lock = 1; \ (x)->owner = __FILE__; \ (x)->oline = __LINE__; \ + SET_WHO(x, current) \ 1; \ }) diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/include/linux/swap.h xx/include/linux/swap.h --- 2.6.5-rc2/include/linux/swap.h 2004-02-04 16:07:05.000000000 +0100 +++ xx/include/linux/swap.h 2004-03-21 15:21:42.000000000 +0100 @@ -76,7 +76,6 @@ struct reclaim_state { #ifdef __KERNEL__ struct address_space; -struct pte_chain; struct sysinfo; struct writeback_control; struct zone; @@ -177,26 +176,11 @@ extern int try_to_free_pages(struct zone extern int shrink_all_memory(int); extern int vm_swappiness; -/* linux/mm/rmap.c */ #ifdef CONFIG_MMU -int FASTCALL(page_referenced(struct page *)); -struct 
pte_chain *FASTCALL(page_add_rmap(struct page *, pte_t *, - struct pte_chain *)); -void FASTCALL(page_remove_rmap(struct page *, pte_t *)); -int FASTCALL(try_to_unmap(struct page *)); - /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); -#else -#define page_referenced(page) TestClearPageReferenced(page) -#define try_to_unmap(page) SWAP_FAIL #endif /* CONFIG_MMU */ -/* return values of try_to_unmap */ -#define SWAP_SUCCESS 0 -#define SWAP_AGAIN 1 -#define SWAP_FAIL 2 - #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ extern int swap_readpage(struct file *, struct page *); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/init/main.c xx/init/main.c --- 2.6.5-rc2/init/main.c 2004-03-21 15:09:25.000000000 +0100 +++ xx/init/main.c 2004-03-21 15:21:42.000000000 +0100 @@ -84,7 +84,7 @@ extern void signals_init(void); extern void buffer_init(void); extern void pidhash_init(void); extern void pidmap_init(void); -extern void pte_chain_init(void); +extern void anon_vma_init(void); extern void radix_tree_init(void); extern void free_initmem(void); extern void populate_rootfs(void); @@ -421,12 +421,12 @@ asmlinkage void __init start_kernel(void build_all_zonelists(); page_alloc_init(); + trap_init(); printk("Kernel command line: %s\n", saved_command_line); parse_args("Booting kernel", command_line, __start___param, __stop___param - __start___param, &unknown_bootoption); sort_main_extable(); - trap_init(); rcu_init(); init_IRQ(); pidhash_init(); @@ -460,7 +460,7 @@ asmlinkage void __init start_kernel(void calibrate_delay(); pidmap_init(); pgtable_cache_init(); - pte_chain_init(); + anon_vma_init(); #ifdef CONFIG_X86 if (efi_enabled) efi_enter_virtual_mode(); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/ipc/shm.c xx/ipc/shm.c --- 2.6.5-rc2/ipc/shm.c 2004-03-21 15:09:25.000000000 +0100 +++ xx/ipc/shm.c 2004-03-21 15:21:40.000000000 +0100 @@ -380,9 +380,7 @@ static void shm_get_stat(unsigned long * if (is_file_hugepages(shp->shm_file)) { struct address_space *mapping = inode->i_mapping; - spin_lock(&mapping->page_lock); *rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages; - spin_unlock(&mapping->page_lock); } else { struct shmem_inode_info *info = SHMEM_I(inode); spin_lock(&info->lock); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/kernel/fork.c xx/kernel/fork.c --- 2.6.5-rc2/kernel/fork.c 2004-03-11 08:27:47.000000000 +0100 +++ xx/kernel/fork.c 2004-03-21 15:21:42.000000000 +0100 @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -326,6 +327,8 @@ static inline int dup_mmap(struct mm_str up(&file->f_mapping->i_shared_sem); } + anon_vma_link(tmp); + /* * Link in the new vma and copy the page table entries: * link in first so that swapoff can see swap entries, diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/kernel/pid.c xx/kernel/pid.c --- 2.6.5-rc2/kernel/pid.c 2004-03-11 08:27:47.000000000 +0100 +++ xx/kernel/pid.c 2004-03-21 15:21:40.000000000 +0100 @@ -268,6 +268,9 @@ void switch_exec_pids(task_t *leader, ta * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or * more. */ +#ifdef CONFIG_KGDB +int kgdb_pid_init_done; /* so we don't call prior to... 
*/ +#endif void __init pidhash_init(void) { int i, j, pidhash_size; @@ -289,6 +292,9 @@ void __init pidhash_init(void) for (j = 0; j < pidhash_size; j++) INIT_LIST_HEAD(&pid_hash[i][j]); } +#ifdef CONFIG_KGDB + kgdb_pid_init_done++; +#endif } void __init pidmap_init(void) diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/kernel/sched.c xx/kernel/sched.c --- 2.6.5-rc2/kernel/sched.c 2004-03-21 15:09:25.000000000 +0100 +++ xx/kernel/sched.c 2004-03-21 15:21:40.000000000 +0100 @@ -2014,6 +2014,13 @@ out_unlock: EXPORT_SYMBOL(set_user_nice); +#if defined( CONFIG_KGDB) +struct task_struct * kgdb_get_idle(int this_cpu) +{ + return cpu_rq(this_cpu)->idle; +} +#endif + #ifndef __alpha__ /* diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/lib/radix-tree.c xx/lib/radix-tree.c --- 2.6.5-rc2/lib/radix-tree.c 2004-03-21 15:09:25.000000000 +0100 +++ xx/lib/radix-tree.c 2004-03-21 15:21:40.000000000 +0100 @@ -6,12 +6,12 @@ * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2, or (at * your option) any later version. - * + * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. - * + * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. @@ -28,21 +28,36 @@ #include #include #include +#include /* * Radix tree node definition. + * + * RADIX_TREE_MAP_SHIFT must be >= log2(BITS_PER_LONG). Otherwise the tags + * array will have zero size and the set_tag() arithmetic will go wrong. */ -#define RADIX_TREE_MAP_SHIFT 6 -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) -#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) +#ifdef __KERNEL__ +#define RADIX_TREE_MAP_SHIFT 6 +#else +#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ +#endif +#define RADIX_TREE_TAGS 2 + +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) + +#define RADIX_TREE_TAG_LONGS \ + ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) struct radix_tree_node { unsigned int count; void *slots[RADIX_TREE_MAP_SIZE]; + unsigned long tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS]; }; struct radix_tree_path { struct radix_tree_node *node, **slot; + int offset; }; #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) @@ -124,6 +139,22 @@ out: return ret; } +static inline void tag_set(struct radix_tree_node *node, int tag, int offset) +{ + if (!test_bit(offset, &node->tags[tag][0])) + __set_bit(offset, &node->tags[tag][0]); +} + +static inline void tag_clear(struct radix_tree_node *node, int tag, int offset) +{ + __clear_bit(offset, &node->tags[tag][0]); +} + +static inline int tag_get(struct radix_tree_node *node, int tag, int offset) +{ + return test_bit(offset, &node->tags[tag][0]); +} + /* * Return the maximum key which can be store into a * radix tree with height HEIGHT. @@ -140,26 +171,53 @@ static int radix_tree_extend(struct radi { struct radix_tree_node *node; unsigned int height; + char tags[RADIX_TREE_TAGS]; + int tag; /* Figure out what the height should be. 
*/ height = root->height + 1; while (index > radix_tree_maxindex(height)) height++; - if (root->rnode) { - do { - if (!(node = radix_tree_node_alloc(root))) - return -ENOMEM; - - /* Increase the height. */ - node->slots[0] = root->rnode; - node->count = 1; - root->rnode = node; - root->height++; - } while (height > root->height); - } else + if (root->rnode == NULL) { root->height = height; + goto out; + } + /* + * Prepare the tag status of the top-level node for propagation + * into the newly-pushed top-level node(s) + */ + for (tag = 0; tag < RADIX_TREE_TAGS; tag++) { + int idx; + + tags[tag] = 0; + for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { + if (root->rnode->tags[tag][idx]) { + tags[tag] = 1; + break; + } + } + } + + do { + if (!(node = radix_tree_node_alloc(root))) + return -ENOMEM; + + /* Increase the height. */ + node->slots[0] = root->rnode; + + /* Propagate the aggregated tag info into the new root */ + for (tag = 0; tag < RADIX_TREE_TAGS; tag++) { + if (tags[tag]) + tag_set(node, tag, 0); + } + + node->count = 1; + root->rnode = node; + root->height++; + } while (height > root->height); +out: return 0; } @@ -171,23 +229,27 @@ static int radix_tree_extend(struct radi * * Insert an item into the radix tree at position @index. */ -int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) +int radix_tree_insert(struct radix_tree_root *root, + unsigned long index, void *item) { struct radix_tree_node *node = NULL, *tmp, **slot; unsigned int height, shift; + int offset; int error; /* Make sure the tree is high enough. */ - if (index > radix_tree_maxindex(root->height)) { + if ((!index && !root->rnode) || + index > radix_tree_maxindex(root->height)) { error = radix_tree_extend(root, index); if (error) return error; } - + slot = &root->rnode; height = root->height; shift = (height-1) * RADIX_TREE_MAP_SHIFT; + offset = 0; /* uninitialised var warning */ while (height > 0) { if (*slot == NULL) { /* Have to add a child node. */ @@ -198,18 +260,21 @@ int radix_tree_insert(struct radix_tree_ node->count++; } - /* Go a level down. */ + /* Go a level down */ + offset = (index >> shift) & RADIX_TREE_MAP_MASK; node = *slot; - slot = (struct radix_tree_node **) - (node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK)); + slot = (struct radix_tree_node **)(node->slots + offset); shift -= RADIX_TREE_MAP_SHIFT; height--; } if (*slot != NULL) return -EEXIST; - if (node) + if (node) { node->count++; + BUG_ON(tag_get(node, 0, offset)); + BUG_ON(tag_get(node, 1, offset)); + } *slot = item; return 0; @@ -221,7 +286,7 @@ EXPORT_SYMBOL(radix_tree_insert); * @root: radix tree root * @index: index key * - * Lookup them item at the position @index in the radix tree @root. + * Lookup the item at the position @index in the radix tree @root. */ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) { @@ -240,16 +305,174 @@ void *radix_tree_lookup(struct radix_tre return NULL; slot = (struct radix_tree_node **) - ((*slot)->slots + ((index >> shift) & RADIX_TREE_MAP_MASK)); + ((*slot)->slots + + ((index >> shift) & RADIX_TREE_MAP_MASK)); shift -= RADIX_TREE_MAP_SHIFT; height--; } - return (void *) *slot; + return *slot; } EXPORT_SYMBOL(radix_tree_lookup); -static /* inline */ unsigned int +/** + * radix_tree_tag_set - set a tag on a radix tree node + * @root: radix tree root + * @index: index key + * @tag: tag index + * + * Set the search tag corresponding to @index in the radix tree, from + * the root all the way down to the leaf node.
+ * + * Returns the address of the tagged item. Setting a tag on a not-present + * item is a bug. + */ +void *radix_tree_tag_set(struct radix_tree_root *root, + unsigned long index, int tag) +{ + unsigned int height, shift; + struct radix_tree_node **slot; + + height = root->height; + if (index > radix_tree_maxindex(height)) + return NULL; + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + slot = &root->rnode; + + while (height > 0) { + int offset; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + tag_set(*slot, tag, offset); + slot = (struct radix_tree_node **)((*slot)->slots + offset); + BUG_ON(*slot == NULL); + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + return *slot; +} +EXPORT_SYMBOL(radix_tree_tag_set); + +/** + * radix_tree_tag_clear - clear a tag on a radix tree node + * @root: radix tree root + * @index: index key + * @tag: tag index + * + * Clear the search tag corresponding to @index in the radix tree. If + * this causes the leaf node to have no tags set then clear the tag in the + * next-to-leaf node, etc. + * + * Returns the address of the tagged item on success, else NULL, i.e. it + * has the same return value and semantics as radix_tree_lookup(). + */ +void *radix_tree_tag_clear(struct radix_tree_root *root, + unsigned long index, int tag) +{ + struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; + unsigned int height, shift; + void *ret = NULL; + + height = root->height; + if (index > radix_tree_maxindex(height)) + goto out; + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + pathp->node = NULL; + pathp->slot = &root->rnode; + + while (height > 0) { + int offset; + + if (*pathp->slot == NULL) + goto out; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + pathp[1].offset = offset; + pathp[1].node = *pathp[0].slot; + pathp[1].slot = (struct radix_tree_node **) + (pathp[1].node->slots + offset); + pathp++; + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + ret = *pathp[0].slot; + if (ret == NULL) + goto out; + + do { + int idx; + + tag_clear(pathp[0].node, tag, pathp[0].offset); + for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { + if (pathp[0].node->tags[tag][idx]) + goto out; + } + pathp--; + } while (pathp[0].node); +out: + return ret; +} +EXPORT_SYMBOL(radix_tree_tag_clear); + +#ifndef __KERNEL__ /* Only the test harness uses this at present */ +/** + * radix_tree_tag_get - get a tag on a radix tree node + * @root: radix tree root + * @index: index key + * @tag: tag index + * + * Return the search tag corresponding to @index in the radix tree. + * + * Returns zero if the tag is unset, or if there is no corresponding item + * in the tree. + */ +int radix_tree_tag_get(struct radix_tree_root *root, + unsigned long index, int tag) +{ + unsigned int height, shift; + struct radix_tree_node **slot; + int saw_unset_tag = 0; + + height = root->height; + if (index > radix_tree_maxindex(height)) + return 0; + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + slot = &root->rnode; + + for ( ; ; ) { + int offset; + + if (*slot == NULL) + return 0; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + + /* + * This is just a debug check. Later, we can bail as soon as + * we see an unset tag.
+ */ + if (!tag_get(*slot, tag, offset)) + saw_unset_tag = 1; + if (height == 1) { + int ret = tag_get(*slot, tag, offset); + + BUG_ON(ret && saw_unset_tag); + return ret; + } + slot = (struct radix_tree_node **)((*slot)->slots + offset); + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } +} +EXPORT_SYMBOL(radix_tree_tag_get); +#endif + +static unsigned int __lookup(struct radix_tree_root *root, void **results, unsigned long index, unsigned int max_items, unsigned long *next_index) { @@ -316,17 +539,6 @@ radix_tree_gang_lookup(struct radix_tree unsigned long cur_index = first_index; unsigned int ret = 0; - if (root->rnode == NULL) - goto out; - if (max_index == 0) { /* Bah. Special case */ - if (first_index == 0) { - if (max_items > 0) { - *results = root->rnode; - ret = 1; - } - } - goto out; - } while (ret < max_items) { unsigned int nr_found; unsigned long next_index; /* Index of next search */ @@ -340,11 +552,101 @@ radix_tree_gang_lookup(struct radix_tree break; cur_index = next_index; } -out: return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup); +/* + * FIXME: the two tag_get()s here should use find_next_bit() instead of + * open-coding the search. + */ +static unsigned int +__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index, + unsigned int max_items, unsigned long *next_index, int tag) +{ + unsigned int nr_found = 0; + unsigned int shift; + unsigned int height = root->height; + struct radix_tree_node *slot; + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + slot = root->rnode; + + while (height > 0) { + unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK; + + for ( ; i < RADIX_TREE_MAP_SIZE; i++) { + if (tag_get(slot, tag, i)) { + BUG_ON(slot->slots[i] == NULL); + break; + } + index &= ~((1 << shift) - 1); + index += 1 << shift; + if (index == 0) + goto out; /* 32-bit wraparound */ + } + if (i == RADIX_TREE_MAP_SIZE) + goto out; + height--; + if (height == 0) { /* Bottom level: grab some items */ + unsigned long j = index & RADIX_TREE_MAP_MASK; + + for ( ; j < RADIX_TREE_MAP_SIZE; j++) { + index++; + if (tag_get(slot, tag, j)) { + BUG_ON(slot->slots[j] == NULL); + results[nr_found++] = slot->slots[j]; + if (nr_found == max_items) + goto out; + } + } + } + shift -= RADIX_TREE_MAP_SHIFT; + slot = slot->slots[i]; + } +out: + *next_index = index; + return nr_found; +} + +/** + * radix_tree_gang_lookup_tag - perform multiple lookups on a radix tree + * based on a tag + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * @tag: the tag index + * + * Performs an index-ascending scan of the tree for present items which + * have the tag indexed by @tag set. Places the items at *@results and + * returns the number of items which were placed at *@results.
+ */ +unsigned int +radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned int max_items, int tag) +{ + const unsigned long max_index = radix_tree_maxindex(root->height); + unsigned long cur_index = first_index; + unsigned int ret = 0; + + while (ret < max_items) { + unsigned int nr_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + nr_found = __lookup_tag(root, results + ret, cur_index, + max_items - ret, &next_index, tag); + ret += nr_found; + if (next_index == 0) + break; + cur_index = next_index; + } + return ret; +} +EXPORT_SYMBOL(radix_tree_gang_lookup_tag); + /** * radix_tree_delete - delete an item from a radix tree * @root: radix tree root @@ -357,24 +659,31 @@ EXPORT_SYMBOL(radix_tree_gang_lookup); void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) { struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; + struct radix_tree_path *orig_pathp; unsigned int height, shift; void *ret = NULL; + char tags[RADIX_TREE_TAGS]; + int nr_cleared_tags; height = root->height; if (index > radix_tree_maxindex(height)) goto out; - shift = (height-1) * RADIX_TREE_MAP_SHIFT; + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; pathp->node = NULL; pathp->slot = &root->rnode; while (height > 0) { + int offset; + if (*pathp->slot == NULL) goto out; + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + pathp[1].offset = offset; pathp[1].node = *pathp[0].slot; pathp[1].slot = (struct radix_tree_node **) - (pathp[1].node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK)); + (pathp[1].node->slots + offset); pathp++; shift -= RADIX_TREE_MAP_SHIFT; height--; @@ -384,20 +693,67 @@ void *radix_tree_delete(struct radix_tre if (ret == NULL) goto out; + orig_pathp = pathp; + + /* + * Clear all tags associated with the just-deleted item + */ + memset(tags, 0, sizeof(tags)); + do { + int tag; + + nr_cleared_tags = RADIX_TREE_TAGS; + for (tag = 0; tag < RADIX_TREE_TAGS; tag++) { + int idx; + + if (!tags[tag]) + tag_clear(pathp[0].node, tag, pathp[0].offset); + + for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { + if (pathp[0].node->tags[tag][idx]) { + tags[tag] = 1; + nr_cleared_tags--; + break; + } + } + } + pathp--; + } while (pathp[0].node && nr_cleared_tags); + + pathp = orig_pathp; *pathp[0].slot = NULL; while (pathp[0].node && --pathp[0].node->count == 0) { pathp--; + BUG_ON(*pathp[0].slot == NULL); *pathp[0].slot = NULL; radix_tree_node_free(pathp[1].node); } - if (root->rnode == NULL) - root->height = 0; /* Empty tree, we can reset the height */ + root->height = 0; out: return ret; } EXPORT_SYMBOL(radix_tree_delete); +/** + * radix_tree_tagged - test whether any items in the tree are tagged + * @root: radix tree root + * @tag: tag to test + */ +int radix_tree_tagged(struct radix_tree_root *root, int tag) +{ + int idx; + + if (!root->rnode) + return 0; + for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { + if (root->rnode->tags[tag][idx]) + return 1; + } + return 0; +} +EXPORT_SYMBOL(radix_tree_tagged); + static void radix_tree_node_ctor(void *node, kmem_cache_t *cachep, unsigned long flags) { diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/MAINTAINERS xx/MAINTAINERS --- 2.6.5-rc2/MAINTAINERS 2004-03-21 15:09:23.000000000 +0100 +++ xx/MAINTAINERS 2004-03-21 15:21:40.000000000 +0100 @@ -1186,6 +1186,12 @@ W: http://sf.net/projects/kernel-janitor W: http://developer.osdl.org/rddunlap/kj-patches/ S: Maintained +KGDB FOR I386 PLATFORM 
+P: George Anzinger +M: george@mvista.com +L: linux-net@vger.kernel.org +S: Supported + KERNEL NFSD P: Neil Brown M: neilb@cse.unsw.edu.au diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/Makefile xx/Makefile --- 2.6.5-rc2/Makefile 2004-03-21 15:09:23.000000000 +0100 +++ xx/Makefile 2004-03-21 15:21:40.000000000 +0100 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 5 -EXTRAVERSION =-rc2 +EXTRAVERSION =-rc2-aa1 NAME=Feisty Dunnart # *DOCUMENTATION* @@ -449,6 +449,7 @@ endif ifdef CONFIG_DEBUG_INFO CFLAGS += -g +AFLAGS += -g endif # warn about C99 declaration after statement diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/filemap.c xx/mm/filemap.c --- 2.6.5-rc2/mm/filemap.c 2004-03-21 15:09:25.000000000 +0100 +++ xx/mm/filemap.c 2004-03-21 15:21:42.000000000 +0100 @@ -59,11 +59,14 @@ * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_list_lock * ->swap_device_lock (exclusive_swap_page, others) - * ->mapping->page_lock + * ->mapping->tree_lock * * ->i_sem * ->i_shared_sem (truncate->invalidate_mmap_range) * + * ->lock_page + * ->i_shared_sem (page_convert_anon) + * * ->mmap_sem * ->i_shared_sem (various places) * @@ -75,12 +78,12 @@ * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) - * ->mapping->page_lock (__sync_single_inode) + * ->mapping->tree_lock (__sync_single_inode) * * ->page_table_lock * ->swap_device_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) - * ->page_lock (try_to_unmap_one) + * ->tree_lock (try_to_unmap_one) * ->zone.lru_lock (follow_page->mark_page_accessed) * * ->task->proc_lock @@ -90,35 +93,39 @@ /* * Remove a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold a write_lock on the mapping's page_lock. + * is safe. The caller must hold a write_lock on the mapping's tree_lock. 
*/ void __remove_from_page_cache(struct page *page) { - struct address_space *mapping = page->mapping; - - radix_tree_delete(&mapping->page_tree, page->index); - list_del(&page->list); - page->mapping = NULL; + struct address_space *mapping = page_mapping(page); + if (likely(!PageSwapCache(page))) { + BUG_ON(PageAnon(page)); + radix_tree_delete(&mapping->page_tree, page->index); + page->mapping = NULL; + } else { + radix_tree_delete(&mapping->page_tree, page->private); + ClearPageSwapCache(page); + } mapping->nrpages--; pagecache_acct(-1); } void remove_from_page_cache(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (unlikely(!PageLocked(page))) PAGE_BUG(page); - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); } static inline int sync_page(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping && mapping->a_ops && mapping->a_ops->sync_page) return mapping->a_ops->sync_page(page); @@ -145,9 +152,6 @@ static int __filemap_fdatawrite(struct a if (mapping->backing_dev_info->memory_backed) return 0; - spin_lock(&mapping->page_lock); - list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); ret = do_writepages(mapping, &wbc); return ret; } @@ -156,7 +160,6 @@ int filemap_fdatawrite(struct address_sp { return __filemap_fdatawrite(mapping, WB_SYNC_ALL); } - EXPORT_SYMBOL(filemap_fdatawrite); /* @@ -167,55 +170,40 @@ int filemap_flush(struct address_space * { return __filemap_fdatawrite(mapping, WB_SYNC_NONE); } - EXPORT_SYMBOL(filemap_flush); -/** - * filemap_fdatawait - walk the list of locked pages of the given address - * space and wait for all of them. 
- * @mapping: address space structure to wait for +/* + * Wait for writeback to complete against pages indexed by start->end + * inclusive */ -int filemap_fdatawait(struct address_space * mapping) +static int wait_on_page_writeback_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) { + struct pagevec pvec; + int nr_pages; int ret = 0; - int progress; - -restart: - progress = 0; - spin_lock(&mapping->page_lock); - while (!list_empty(&mapping->locked_pages)) { - struct page *page; + pgoff_t index; - page = list_entry(mapping->locked_pages.next,struct page,list); - list_del(&page->list); - if (PageDirty(page)) - list_add(&page->list, &mapping->dirty_pages); - else - list_add(&page->list, &mapping->clean_pages); + if (end < start) + return 0; - if (!PageWriteback(page)) { - if (++progress > 32) { - if (need_resched()) { - spin_unlock(&mapping->page_lock); - __cond_resched(); - goto restart; - } - } - continue; + pagevec_init(&pvec, 0); + index = start; + while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; } - - progress = 0; - page_cache_get(page); - spin_unlock(&mapping->page_lock); - - wait_on_page_writeback(page); - if (PageError(page)) - ret = -EIO; - - page_cache_release(page); - spin_lock(&mapping->page_lock); + pagevec_release(&pvec); + cond_resched(); } - spin_unlock(&mapping->page_lock); /* Check for outstanding write errors */ if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) @@ -226,6 +214,17 @@ restart: return ret; } +/** + * filemap_fdatawait - walk the list of under-writeback pages of the given + * address space and wait for all of them. + * + * @mapping: address space structure to wait for + */ +int filemap_fdatawait(struct address_space *mapping) +{ + return wait_on_page_writeback_range(mapping, 0, -1); +} + EXPORT_SYMBOL(filemap_fdatawait); /* @@ -252,7 +251,7 @@ int add_to_page_cache(struct page *page, if (error == 0) { page_cache_get(page); - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (!error) { SetPageLocked(page); @@ -260,7 +259,7 @@ int add_to_page_cache(struct page *page, } else { page_cache_release(page); } - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } return error; @@ -348,8 +347,7 @@ void end_page_writeback(struct page *pag wait_queue_head_t *waitqueue = page_waitqueue(page); if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) { - smp_mb__before_clear_bit(); - if (!TestClearPageWriteback(page)) + if (!test_clear_page_writeback(page)) BUG(); smp_mb__after_clear_bit(); } @@ -396,11 +394,11 @@ struct page * find_get_page(struct addre * We scan the hash list read-only. Addition to and removal from * the hash-list needs a held write-lock. 
*/ - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page) page_cache_get(page); - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); return page; } @@ -413,11 +411,11 @@ struct page *find_trylock_page(struct ad { struct page *page; - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page && TestSetPageLocked(page)) page = NULL; - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); return page; } @@ -439,17 +437,18 @@ struct page *find_lock_page(struct addre { struct page *page; - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); repeat: page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); if (TestSetPageLocked(page)) { - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); lock_page(page); - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); /* Has the page been truncated while we slept? */ + BUG_ON(PageAnon(page)); if (page->mapping != mapping || page->index != offset) { unlock_page(page); page_cache_release(page); @@ -457,7 +456,7 @@ repeat: } } } - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); return page; } @@ -525,18 +524,39 @@ EXPORT_SYMBOL(find_or_create_page); * * find_get_pages() returns the number of pages which were found. */ -unsigned int find_get_pages(struct address_space *mapping, pgoff_t start, +unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages) { unsigned int i; unsigned int ret; - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); ret = radix_tree_gang_lookup(&mapping->page_tree, (void **)pages, start, nr_pages); for (i = 0; i < ret; i++) page_cache_get(pages[i]); - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); + return ret; +} + +/* + * Like find_get_pages, except we only return pages which are tagged with + * `tag'. We update *index to index the next page for the traversal. + */ +unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, + int tag, unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + unsigned int ret; + + spin_lock_irq(&mapping->tree_lock); + ret = radix_tree_gang_lookup_tag(&mapping->page_tree, + (void **)pages, *index, nr_pages, tag); + for (i = 0; i < ret; i++) + page_cache_get(pages[i]); + if (ret) + *index = pages[ret - 1]->index + 1; + spin_unlock_irq(&mapping->tree_lock); return ret; } @@ -667,7 +687,7 @@ page_not_up_to_date: lock_page(page); /* Did it get unhashed before we got the lock?
*/ - if (!page->mapping) { + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); continue; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/fremap.c xx/mm/fremap.c --- 2.6.5-rc2/mm/fremap.c 2004-03-11 08:27:47.000000000 +0100 +++ xx/mm/fremap.c 2004-03-21 15:21:42.000000000 +0100 @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include @@ -36,7 +36,7 @@ static inline void zap_pte(struct mm_str if (!PageReserved(page)) { if (pte_dirty(pte)) set_page_dirty(page); - page_remove_rmap(page, ptep); + page_remove_rmap(page); page_cache_release(page); mm->rss--; } @@ -60,11 +60,7 @@ int install_page(struct mm_struct *mm, s pgd_t *pgd; pmd_t *pmd; pte_t pte_val; - struct pte_chain *pte_chain; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto err; pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); @@ -81,18 +77,14 @@ int install_page(struct mm_struct *mm, s mm->rss++; flush_icache_page(vma, page); set_pte(pte, mk_pte(page, prot)); - pte_chain = page_add_rmap(page, pte, pte_chain); + page_add_rmap(page, vma, addr, 0); pte_val = *pte; pte_unmap(pte); update_mmu_cache(vma, addr, pte_val); - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return 0; + err = 0; err_unlock: spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); -err: return err; } EXPORT_SYMBOL(install_page); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/Makefile xx/mm/Makefile --- 2.6.5-rc2/mm/Makefile 2003-09-17 04:09:59.000000000 +0200 +++ xx/mm/Makefile 2004-03-21 15:21:42.000000000 +0100 @@ -4,7 +4,7 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ - mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ + mlock.o mmap.o mprotect.o mremap.o msync.o objrmap.o \ shmem.o vmalloc.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/memory.c xx/mm/memory.c --- 2.6.5-rc2/mm/memory.c 2004-03-21 15:09:25.000000000 +0100 +++ xx/mm/memory.c 2004-03-21 15:21:42.000000000 +0100 @@ -43,12 +43,11 @@ #include #include #include -#include +#include #include #include #include -#include #include #include #include @@ -105,7 +104,7 @@ static inline void free_one_pmd(struct m } page = pmd_page(*dir); pmd_clear(dir); - pgtable_remove_rmap(page); + dec_page_state(nr_page_table_pages); pte_free_tlb(tlb, page); } @@ -164,7 +163,7 @@ pte_t fastcall * pte_alloc_map(struct mm pte_free(new); goto out; } - pgtable_add_rmap(new, mm, address); + inc_page_state(nr_page_table_pages); pmd_populate(mm, pmd, new); } out: @@ -190,7 +189,6 @@ pte_t fastcall * pte_alloc_kernel(struct pte_free_kernel(new); goto out; } - pgtable_add_rmap(virt_to_page(new), mm, address); pmd_populate_kernel(mm, pmd, new); } out: @@ -217,20 +215,10 @@ int copy_page_range(struct mm_struct *ds unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; unsigned long cow; - struct pte_chain *pte_chain = NULL; if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst, src, vma); - pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN); - if (!pte_chain) { - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - } - cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; 
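/*
 * Editor's sketch, not part of the patch: the objrmap idiom the
 * memory.c hunks below convert to.  page_add_rmap() now takes the
 * vma, the faulting virtual address and an "anon" flag instead of
 * a pte_chain, and an anonymous vma must have its anon_vma
 * prepared before the first page is mapped into it.  Locking, rss
 * accounting and LRU insertion are elided; all names follow the
 * surrounding patch.
 */
static int sketch_map_anon_page(struct vm_area_struct *vma,
				unsigned long addr, pte_t *pte)
{
	struct page *page;

	if (anon_vma_prepare(vma))	/* may allocate an anon_vma */
		return VM_FAULT_OOM;
	page = alloc_page(GFP_HIGHUSER);
	if (!page)
		return VM_FAULT_OOM;
	clear_user_highpage(page, addr);
	set_pte(pte, maybe_mkwrite(pte_mkdirty(mk_pte(page,
					vma->vm_page_prot)), vma));
	page_add_rmap(page, vma, addr, 1);	/* 1 == anonymous */
	return VM_FAULT_MINOR;
}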
@@ -299,7 +287,7 @@ skip_copy_pte_range: pfn = pte_pfn(pte); /* the pte points outside of valid memory, the * mapping is assumed to be good, meaningful - * and not mapped via rmap - duplicate the + * and not mapped via objrmap - duplicate the * mapping as is. */ page = NULL; @@ -331,30 +319,31 @@ skip_copy_pte_range: dst->rss++; set_pte(dst_pte, pte); - pte_chain = page_add_rmap(page, dst_pte, - pte_chain); - if (pte_chain) - goto cont_copy_pte_range_noset; - pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN); - if (pte_chain) - goto cont_copy_pte_range_noset; + if (likely(!(vma->vm_flags & VM_RESERVED))) { + /* + * Device driver pages must not be + * tracked by the VM for unmapping. + */ + BUG_ON(!page_mapped(page)); + BUG_ON(!page->mapping); + page_add_rmap(page, vma, address, PageAnon(page)); + } else { + BUG_ON(page_mapped(page)); + BUG_ON(page->mapping); + } - /* - * pte_chain allocation failed, and we need to - * run page reclaim. - */ - pte_unmap_nested(src_pte); - pte_unmap(dst_pte); - spin_unlock(&src->page_table_lock); - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - spin_lock(&src->page_table_lock); - dst_pte = pte_offset_map(dst_pmd, address); - src_pte = pte_offset_map_nested(src_pmd, - address); + if (need_resched()) { + pte_unmap_nested(src_pte); + pte_unmap(dst_pte); + spin_unlock(&src->page_table_lock); + spin_unlock(&dst->page_table_lock); + __cond_resched(); + spin_lock(&dst->page_table_lock); + spin_lock(&src->page_table_lock); + dst_pte = pte_offset_map(dst_pmd, address); + src_pte = pte_offset_map_nested(src_pmd, + address); + } cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) { @@ -377,10 +366,9 @@ cont_copy_pmd_range: out_unlock: spin_unlock(&src->page_table_lock); out: - pte_chain_free(pte_chain); return 0; + nomem: - pte_chain_free(pte_chain); return -ENOMEM; } @@ -417,11 +405,11 @@ zap_pte_range(struct mmu_gather *tlb, pm if (!PageReserved(page)) { if (pte_dirty(pte)) set_page_dirty(page); - if (page->mapping && pte_young(pte) && + if (page_mapping(page) && pte_young(pte) && !PageSwapCache(page)) mark_page_accessed(page); tlb->freed++; - page_remove_rmap(page, ptep); + page_remove_rmap(page); tlb_remove_page(tlb, page); } } @@ -1014,7 +1002,6 @@ static int do_wp_page(struct mm_struct * { struct page *old_page, *new_page; unsigned long pfn = pte_pfn(pte); - struct pte_chain *pte_chain; pte_t entry; if (unlikely(!pfn_valid(pfn))) { @@ -1053,9 +1040,9 @@ static int do_wp_page(struct mm_struct * page_cache_get(old_page); spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_pte_chain; + if (unlikely(anon_vma_prepare(vma))) + goto no_new_page; + new_page = alloc_page(GFP_HIGHUSER); if (!new_page) goto no_new_page; @@ -1066,12 +1053,12 @@ static int do_wp_page(struct mm_struct * */ spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); - if (pte_same(*page_table, pte)) { + if (likely(pte_same(*page_table, pte))) { if (PageReserved(old_page)) ++mm->rss; - page_remove_rmap(old_page, page_table); + page_remove_rmap(old_page); break_cow(vma, new_page, address, page_table); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); + page_add_rmap(new_page, vma, address, 1); lru_cache_add_active(new_page); /* Free the old page.. 
*/ @@ -1081,12 +1068,9 @@ static int do_wp_page(struct mm_struct * page_cache_release(new_page); page_cache_release(old_page); spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); return VM_FAULT_MINOR; no_new_page: - pte_chain_free(pte_chain); -no_pte_chain: page_cache_release(old_page); return VM_FAULT_OOM; } @@ -1243,11 +1227,14 @@ static int do_swap_page(struct mm_struct struct page *page; swp_entry_t entry = pte_to_swp_entry(orig_pte); pte_t pte; - int ret = VM_FAULT_MINOR; - struct pte_chain *pte_chain = NULL; + int ret; pte_unmap(page_table); spin_unlock(&mm->page_table_lock); + + BUG_ON(!vma->anon_vma); + + ret = VM_FAULT_MINOR; page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry); @@ -1259,7 +1246,7 @@ static int do_swap_page(struct mm_struct */ spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); - if (pte_same(*page_table, orig_pte)) + if (likely(pte_same(*page_table, orig_pte))) ret = VM_FAULT_OOM; else ret = VM_FAULT_MINOR; @@ -1274,11 +1261,6 @@ static int do_swap_page(struct mm_struct } mark_page_accessed(page); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { - ret = VM_FAULT_OOM; - goto out; - } lock_page(page); /* @@ -1287,7 +1269,7 @@ static int do_swap_page(struct mm_struct */ spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); - if (!pte_same(*page_table, orig_pte)) { + if (unlikely(!pte_same(*page_table, orig_pte))) { pte_unmap(page_table); spin_unlock(&mm->page_table_lock); unlock_page(page); @@ -1310,14 +1292,13 @@ static int do_swap_page(struct mm_struct flush_icache_page(vma, page); set_pte(page_table, pte); - pte_chain = page_add_rmap(page, page_table, pte_chain); + page_add_rmap(page, vma, address, 1); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); out: - pte_chain_free(pte_chain); return ret; } @@ -1333,20 +1314,8 @@ do_anonymous_page(struct mm_struct *mm, { pte_t entry; struct page * page = ZERO_PAGE(addr); - struct pte_chain *pte_chain; - int ret; + int ret, anon = 0; - pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN); - if (!pte_chain) { - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_mem; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); - } - /* Read-only mapping of ZERO_PAGE. 
*/ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); @@ -1356,9 +1325,12 @@ do_anonymous_page(struct mm_struct *mm, pte_unmap(page_table); spin_unlock(&mm->page_table_lock); + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + page = alloc_page(GFP_HIGHUSER); - if (!page) - goto no_mem; + if (unlikely(!page)) + return VM_FAULT_OOM; clear_user_highpage(page, addr); spin_lock(&mm->page_table_lock); @@ -1368,8 +1340,7 @@ do_anonymous_page(struct mm_struct *mm, pte_unmap(page_table); page_cache_release(page); spin_unlock(&mm->page_table_lock); - ret = VM_FAULT_MINOR; - goto out; + return VM_FAULT_MINOR; } mm->rss++; entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, @@ -1377,23 +1348,20 @@ do_anonymous_page(struct mm_struct *mm, vma); lru_cache_add_active(page); mark_page_accessed(page); + anon = 1; } set_pte(page_table, entry); - /* ignores ZERO_PAGE */ - pte_chain = page_add_rmap(page, page_table, pte_chain); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MINOR; - goto out; -no_mem: - ret = VM_FAULT_OOM; -out: - pte_chain_free(pte_chain); + /* ignores ZERO_PAGE */ + page_add_rmap(page, vma, addr, anon); + return ret; } @@ -1416,8 +1384,7 @@ do_no_page(struct mm_struct *mm, struct struct page * new_page; struct address_space *mapping = NULL; pte_t entry; - struct pte_chain *pte_chain; - int sequence = 0; + int sequence = 0, reserved, anon, pageable, as; int ret = VM_FAULT_MINOR; if (!vma->vm_ops || !vma->vm_ops->nopage) @@ -1440,21 +1407,46 @@ retry: if (new_page == NOPAGE_OOM) return VM_FAULT_OOM; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto oom; +#ifndef CONFIG_DISCONTIGMEM + /* this check is unreliable with numa enabled */ + BUG_ON(!pfn_valid(page_to_pfn(new_page))); +#endif + pageable = !PageReserved(new_page); + as = !!new_page->mapping; + + BUG_ON(!pageable && as); + + pageable &= as; + + /* ->nopage cannot return swapcache */ + BUG_ON(PageSwapCache(new_page)); + /* ->nopage cannot return anonymous pages */ + BUG_ON(PageAnon(new_page)); + + /* + * This is the entry point for memory under VM_RESERVED vmas. + * That memory will not be tracked by the vm. These aren't + * real anonymous pages, they're "device" reserved pages instead. + */ + reserved = !!(vma->vm_flags & VM_RESERVED); + WARN_ON(reserved == pageable); /* * Should we do an early C-O-W break? */ + anon = 0; if (write_access && !(vma->vm_flags & VM_SHARED)) { - struct page * page = alloc_page(GFP_HIGHUSER); + struct page * page; + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_page(GFP_HIGHUSER); if (!page) goto oom; copy_user_highpage(page, new_page, address); page_cache_release(new_page); lru_cache_add_active(page); new_page = page; + anon = 1; } spin_lock(&mm->page_table_lock); @@ -1468,7 +1460,6 @@ retry: sequence = atomic_read(&mapping->truncate_count); spin_unlock(&mm->page_table_lock); page_cache_release(new_page); - pte_chain_free(pte_chain); goto retry; } page_table = pte_offset_map(pmd, address); @@ -1492,7 +1483,8 @@ retry: if (write_access) entry = maybe_mkwrite(pte_mkdirty(entry), vma); set_pte(page_table, entry); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); + if (likely(pageable)) + page_add_rmap(new_page, vma, address, anon); pte_unmap(page_table); } else { /* One of our sibling threads was faster, back out. 
*/ @@ -1505,13 +1497,13 @@ retry: /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); spin_unlock(&mm->page_table_lock); - goto out; -oom: + out: + return ret; + + oom: page_cache_release(new_page); ret = VM_FAULT_OOM; -out: - pte_chain_free(pte_chain); - return ret; + goto out; } /* diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/mmap.c xx/mm/mmap.c --- 2.6.5-rc2/mm/mmap.c 2004-03-21 15:09:25.000000000 +0100 +++ xx/mm/mmap.c 2004-03-21 15:21:42.000000000 +0100 @@ -6,6 +6,16 @@ * Address space accounting code */ +/* + * All modifications to vm_start/vm_pgoff must happen + * under the semaphore (for file mappings) and under the + * anon_vma->anon_vma_lock (for anon mappings), to serialize + * against truncate and other objrmap users. See move_vma_start. + * + * We take the page_table_lock then the PG_maplock and finally + * the anon_vma_lock (fork requires that ordering). + */ + #include #include #include @@ -21,6 +31,7 @@ #include #include #include +#include #include #include @@ -63,7 +74,7 @@ EXPORT_SYMBOL(vm_committed_space); /* * Requires inode->i_mapping->i_shared_sem */ -static inline void +static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode) { if (inode) { @@ -272,6 +283,7 @@ __vma_link(struct mm_struct *mm, struct __vma_link_list(mm, vma, prev, rb_parent); __vma_link_rb(mm, vma, rb_link, rb_parent); __vma_link_file(vma); + __anon_vma_link(vma); } static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, @@ -285,9 +297,9 @@ static void vma_link(struct mm_struct *m if (mapping) down(&mapping->i_shared_sem); - spin_lock(&mm->page_table_lock); + anon_vma_lock(vma); __vma_link(mm, vma, prev, rb_link, rb_parent); - spin_unlock(&mm->page_table_lock); + anon_vma_unlock(vma); if (mapping) up(&mapping->i_shared_sem); @@ -337,6 +349,17 @@ static inline int is_mergeable_vma(struc } /* + * Requires that the relevant i_shared_sem and anon_vma_lock + * be held by the caller. + */ +static void move_vma_start(struct vm_area_struct *vma, unsigned long addr) +{ + /* we must update pgoff even if no vm_file for the anon_vma */ + vma->vm_pgoff += (long) (addr - vma->vm_start) >> PAGE_SHIFT; + vma->vm_start = addr; +} + +/* * Return true if we can merge this (vm_flags,file,vm_pgoff,size) * in front of (at a lower virtual address and file offset than) the vma. * * We don't check here for the merged mmap wrapping around the end of pagecache * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which * wrap, nor mmaps which cover the final page at index -1UL. */ static int -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct file *file, unsigned long vm_pgoff, unsigned long size) -{ - if (is_mergeable_vma(vma, file, vm_flags)) { - if (!file) - return 1; /* anon mapping */ - if (vma->vm_pgoff == vm_pgoff + size) +can_vma_merge_before(struct vm_area_struct *prev, + struct vm_area_struct *vma, unsigned long vm_flags, + struct file *file, unsigned long vm_pgoff, unsigned long size) +{ + if (is_mergeable_vma(vma, file, vm_flags)) + if (vma->vm_pgoff == vm_pgoff + size) { + if (prev) { + /* + * We can fill a hole only if the two + * anonymous mappings are queued in the same + * anon_vma, or if one of them is "direct" + * and it can be queued in the existing + * anon_vma. + * + * Must check this even if file != NULL + * for MAP_PRIVATE mappings. 
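The merge rule described in the comment above boils down to a tiny predicate; stated on its own it is easier to see that a missing anon_vma acts as a wildcard (anon_vma_compatible is a hypothetical helper, not part of the patch):

	/*
	 * Two adjacent ranges may be fused only if their anon_vmas cannot
	 * disagree: either they match, or at least one side has none yet
	 * and can simply adopt the other's.
	 */
	static int anon_vma_compatible(struct vm_area_struct *prev,
				       struct vm_area_struct *vma)
	{
		return !vma->anon_vma || !prev->anon_vma ||
			vma->anon_vma == prev->anon_vma;
	}

This is exactly the expression can_vma_merge_before() returns below.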
+ */ + return ((!vma->anon_vma || !prev->anon_vma) || + (vma->anon_vma == prev->anon_vma)); + } return 1; - } + } return 0; } @@ -368,9 +404,6 @@ can_vma_merge_after(struct vm_area_struc if (is_mergeable_vma(vma, file, vm_flags)) { unsigned long vma_size; - if (!file) - return 1; /* anon mapping */ - vma_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; if (vma->vm_pgoff + vma_size == vm_pgoff) return 1; @@ -388,7 +421,6 @@ static int vma_merge(struct mm_struct *m unsigned long end, unsigned long vm_flags, struct file *file, unsigned long pgoff) { - spinlock_t *lock = &mm->page_table_lock; struct inode *inode = file ? file->f_dentry->d_inode : NULL; struct semaphore *i_shared_sem; @@ -410,32 +442,43 @@ static int vma_merge(struct mm_struct *m * Can it merge with the predecessor? */ if (prev->vm_end == addr && - is_mergeable_vma(prev, file, vm_flags) && can_vma_merge_after(prev, vm_flags, file, pgoff)) { struct vm_area_struct *next; - int need_up = 0; - if (unlikely(file && prev->vm_next && - prev->vm_next->vm_file == file)) { - down(i_shared_sem); - need_up = 1; - } - spin_lock(lock); + /* + * This can happen outside the i_shared_sem and outside + * the anon_vma_lock since it only enlarges the size of + * the vma: there are no ptes mapped in this new extended + * region anyway. + */ prev->vm_end = end; /* * OK, it did. Can we now merge in the successor as well? */ next = prev->vm_next; + /* next cannot change under us, it's serialized by the mmap_sem */ if (next && prev->vm_end == next->vm_start && can_vma_merge_before(prev, next, vm_flags, file, pgoff, (end - addr) >> PAGE_SHIFT)) { + /* + * the vm_end extension on the right can happen as usual + * outside the i_shared_sem/anon_vma_lock. + */ prev->vm_end = next->vm_end; + + /* serialized by the mmap_sem */ __vma_unlink(mm, next, prev); + + if (file) + down(i_shared_sem); __remove_shared_vm_struct(next, inode); - spin_unlock(lock); - if (need_up) + if (file) up(i_shared_sem); + + /* the anon_vma_lock is taken inside */ + anon_vma_merge(prev, next); + if (file) fput(file); @@ -443,9 +486,6 @@ static int vma_merge(struct mm_struct *m kmem_cache_free(vm_area_cachep, next); return 1; } - spin_unlock(lock); - if (need_up) - up(i_shared_sem); return 1; } @@ -455,16 +495,15 @@ static int vma_merge(struct mm_struct *m prev = prev->vm_next; if (prev) { merge_next: - if (!can_vma_merge_before(prev, vm_flags, file, + if (!can_vma_merge_before(NULL, prev, vm_flags, file, pgoff, (end - addr) >> PAGE_SHIFT)) return 0; if (end == prev->vm_start) { if (file) down(i_shared_sem); - spin_lock(lock); - prev->vm_start = addr; - prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; - spin_unlock(lock); + anon_vma_lock(prev); + move_vma_start(prev, addr); + anon_vma_unlock(prev); if (file) up(i_shared_sem); return 1; @@ -583,7 +622,8 @@ unsigned long do_mmap_pgoff(struct file return -EINVAL; case MAP_PRIVATE: vm_flags &= ~(VM_SHARED | VM_MAYSHARE); - /* fall through */ + pgoff = addr >> PAGE_SHIFT; + break; case MAP_SHARED: break; } @@ -626,7 +666,7 @@ munmap_back: /* Can we just expand an old anonymous mapping? 
*/ if (!file && !(vm_flags & VM_SHARED) && rb_parent) if (vma_merge(mm, prev, rb_parent, addr, addr + len, - vm_flags, NULL, 0)) + vm_flags, NULL, pgoff)) goto out; /* @@ -650,6 +690,7 @@ munmap_back: vma->vm_private_data = NULL; vma->vm_next = NULL; INIT_LIST_HEAD(&vma->shared); + vma->anon_vma = NULL; if (file) { error = -EINVAL; @@ -923,19 +964,16 @@ int expand_stack(struct vm_area_struct * */ address += 4 + PAGE_SIZE - 1; address &= PAGE_MASK; - spin_lock(&vma->vm_mm->page_table_lock); grow = (address - vma->vm_end) >> PAGE_SHIFT; /* Overcommit.. */ if (security_vm_enough_memory(grow)) { - spin_unlock(&vma->vm_mm->page_table_lock); return -ENOMEM; } if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur || ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { - spin_unlock(&vma->vm_mm->page_table_lock); vm_unacct_memory(grow); return -ENOMEM; } @@ -943,7 +981,6 @@ int expand_stack(struct vm_area_struct * vma->vm_mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) vma->vm_mm->locked_vm += grow; - spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -977,19 +1014,16 @@ int expand_stack(struct vm_area_struct * * the spinlock only before relocating the vma range ourself. */ address &= PAGE_MASK; - spin_lock(&vma->vm_mm->page_table_lock); grow = (vma->vm_start - address) >> PAGE_SHIFT; /* Overcommit.. */ if (security_vm_enough_memory(grow)) { - spin_unlock(&vma->vm_mm->page_table_lock); return -ENOMEM; } if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { - spin_unlock(&vma->vm_mm->page_table_lock); vm_unacct_memory(grow); return -ENOMEM; } @@ -998,7 +1032,6 @@ int expand_stack(struct vm_area_struct * vma->vm_mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) vma->vm_mm->locked_vm += grow; - spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -1116,6 +1149,9 @@ static void unmap_vma(struct mm_struct * area->vm_ops->close(area); if (area->vm_file) fput(area->vm_file); + + anon_vma_unlink(area); + kmem_cache_free(vm_area_cachep, area); } @@ -1165,8 +1201,6 @@ static void unmap_region(struct mm_struc /* * Create a list of vma's touched by the unmap, removing them from the mm's * vma list as we go.. - * - * Called with the page_table_lock held. */ static void detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, @@ -1228,15 +1262,16 @@ int split_vma(struct mm_struct * mm, str if (mapping) down(&mapping->i_shared_sem); spin_lock(&mm->page_table_lock); + anon_vma_lock(vma); - if (new_below) { - vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); - } else + if (new_below) + move_vma_start(vma, addr); + else vma->vm_end = addr; __insert_vm_struct(mm, new); + anon_vma_unlock(vma); spin_unlock(&mm->page_table_lock); if (mapping) up(&mapping->i_shared_sem); @@ -1341,6 +1376,7 @@ unsigned long do_brk(unsigned long addr, struct vm_area_struct * vma, * prev; unsigned long flags; struct rb_node ** rb_link, * rb_parent; + unsigned long pgoff; len = PAGE_ALIGN(len); if (!len) @@ -1383,9 +1419,11 @@ unsigned long do_brk(unsigned long addr, flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + pgoff = addr >> PAGE_SHIFT; + /* Can we just expand an old anonymous mapping? 
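Note why both do_mmap_pgoff() and do_brk() now seed vm_pgoff with addr >> PAGE_SHIFT for anonymous mappings: objrmap recovers virtual addresses from (page->index, vm_pgoff) pairs, so anonymous vmas need a pgoff expressed in the same linear scale as addresses. A sketch of the two directions, mirroring anon_vma_page_link() and find_pte() later in the patch (helper names are hypothetical; assumes PAGE_CACHE_SHIFT == PAGE_SHIFT as is the case for anonymous pages):

	/* linear index that a page mapped at 'address' gets in this vma */
	static unsigned long index_of_address(struct vm_area_struct *vma,
					      unsigned long address)
	{
		return ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	}

	/* inverse: the address where a page with 'index' lives in this vma */
	static unsigned long address_of_index(struct vm_area_struct *vma,
					      unsigned long index)
	{
		return vma->vm_start + ((index - vma->vm_pgoff) << PAGE_SHIFT);
	}

With vm_pgoff seeded this way, an anonymous page's index is simply its virtual page frame number, and it stays valid when the vma is merged with or split from its neighbours.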
*/ if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len, - flags, NULL, 0)) + flags, NULL, pgoff)) goto out; /* @@ -1403,10 +1441,11 @@ unsigned long do_brk(unsigned long addr, vma->vm_flags = flags; vma->vm_page_prot = protection_map[flags & 0x0f]; vma->vm_ops = NULL; - vma->vm_pgoff = 0; + vma->vm_pgoff = pgoff; vma->vm_file = NULL; vma->vm_private_data = NULL; INIT_LIST_HEAD(&vma->shared); + vma->anon_vma = NULL; vma_link(mm, vma, prev, rb_link, rb_parent); @@ -1466,6 +1505,7 @@ void exit_mmap(struct mm_struct *mm) } if (vma->vm_file) fput(vma->vm_file); + anon_vma_unlink(vma); kmem_cache_free(vm_area_cachep, vma); vma = next; } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/mprotect.c xx/mm/mprotect.c --- 2.6.5-rc2/mm/mprotect.c 2003-10-05 17:15:12.000000000 +0200 +++ xx/mm/mprotect.c 2004-03-21 15:21:42.000000000 +0100 @@ -106,6 +106,8 @@ change_protection(struct vm_area_struct spin_unlock(¤t->mm->page_table_lock); return; } + +#if VMA_MERGING_FIXUP /* * Try to merge a vma with the previous flag, return 1 if successful or 0 if it * was impossible. @@ -149,6 +151,7 @@ mprotect_attempt_merge(struct vm_area_st spin_unlock(&mm->page_table_lock); return 1; } +#endif static int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, @@ -184,6 +187,7 @@ mprotect_fixup(struct vm_area_struct *vm newprot = protection_map[newflags & 0xf]; if (start == vma->vm_start) { +#if VMA_MERGING_FIXUP /* * Try to merge with the previous vma. */ @@ -191,6 +195,7 @@ mprotect_fixup(struct vm_area_struct *vm vma = *pprev; goto success; } +#endif } else { error = split_vma(mm, vma, start, 1); if (error) @@ -212,7 +217,9 @@ mprotect_fixup(struct vm_area_struct *vm vma->vm_flags = newflags; vma->vm_page_prot = newprot; spin_unlock(&mm->page_table_lock); +#if VMA_MERGING_FIXUP success: +#endif change_protection(vma, start, end, newprot); return 0; @@ -314,6 +321,7 @@ sys_mprotect(unsigned long start, size_t } } +#if VMA_MERGING_FIXUP if (next && prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags) && !prev->vm_file && !(prev->vm_flags & VM_SHARED)) { @@ -325,6 +333,7 @@ sys_mprotect(unsigned long start, size_t kmem_cache_free(vm_area_cachep, next); prev->vm_mm->map_count--; } +#endif out: up_write(¤t->mm->mmap_sem); return error; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/mremap.c xx/mm/mremap.c --- 2.6.5-rc2/mm/mremap.c 2004-02-20 17:26:54.000000000 +0100 +++ xx/mm/mremap.c 2004-03-21 15:21:42.000000000 +0100 @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -81,7 +80,7 @@ static inline pte_t *alloc_one_pte_map(s static int copy_one_pte(struct vm_area_struct *vma, unsigned long old_addr, - pte_t *src, pte_t *dst, struct pte_chain **pte_chainp) + pte_t *src, pte_t *dst) { int error = 0; pte_t pte; @@ -91,8 +90,6 @@ copy_one_pte(struct vm_area_struct *vma, page = pte_page(*src); if (!pte_none(*src)) { - if (page) - page_remove_rmap(page, src); pte = ptep_clear_flush(vma, old_addr, src); if (!dst) { /* No dest? We must put it back. 
*/ @@ -100,8 +97,6 @@ copy_one_pte(struct vm_area_struct *vma, error++; } set_pte(dst, pte); - if (page) - *pte_chainp = page_add_rmap(page, dst, *pte_chainp); } return error; } @@ -113,13 +108,7 @@ move_one_page(struct vm_area_struct *vma struct mm_struct *mm = vma->vm_mm; int error = 0; pte_t *src, *dst; - struct pte_chain *pte_chain; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { - error = -ENOMEM; - goto out; - } spin_lock(&mm->page_table_lock); src = get_one_pte_map_nested(mm, old_addr); if (src) { @@ -140,15 +129,12 @@ move_one_page(struct vm_area_struct *vma * page_table_lock, we should re-check the src entry... */ if (src) { - error = copy_one_pte(vma, old_addr, src, - dst, &pte_chain); + error = copy_one_pte(vma, old_addr, src, dst); pte_unmap_nested(src); } pte_unmap(dst); } spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); -out: return error; } @@ -190,12 +176,17 @@ static unsigned long move_vma(struct vm_ unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long new_addr) { +#if VMA_MERGING_FIXUP struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *next, *prev; +#else + struct vm_area_struct *new_vma; +#endif int allocated_vma; int split = 0; new_vma = NULL; +#if VMA_MERGING_FIXUP next = find_vma_prev(mm, new_addr, &prev); if (next) { if (prev && prev->vm_end == new_addr && @@ -237,6 +228,7 @@ static unsigned long move_vma(struct vm_ new_vma = prev; } } +#endif allocated_vma = 0; if (!new_vma) { diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/nommu.c xx/mm/nommu.c --- 2.6.5-rc2/mm/nommu.c 2004-02-04 16:07:06.000000000 +0100 +++ xx/mm/nommu.c 2004-03-21 15:21:42.000000000 +0100 @@ -568,6 +568,6 @@ unsigned long get_unmapped_area(struct f return -ENOMEM; } -void pte_chain_init(void) +void anon_vma_init(void) { } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/objrmap.c xx/mm/objrmap.c --- 2.6.5-rc2/mm/objrmap.c 1970-01-01 01:00:00.000000000 +0100 +++ xx/mm/objrmap.c 2004-03-21 15:21:42.000000000 +0100 @@ -0,0 +1,642 @@ +/* + * mm/objrmap.c + * + * Provides methods for unmapping all sorts of mapped pages + * using the vma objects; the clever part of objrmap is the + * tracking of which vmas to analyze for every given mapped page. + * The anon_vma methods track anonymous pages, + * and the inode methods track pages belonging + * to an inode. + * + * anonymous methods by Andrea Arcangeli 2004 + * inode methods by Dave McCracken 2003, 2004 + */ + +/* + * try_to_unmap/page_referenced/page_add_rmap/page_remove_rmap + * inherit from the rmap design mm/rmap.c under + * Copyright 2001, Rik van Riel + * Released under the General Public License (GPL). + */ + +#include +#include +#include +#include +#include +#include + +kmem_cache_t * anon_vma_cachep; + +//#define OBJRMAP_DEBUG /* can be enabled only for debugging */ + +static inline void validate_anon_vma_find_vma(struct vm_area_struct * find_vma) +{ +#ifdef OBJRMAP_DEBUG + struct vm_area_struct * vma; + anon_vma_t * anon_vma = find_vma->anon_vma; + unsigned long mapcount = 0; + int found = 0; + + list_for_each_entry(vma, &anon_vma->anon_vma_head, anon_vma_node) { + mapcount += 1; + BUG_ON(mapcount > 100000); + if (vma == find_vma) + found = 1; + } + BUG_ON(!found); +#endif +} + +/** + * find_pte - Find a pte pointer given a vma and a struct page. + * @vma: the vma to search + * @page: the page to find + * + * Determine if this page is mapped in this vma. 
If it is, map and return + * the pte pointer associated with it. Return null if the page is not + * mapped in this vma for any reason. + * + * This is strictly an internal helper function for the object-based rmap + * functions. + * + * It is the caller's responsibility to unmap the pte if it is returned. + */ +static inline pte_t * +find_pte(struct vm_area_struct *vma, struct page *page, unsigned long *addr) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long loffset; + unsigned long address; + + loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT); + if (address < vma->vm_start || address >= vma->vm_end) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) + goto out; + + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap; + + if (addr) + *addr = address; + + return pte; + +out_unmap: + pte_unmap(pte); +out: + return NULL; +} + +/** + * page_referenced_one - referenced check for object-based rmap + * @vma: the vma to look in. + * @page: the page we're working on. + * + * Find a pte entry for a page/vma pair, then check and clear the referenced + * bit. + * + * This is strictly a helper function for the page_referenced_inode and + * page_referenced_anon walks. + */ +static int +page_referenced_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + pte_t *pte; + int referenced = 0; + + if (!spin_trylock(&mm->page_table_lock)) + goto out; + + pte = find_pte(vma, page, NULL); + if (pte) { + if (ptep_test_and_clear_young(pte)) + referenced++; + pte_unmap(pte); + } + + spin_unlock(&mm->page_table_lock); + out: + return referenced; +} + +/** + * page_referenced_inode - referenced check for object-based rmap + * @page: the page we're checking references on. + * + * For an object-based mapped page, find all the places it is mapped and + * check/clear the referenced flag. This is done by following the page->as.mapping + * pointer, then walking the chain of vmas it holds. It returns the number + * of references it found. + * + * This function is only called from page_referenced for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * assume a reference count of 1. 
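The same trylock-and-walk pattern reappears in try_to_unmap_inode() further down; schematically (for_each_mapper is a hypothetical condensation; the list and field names are the 2.6.5 ones the patch uses):

	/*
	 * Schematic objrmap walk over every vma that can map a file page.
	 * apply() stands in for page_referenced_one(); try_to_unmap_inode()
	 * does the same walk but checks apply()'s result and page->mapcount
	 * so it can stop early.
	 */
	static int for_each_mapper(struct page *page,
				   int (*apply)(struct vm_area_struct *,
						struct page *))
	{
		struct address_space *mapping = page->mapping;
		struct vm_area_struct *vma;
		int ret = 0;

		if (down_trylock(&mapping->i_shared_sem))
			return ret;	/* contended: caller retries later */
		list_for_each_entry(vma, &mapping->i_mmap, shared)
			ret += apply(vma, page);
		list_for_each_entry(vma, &mapping->i_mmap_shared, shared)
			ret += apply(vma, page);
		up(&mapping->i_shared_sem);
		return ret;
	}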
+ */ +static int +page_referenced_inode(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int referenced = 0; + + BUG_ON(PageSwapCache(page)); + + if (down_trylock(&mapping->i_shared_sem)) + goto out; + + list_for_each_entry(vma, &mapping->i_mmap, shared) + referenced += page_referenced_one(vma, page); + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) + referenced += page_referenced_one(vma, page); + + up(&mapping->i_shared_sem); + out: + return referenced; +} + +static int page_referenced_anon(struct page *page) +{ + int referenced; + struct vm_area_struct * vma; + anon_vma_t * anon_vma = (anon_vma_t *) page->mapping; + + referenced = 0; + spin_lock(&anon_vma->anon_vma_lock); + BUG_ON(list_empty(&anon_vma->anon_vma_head)); + list_for_each_entry(vma, &anon_vma->anon_vma_head, anon_vma_node) + referenced += page_referenced_one(vma, page); + spin_unlock(&anon_vma->anon_vma_lock); + + return referenced; +} + +/** + * page_referenced - test if the page was referenced + * @page: the page to test + * + * Quick test_and_clear_referenced for all mappings to a page, + * returns the number of processes which referenced the page. + * + * Caller needs to hold the page_map_lock. + */ +int fastcall page_referenced(struct page * page) +{ + int referenced = 0; + + if (!page_mapped(page)) + goto out; + + /* + * We need an object to reach the ptes, all mapped + * pages must provide some method in their mapping. + * Subtle: this checks for page->as.anon_vma/vma too ;). + */ + BUG_ON(!page->mapping); + + if (page_test_and_clear_young(page)) + mark_page_accessed(page); + + if (TestClearPageReferenced(page)) + referenced++; + + if (!PageAnon(page)) + referenced += page_referenced_inode(page); + else + referenced += page_referenced_anon(page); + + out: + return referenced; +} + +/* this needs the page->flags PG_map_lock held */ +static inline void anon_vma_page_link(struct page * page, struct vm_area_struct * vma, + unsigned long address) +{ + unsigned long index = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + BUG_ON(!vma->anon_vma); + if (page->mapcount == 1) { + page->index = index; + BUG_ON(page->mapping); + page->mapping = (struct address_space *) vma->anon_vma; + } else { + BUG_ON(vma->anon_vma != (anon_vma_t *) page->mapping || index != page->index); + } +} + +/** + * page_add_rmap - add reverse mapping entry to a page + * @page: the page to add the mapping to + * @vma: the vma that is covering the page + * + * Add a new pte reverse mapping to a page. + */ +void fastcall page_add_rmap(struct page *page, struct vm_area_struct * vma, + unsigned long address, int anon) +{ + int last_anon; + + if (PageReserved(page)) + return; + + BUG_ON(vma->vm_flags & VM_RESERVED); + + page_map_lock(page); + + /* + * Setting and clearing PG_anon must always happen inside + * page_map_lock to avoid races between mapping and + * unmapping on different processes of the same + * shared cow swapcache page. And while we take the + * page_map_lock PG_anon cannot change from under us. + * Actually PG_anon cannot change under fork either + * since fork holds a reference on the page so it cannot + * be unmapped under fork and in turn copy_page_range is + * allowed to read PG_anon outside the page_map_lock. 
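Since PG_anon and the overloaded mapping pointer do a lot of work in this file, it may help to spell the convention out (page_anon_vma is a hypothetical accessor; the patch always open-codes the cast):

	/*
	 * page->mapping is overloaded: for PageAnon() pages it holds an
	 * anon_vma pointer, for file pages an address_space pointer, and
	 * NULL means the VM does not track the page at all.
	 */
	static anon_vma_t *page_anon_vma(struct page *page)
	{
		return PageAnon(page) ? (anon_vma_t *) page->mapping : NULL;
	}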
+ */ + last_anon = PageAnon(page); + if (anon && !last_anon) + SetPageAnon(page); + BUG_ON(!anon && last_anon); + + if (!page->mapcount++) + inc_page_state(nr_mapped); + + if (PageAnon(page)) + anon_vma_page_link(page, vma, address); + else { + /* + * If this is an object-based page, just count it. + * We can find the mappings by walking the object + * vma chain for that object. + */ + BUG_ON(PageSwapCache(page)); + BUG_ON(!page->mapping); + } + + page_map_unlock(page); +} + +/* this needs the page->flags PG_map_lock held */ +static inline void anon_vma_page_unlink(struct page * page) +{ + BUG_ON(!page->mapping); + /* + * Cleanup if this anon page is gone + * as far as the vm is concerned. + */ + if (!page->mapcount) { + page->mapping = NULL; + ClearPageAnon(page); + } +} + +/** + * page_remove_rmap - take down reverse mapping to a page + * @page: page to remove mapping from + * + * Removes the reverse mapping from the page; after that the + * caller can clear the page table entry and free + * the page. + */ +void fastcall page_remove_rmap(struct page *page) +{ + if (PageReserved(page)) + return; + + page_map_lock(page); + + if (!page_mapped(page)) + goto out_unlock; + + if (!--page->mapcount) + dec_page_state(nr_mapped); + + if (PageAnon(page)) + anon_vma_page_unlink(page); + else { + /* + * If this is an object-based page, just uncount it. + * We can find the mappings by walking the object vma + * chain for that object. + */ + BUG_ON(!page->mapping); + BUG_ON(PageSwapCache(page)); + } + + out_unlock: + page_map_unlock(page); + return; +} + +/** + * try_to_unmap_one - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Determine whether a page is mapped in a given vma and unmap it if it's found. + * + * This function is strictly a helper for the try_to_unmap_inode and + * try_to_unmap_anon walks. + */ +static int +try_to_unmap_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte; + pte_t pteval; + int ret = SWAP_AGAIN; + + if (!spin_trylock(&mm->page_table_lock)) + return ret; + + pte = find_pte(vma, page, &address); + if (!pte) + goto out; + + BUG_ON(vma->vm_flags & VM_RESERVED); + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + goto out_unmap; + } + + flush_cache_page(vma, address); + pteval = ptep_clear_flush(vma, address, pte); + + if (PageSwapCache(page)) { + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... + */ + swp_entry_t entry = { .val = page->private }; + swap_duplicate(entry); + set_pte(pte, swp_entry_to_pte(entry)); + BUG_ON(pte_file(*pte)); + } else { + unsigned long pgidx; + /* + * If a nonlinear mapping then store the file page offset + * in the pte. + */ + pgidx = (address - vma->vm_start) >> PAGE_SHIFT; + pgidx += vma->vm_pgoff; + pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + if (page->index != pgidx) { + set_pte(pte, pgoff_to_pte(page->index)); + BUG_ON(!pte_file(*pte)); + } + } + + if (pte_dirty(pteval)) + set_page_dirty(page); + + BUG_ON(!page->mapcount); + + mm->rss--; + if (!--page->mapcount && PageAnon(page)) + anon_vma_page_unlink(page); + page_cache_release(page); + +out_unmap: + pte_unmap(pte); + +out: + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap_inode - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. 
+ * + * This function is only called from try_to_unmap for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * return a temporary error. + */ +static int +try_to_unmap_inode(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int ret = SWAP_AGAIN; + + BUG_ON(PageSwapCache(page)); + + if (down_trylock(&mapping->i_shared_sem)) + return ret; + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + ret = try_to_unmap_one(vma, page); + if (ret == SWAP_FAIL || !page->mapcount) + goto out; + } + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + ret = try_to_unmap_one(vma, page); + if (ret == SWAP_FAIL || !page->mapcount) + goto out; + } + +out: + up(&mapping->i_shared_sem); + return ret; +} + +static int +try_to_unmap_anon(struct page * page) +{ + int ret = SWAP_AGAIN; + struct vm_area_struct * vma; + anon_vma_t * anon_vma = (anon_vma_t *) page->mapping; + + if (!PageSwapCache(page)) + return SWAP_AGAIN; + + spin_lock(&anon_vma->anon_vma_lock); + BUG_ON(list_empty(&anon_vma->anon_vma_head)); + list_for_each_entry(vma, &anon_vma->anon_vma_head, anon_vma_node) { + ret = try_to_unmap_one(vma, page); + if (ret == SWAP_FAIL || !page->mapcount) + break; + } + spin_unlock(&anon_vma->anon_vma_lock); + + return ret; +} + +/** + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. + * + * Caller must hold the page_map_lock. + * + * Return values are: + * + * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a trylock, try again later + * SWAP_FAIL - the page is unswappable + */ +int fastcall try_to_unmap(struct page * page) +{ + int ret = SWAP_SUCCESS; + + /* This page should not be on the pageout lists. */ + BUG_ON(PageReserved(page)); + BUG_ON(!PageLocked(page)); + + /* + * We need an object to reach the ptes. + * Subtle: this checks for page->as.anon_vma too ;). + */ + BUG_ON(!page->mapping); + + if (!PageAnon(page)) + ret = try_to_unmap_inode(page); + else + ret = try_to_unmap_anon(page); + + if (!page_mapped(page)) { + dec_page_state(nr_mapped); + ret = SWAP_SUCCESS; + } + return ret; +} + +/* + * No more VM stuff below this comment, only anon_vma helper + * functions. + */ + +/* This must be called under the mmap_sem. 
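On the consumer side, the tristate documented for try_to_unmap() above is what the pageout loop keys off; a sketch of a caller honouring that contract (try_to_reclaim_mappings is hypothetical; the real consumer is shrink_list(), which additionally writes the page out and frees it):

	/*
	 * Sketch: how a pageout path consumes try_to_unmap()'s result.
	 * Called with the page locked and the page_map_lock held, per the
	 * contract above.  Returns 1 if the page is now fully unmapped.
	 */
	static int try_to_reclaim_mappings(struct page *page)
	{
		switch (try_to_unmap(page)) {
		case SWAP_FAIL:
			return 0;	/* unswappable (e.g. VM_LOCKED): keep */
		case SWAP_AGAIN:
			return 0;	/* missed a trylock: revisit later */
		case SWAP_SUCCESS:
			return 1;	/* no mappers left: eligible for pageout */
		}
		return 0;
	}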
*/ +int fastcall anon_vma_prepare(struct vm_area_struct * vma) +{ + anon_vma_t * anon_vma = vma->anon_vma; + + might_sleep(); + if (!anon_vma) { + anon_vma = anon_vma_alloc(); + if (!anon_vma) + return -ENOMEM; + vma->anon_vma = anon_vma; + /* mmap_sem to protect against threads is enough */ + list_add(&vma->anon_vma_node, &anon_vma->anon_vma_head); + } + return 0; +} + +void fastcall anon_vma_merge(struct vm_area_struct * vma, + struct vm_area_struct * vma_dying) +{ + anon_vma_t * anon_vma; + + anon_vma = vma_dying->anon_vma; + if (!anon_vma) + return; + + if (!vma->anon_vma) { + /* this is serialized by the mmap_sem */ + vma->anon_vma = anon_vma; + + spin_lock(&anon_vma->anon_vma_lock); + list_add(&vma->anon_vma_node, &vma_dying->anon_vma_node); + list_del(&vma_dying->anon_vma_node); + spin_unlock(&anon_vma->anon_vma_lock); + } else { + /* if they're both non-null they must be the same */ + BUG_ON(vma->anon_vma != anon_vma); + + spin_lock(&anon_vma->anon_vma_lock); + list_del(&vma_dying->anon_vma_node); + spin_unlock(&anon_vma->anon_vma_lock); + } +} + +void fastcall __anon_vma_link(struct vm_area_struct * vma) +{ + anon_vma_t * anon_vma = vma->anon_vma; + + if (anon_vma) { + list_add(&vma->anon_vma_node, &anon_vma->anon_vma_head); + validate_anon_vma_find_vma(vma); + } +} + +void fastcall anon_vma_link(struct vm_area_struct * vma) +{ + anon_vma_t * anon_vma = vma->anon_vma; + + if (anon_vma) { + spin_lock(&anon_vma->anon_vma_lock); + list_add(&vma->anon_vma_node, &anon_vma->anon_vma_head); + validate_anon_vma_find_vma(vma); + spin_unlock(&anon_vma->anon_vma_lock); + } +} + +void fastcall anon_vma_unlink(struct vm_area_struct * vma) +{ + anon_vma_t * anon_vma; + int empty = 0; + + anon_vma = vma->anon_vma; + if (!anon_vma) + return; + + spin_lock(&anon_vma->anon_vma_lock); + validate_anon_vma_find_vma(vma); + list_del(&vma->anon_vma_node); + /* We must garbage collect the anon_vma if it's empty */ + if (list_empty(&anon_vma->anon_vma_head)) + empty = 1; + spin_unlock(&anon_vma->anon_vma_lock); + + if (empty) + anon_vma_free(anon_vma); +} + +static void +anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) +{ + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + anon_vma_t * anon_vma = (anon_vma_t *) data; + + spin_lock_init(&anon_vma->anon_vma_lock); + INIT_LIST_HEAD(&anon_vma->anon_vma_head); + } +} + +void __init anon_vma_init(void) +{ + /* this is intentionally not hw aligned to avoid wasting ram */ + anon_vma_cachep = kmem_cache_create("anon_vma", + sizeof(anon_vma_t), 0, 0, + anon_vma_ctor, NULL); + + if (!anon_vma_cachep) + panic("Cannot create anon_vma SLAB cache"); +} diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/page_alloc.c xx/mm/page_alloc.c --- 2.6.5-rc2/mm/page_alloc.c 2004-03-21 15:09:25.000000000 +0100 +++ xx/mm/page_alloc.c 2004-03-21 15:21:42.000000000 +0100 @@ -71,21 +71,26 @@ static int bad_range(struct zone *zone, static void bad_page(const char *function, struct page *page) { - printk("Bad page state at %s (in process '%s', page %p)\n", function, current->comm, page); - printk("flags:0x%08lx mapping:%p mapped:%d count:%d\n", + printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", + function, current->comm, page); + printk(KERN_EMERG "flags:0x%08lx mapping:%p mapped:%d count:%d\n", (unsigned long)page->flags, page->mapping, page_mapped(page), page_count(page)); - printk("Backtrace:\n"); + printk(KERN_EMERG "Backtrace:\n"); dump_stack(); - printk("Trying 
to fix it up, but a reboot is needed\n"); + printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); page->flags &= ~(1 << PG_private | 1 << PG_locked | 1 << PG_lru | 1 << PG_active | 1 << PG_dirty | + 1 << PG_swapcache | + 1 << PG_anon | + 1 << PG_maplock | 1 << PG_writeback); set_page_count(page, 0); page->mapping = NULL; + page->mapcount = 0; } #ifndef CONFIG_HUGETLB_PAGE @@ -99,13 +104,13 @@ static void bad_page(const char *functio * * The remaining PAGE_SIZE pages are called "tail pages". * - * All pages have PG_compound set. All pages have their lru.next pointing at + * All pages have PG_compound set. All pages have their ->private pointing at * the head page (even the head page has this). * - * The head page's lru.prev, if non-zero, holds the address of the compound - * page's put_page() function. + * The first tail page's ->mapping, if non-zero, holds the address of the + * compound page's put_page() function. * - * The order of the allocation is stored in the first tail page's lru.prev. + * The order of the allocation is stored in the first tail page's ->index * This is only for debug at present. This usage means that zero-order pages * may not be compound. */ @@ -114,13 +119,13 @@ static void prep_compound_page(struct pa int i; int nr_pages = 1 << order; - page->lru.prev = NULL; - page[1].lru.prev = (void *)order; + page[1].mapping = 0; + page[1].index = order; for (i = 0; i < nr_pages; i++) { struct page *p = page + i; SetPageCompound(p); - p->lru.next = (void *)page; + p->private = (unsigned long)page; } } @@ -129,7 +134,7 @@ static void destroy_compound_page(struct int i; int nr_pages = 1 << order; - if (page[1].lru.prev != (void *)order) + if (page[1].index != order) bad_page(__FUNCTION__, page); for (i = 0; i < nr_pages; i++) { @@ -137,7 +142,7 @@ static void destroy_compound_page(struct if (!PageCompound(p)) bad_page(__FUNCTION__, page); - if (p->lru.next != (void *)page) + if (p->private != (unsigned long)page) bad_page(__FUNCTION__, page); ClearPageCompound(p); } @@ -199,19 +204,18 @@ static inline void __free_pages_bulk (st buddy2 = base + page_idx; BUG_ON(bad_range(zone, buddy1)); BUG_ON(bad_range(zone, buddy2)); - list_del(&buddy1->list); + list_del(&buddy1->lru); mask <<= 1; area++; index >>= 1; page_idx &= mask; } - list_add(&(base + page_idx)->list, &area->free_list); + list_add(&(base + page_idx)->lru, &area->free_list); } static inline void free_pages_check(const char *function, struct page *page) { - if ( page_mapped(page) || - page->mapping != NULL || + if ( page->mapping != NULL || page_count(page) != 0 || (page->flags & ( 1 << PG_lru | @@ -220,6 +224,9 @@ static inline void free_pages_check(cons 1 << PG_active | 1 << PG_reclaim | 1 << PG_slab | + 1 << PG_swapcache | + 1 << PG_anon | + 1 << PG_maplock | 1 << PG_writeback ))) bad_page(function, page); if (PageDirty(page)) @@ -253,9 +260,9 @@ free_pages_bulk(struct zone *zone, int c zone->all_unreclaimable = 0; zone->pages_scanned = 0; while (!list_empty(list) && count--) { - page = list_entry(list->prev, struct page, list); + page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_pages_bulk list manipulates */ - list_del(&page->list); + list_del(&page->lru); __free_pages_bulk(page, base, zone, area, mask, order); ret++; } @@ -271,7 +278,7 @@ void __free_pages_ok(struct page *page, mod_page_state(pgfree, 1 << order); for (i = 0 ; i < (1 << order) ; ++i) free_pages_check(__FUNCTION__, page + i); - list_add(&page->list, &list); + list_add(&page->lru, &list); 
kernel_map_pages(page, 1<<order, 0); free_pages_bulk(zone, 1, &list, order); } [...] size >>= 1; - list_add(&page->list, &area->free_list); + list_add(&page->lru, &area->free_list); MARK_USED(index, high, area); index += size; page += size; @@ -319,7 +326,7 @@ static inline void set_page_refs(struct */ static void prep_new_page(struct page *page, int order) { - if (page->mapping || page_mapped(page) || + if (page->mapping || (page->flags & ( 1 << PG_private | 1 << PG_locked | @@ -327,6 +334,9 @@ static void prep_new_page(struct page *p 1 << PG_active | 1 << PG_dirty | 1 << PG_reclaim | + 1 << PG_anon | + 1 << PG_maplock | + 1 << PG_swapcache | 1 << PG_writeback ))) bad_page(__FUNCTION__, page); @@ -353,8 +363,8 @@ static struct page *__rmqueue(struct zon if (list_empty(&area->free_list)) continue; - page = list_entry(area->free_list.next, struct page, list); - list_del(&page->list); + page = list_entry(area->free_list.next, struct page, lru); + list_del(&page->lru); index = page - zone->zone_mem_map; if (current_order != MAX_ORDER-1) MARK_USED(index, current_order, area); @@ -384,7 +394,7 @@ static int rmqueue_bulk(struct zone *zon if (page == NULL) break; allocated++; - list_add_tail(&page->list, list); + list_add_tail(&page->lru, list); } spin_unlock_irqrestore(&zone->lock, flags); return allocated; @@ -426,7 +436,7 @@ int is_head_of_free_region(struct page * spin_lock_irqsave(&zone->lock, flags); for (order = MAX_ORDER - 1; order >= 0; --order) list_for_each(curr, &zone->free_area[order].free_list) - if (page == list_entry(curr, struct page, list)) { + if (page == list_entry(curr, struct page, lru)) { spin_unlock_irqrestore(&zone->lock, flags); return 1 << order; } @@ -464,7 +474,7 @@ static void fastcall free_hot_cold_page( local_irq_save(flags); if (pcp->count >= pcp->high) pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); - list_add(&page->list, &pcp->list); + list_add(&page->lru, &pcp->list); pcp->count++; local_irq_restore(flags); put_cpu(); @@ -500,8 +510,8 @@ static struct page *buffered_rmqueue(str pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); if (pcp->count) { - page = list_entry(pcp->list.next, struct page, list); - list_del(&page->list); + page = list_entry(pcp->list.next, struct page, lru); + list_del(&page->lru); pcp->count--; } local_irq_restore(flags); @@ -512,14 +522,14 @@ static struct page *buffered_rmqueue(str spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order); spin_unlock_irqrestore(&zone->lock, flags); - if (order && page) - prep_compound_page(page, order); } if (page != NULL) { BUG_ON(bad_range(zone, page)); mod_page_state_zone(zone, pgalloc, 1 << order); prep_new_page(page, order); + if (order) + prep_compound_page(page, order); } return page; } @@ -1360,7 +1370,7 @@ void __init memmap_init_zone(struct page set_page_zone(page, NODEZONE(nid, zone)); set_page_count(page, 0); SetPageReserved(page); - INIT_LIST_HEAD(&page->list); + INIT_LIST_HEAD(&page->lru); #ifdef WANT_PAGE_VIRTUAL /* The shift won't overflow because ZONE_NORMAL is below 4G. 
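The compound-page bookkeeping above moved off page->lru, which the buddy and per-cpu lists now own, onto fields the allocator is free to reuse. In summary (accessor names are hypothetical; the layout is the one the comments above describe):

	/* Every subpage's ->private points back at the head page. */
	static struct page *compound_head_of(struct page *page)
	{
		return (struct page *) page->private;
	}

	/*
	 * The first tail page carries the allocation order in ->index;
	 * its ->mapping, if set, holds the compound put_page() destructor.
	 */
	static unsigned long compound_order_of(struct page *head)
	{
		return head[1].index;
	}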
*/ if (zone != ZONE_HIGHMEM) diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/page_io.c xx/mm/page_io.c --- 2.6.5-rc2/mm/page_io.c 2002-12-15 04:18:17.000000000 +0100 +++ xx/mm/page_io.c 2004-03-21 15:21:42.000000000 +0100 @@ -32,7 +32,7 @@ get_swap_bio(int gfp_flags, struct page swp_entry_t entry; BUG_ON(!PageSwapCache(page)); - entry.val = page->index; + entry.val = page->private; sis = get_swap_info_struct(swp_type(entry)); bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * @@ -104,7 +104,7 @@ int swap_writepage(struct page *page, st goto out; } inc_page_state(pswpout); - SetPageWriteback(page); + set_page_writeback(page); unlock_page(page); submit_bio(WRITE, bio); out: @@ -150,9 +150,9 @@ int rw_swap_page_sync(int rw, swp_entry_ lock_page(page); - BUG_ON(page->mapping); - page->mapping = &swapper_space; - page->index = entry.val; + BUG_ON(page_mapping(page)); + SetPageSwapCache(page); + page->private = entry.val; if (rw == READ) { ret = swap_readpage(NULL, page); @@ -161,7 +161,7 @@ int rw_swap_page_sync(int rw, swp_entry_ ret = swap_writepage(page, &swap_wbc); wait_on_page_writeback(page); } - page->mapping = NULL; + ClearPageSwapCache(page); if (ret == 0 && (!PageUptodate(page) || PageError(page))) ret = -EIO; return ret; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/page-writeback.c xx/mm/page-writeback.c --- 2.6.5-rc2/mm/page-writeback.c 2004-02-04 16:07:06.000000000 +0100 +++ xx/mm/page-writeback.c 2004-03-21 15:21:42.000000000 +0100 @@ -458,7 +458,7 @@ int do_writepages(struct address_space * */ int write_one_page(struct page *page, int wait) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); int ret = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@ -470,12 +470,8 @@ int write_one_page(struct page *page, in if (wait) wait_on_page_writeback(page); - spin_lock(&mapping->page_lock); - list_del(&page->list); - if (test_clear_page_dirty(page)) { - list_add(&page->list, &mapping->locked_pages); + if (clear_page_dirty_for_io(page)) { page_cache_get(page); - spin_unlock(&mapping->page_lock); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { wait_on_page_writeback(page); @@ -484,8 +480,6 @@ int write_one_page(struct page *page, in } page_cache_release(page); } else { - list_add(&page->list, &mapping->clean_pages); - spin_unlock(&mapping->page_lock); unlock_page(page); } return ret; @@ -493,9 +487,8 @@ int write_one_page(struct page *page, in EXPORT_SYMBOL(write_one_page); /* - * For address_spaces which do not use buffers. Just set the page's dirty bit - * and move it to the dirty_pages list. Also perform space reservation if - * required. + * For address_spaces which do not use buffers. Just tag the page as dirty in + * its radix tree. * * __set_page_dirty_nobuffers() may return -ENOSPC. But if it does, the page * is still safe, as long as it actually manages to find some blocks at @@ -510,18 +503,19 @@ int __set_page_dirty_nobuffers(struct pa int ret = 0; if (!TestSetPageDirty(page)) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping) { - spin_lock(&mapping->page_lock); - if (page->mapping) { /* Race with truncate? */ - BUG_ON(page->mapping != mapping); + spin_lock_irq(&mapping->tree_lock); + if (page_mapping(page)) { /* Race with truncate? 
*/ + BUG_ON(page_mapping(page) != mapping); if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); - list_del(&page->list); - list_add(&page->list, &mapping->dirty_pages); + radix_tree_tag_set(&mapping->page_tree, + !PageSwapCache(page) ? page->index : page->private, + PAGECACHE_TAG_DIRTY); } - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); if (!PageSwapCache(page)) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -532,6 +526,24 @@ int __set_page_dirty_nobuffers(struct pa EXPORT_SYMBOL(__set_page_dirty_nobuffers); /* + * If the mapping doesn't provide a set_page_dirty a_op, then + * just fall through and assume that it wants buffer_heads. + * FIXME: make the method unconditional. + */ +int fastcall set_page_dirty(struct page *page) +{ + if (page_mapping(page)) { + int (*spd)(struct page *); + + spd = page_mapping(page)->a_ops->set_page_dirty; + if (spd) + return (*spd)(page); + } + return __set_page_dirty_buffers(page); +} +EXPORT_SYMBOL(set_page_dirty); + +/* * set_page_dirty() is racy if the caller has no reference against * page->mapping->host, and if the page is unlocked. This is because another * CPU could truncate the page off the mapping and then free the mapping. @@ -558,13 +570,136 @@ EXPORT_SYMBOL(set_page_dirty_lock); */ int test_clear_page_dirty(struct page *page) { - if (TestClearPageDirty(page)) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); + unsigned long flags; - if (mapping && !mapping->backing_dev_info->memory_backed) - dec_page_state(nr_dirty); - return 1; + if (mapping) { + spin_lock_irqsave(&mapping->tree_lock, flags); + if (TestClearPageDirty(page)) { + radix_tree_tag_clear(&mapping->page_tree, !PageSwapCache(page) ? page->index : page->private, + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + if (!mapping->backing_dev_info->memory_backed) + dec_page_state(nr_dirty); + return 1; + } + spin_unlock_irqrestore(&mapping->tree_lock, flags); + return 0; } - return 0; + return TestClearPageDirty(page); } EXPORT_SYMBOL(test_clear_page_dirty); + +/* + * Clear a page's dirty flag, while caring for dirty memory accounting. + * Returns true if the page was previously dirty. + * + * This is for preparing to put the page under writeout. We leave the page + * tagged as dirty in the radix tree so that a concurrent write-for-sync + * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage + * implementation will run either set_page_writeback() or set_page_dirty(), + * at which stage we bring the page's dirty flag and radix-tree dirty tag + * back into sync. + * + * This incoherency between the page's dirty flag and radix-tree tag is + * unfortunate, but it only exists while the page is locked. + */ +int clear_page_dirty_for_io(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (mapping) { + if (TestClearPageDirty(page)) { + if (!mapping->backing_dev_info->memory_backed) + dec_page_state(nr_dirty); + return 1; + } + return 0; + } + return TestClearPageDirty(page); +} +EXPORT_SYMBOL(clear_page_dirty_for_io); + +/* + * Clear a page's dirty flag while ignoring dirty memory accounting + */ +int __clear_page_dirty(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (mapping) { + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + if (TestClearPageDirty(page)) { + radix_tree_tag_clear(&mapping->page_tree, !PageSwapCache(page) ? 
page->index : page->private, + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + return 1; + } + spin_unlock_irqrestore(&mapping->tree_lock, flags); + return 0; + } + return TestClearPageDirty(page); +} + +int test_clear_page_writeback(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int ret; + + if (mapping) { + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + ret = TestClearPageWriteback(page); + if (ret) + radix_tree_tag_clear(&mapping->page_tree, !PageSwapCache(page) ? page->index : page->private, + PAGECACHE_TAG_WRITEBACK); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + } else { + ret = TestClearPageWriteback(page); + } + return ret; +} + +int test_set_page_writeback(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int ret; + + if (mapping) { + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + ret = TestSetPageWriteback(page); + if (!ret) + radix_tree_tag_set(&mapping->page_tree, !PageSwapCache(page) ? page->index : page->private, + PAGECACHE_TAG_WRITEBACK); + if (!PageDirty(page)) + radix_tree_tag_clear(&mapping->page_tree, !PageSwapCache(page) ? page->index : page->private, + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + } else { + ret = TestSetPageWriteback(page); + } + return ret; + +} +EXPORT_SYMBOL(test_set_page_writeback); + +/* + * Return true if any of the pages in the mapping are marked with the + * passed tag. + */ +int mapping_tagged(struct address_space *mapping, int tag) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&mapping->tree_lock, flags); + ret = radix_tree_tagged(&mapping->page_tree, tag); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + return ret; +} +EXPORT_SYMBOL(mapping_tagged); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/readahead.c xx/mm/readahead.c --- 2.6.5-rc2/mm/readahead.c 2004-03-21 15:09:25.000000000 +0100 +++ xx/mm/readahead.c 2004-03-21 15:21:42.000000000 +0100 @@ -48,7 +48,7 @@ static inline unsigned long get_min_read return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; } -#define list_to_page(head) (list_entry((head)->prev, struct page, list)) +#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) /** * read_cache_pages - populate an address space with some pages, and @@ -72,7 +72,7 @@ int read_cache_pages(struct address_spac while (!list_empty(pages)) { page = list_to_page(pages); - list_del(&page->list); + list_del(&page->lru); if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { page_cache_release(page); continue; @@ -85,7 +85,7 @@ int read_cache_pages(struct address_spac struct page *victim; victim = list_to_page(pages); - list_del(&victim->list); + list_del(&victim->lru); page_cache_release(victim); } break; @@ -112,7 +112,7 @@ static int read_pages(struct address_spa pagevec_init(&lru_pvec, 0); for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_to_page(pages); - list_del(&page->list); + list_del(&page->lru); if (!add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { mapping->a_ops->readpage(filp, page); @@ -230,7 +230,7 @@ __do_page_cache_readahead(struct address /* * Preallocate as many pages as we will need. 
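One pattern worth calling out in the page-writeback.c tagging code above: the radix-tree slot of a page is page->index for pagecache, but page->private for swapcache, whose index is its swap entry. The patch open-codes the expression in every tag operation; written as a helper it would read (page_tree_index is hypothetical):

	/* Radix-tree slot occupied by a page in its page_mapping(). */
	static unsigned long page_tree_index(struct page *page)
	{
		return !PageSwapCache(page) ? page->index : page->private;
	}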
*/ - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { unsigned long page_offset = offset + page_idx; @@ -241,16 +241,16 @@ __do_page_cache_readahead(struct address if (page) continue; - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); page = page_cache_alloc_cold(mapping); - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); if (!page) break; page->index = page_offset; - list_add(&page->list, &page_pool); + list_add(&page->lru, &page_pool); ret++; } - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); /* * Now start the IO. We ignore I/O errors - if the page is not diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/rmap.c xx/mm/rmap.c --- 2.6.5-rc2/mm/rmap.c 2004-03-21 15:09:25.000000000 +0100 +++ xx/mm/rmap.c 1970-01-01 01:00:00.000000000 +0100 @@ -1,533 +0,0 @@ -/* - * mm/rmap.c - physical to virtual reverse mappings - * - * Copyright 2001, Rik van Riel - * Released under the General Public License (GPL). - * - * - * Simple, low overhead pte-based reverse mapping scheme. - * This is kept modular because we may want to experiment - * with object-based reverse mapping schemes. Please try - * to keep this thing as modular as possible. - */ - -/* - * Locking: - * - the page->pte.chain is protected by the PG_chainlock bit, - * which nests within the the mm->page_table_lock, - * which nests within the page lock. - * - because swapout locking is opposite to the locking order - * in the page fault path, the swapout path uses trylocks - * on the mm->page_table_lock - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* #define DEBUG_RMAP */ - -/* - * Shared pages have a chain of pte_chain structures, used to locate - * all the mappings to this page. We only need a pointer to the pte - * here, the page struct for the page table page contains the process - * it belongs to and the offset within that process. - * - * We use an array of pte pointers in this structure to minimise cache misses - * while traversing reverse maps. - */ -#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t)) - -/* - * next_and_idx encodes both the address of the next pte_chain and the - * offset of the lowest-index used pte in ptes[] (which is equal also - * to the offset of the highest-index unused pte in ptes[], plus one). - */ -struct pte_chain { - unsigned long next_and_idx; - pte_addr_t ptes[NRPTE]; -} ____cacheline_aligned; - -kmem_cache_t *pte_chain_cache; - -static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain) -{ - return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE); -} - -static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr) -{ - return (struct pte_chain *)(pte_chain_addr & ~NRPTE); -} - -static inline int pte_chain_idx(struct pte_chain *pte_chain) -{ - return pte_chain->next_and_idx & NRPTE; -} - -static inline unsigned long -pte_chain_encode(struct pte_chain *pte_chain, int idx) -{ - return (unsigned long)pte_chain | idx; -} - -/* - * pte_chain list management policy: - * - * - If a page has a pte_chain list then it is shared by at least two processes, - * because a single sharing uses PageDirect. (Well, this isn't true yet, - * coz this code doesn't collapse singletons back to PageDirect on the remove - * path). 
- * - A pte_chain list has free space only in the head member - all succeeding - * members are 100% full. - * - If the head element has free space, it occurs in its leading slots. - * - All free space in the pte_chain is at the start of the head member. - * - Insertion into the pte_chain puts a pte pointer in the last free slot of - * the head member. - * - Removal from a pte chain moves the head pte of the head member onto the - * victim pte and frees the head member if it became empty. - */ - -/** - ** VM stuff below this comment - **/ - -/** - * page_referenced - test if the page was referenced - * @page: the page to test - * - * Quick test_and_clear_referenced for all mappings to a page, - * returns the number of processes which referenced the page. - * Caller needs to hold the pte_chain_lock. - * - * If the page has a single-entry pte_chain, collapse that back to a PageDirect - * representation. This way, it's only done under memory pressure. - */ -int fastcall page_referenced(struct page * page) -{ - struct pte_chain *pc; - int referenced = 0; - - if (page_test_and_clear_young(page)) - referenced++; - - if (TestClearPageReferenced(page)) - referenced++; - - if (PageDirect(page)) { - pte_t *pte = rmap_ptep_map(page->pte.direct); - if (ptep_test_and_clear_young(pte)) - referenced++; - rmap_ptep_unmap(pte); - } else { - int nr_chains = 0; - - /* Check all the page tables mapping this page. */ - for (pc = page->pte.chain; pc; pc = pte_chain_next(pc)) { - int i; - - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pte_paddr = pc->ptes[i]; - pte_t *p; - - p = rmap_ptep_map(pte_paddr); - if (ptep_test_and_clear_young(p)) - referenced++; - rmap_ptep_unmap(p); - nr_chains++; - } - } - if (nr_chains == 1) { - pc = page->pte.chain; - page->pte.direct = pc->ptes[NRPTE-1]; - SetPageDirect(page); - pc->ptes[NRPTE-1] = 0; - __pte_chain_free(pc); - } - } - return referenced; -} - -/** - * page_add_rmap - add reverse mapping entry to a page - * @page: the page to add the mapping to - * @ptep: the page table entry mapping this page - * - * Add a new pte reverse mapping to a page. - * The caller needs to hold the mm->page_table_lock. 
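For contrast with the objrmap code earlier in the patch, the trick the deleted pte_chain scheme relied on: pte_chains are cacheline-aligned, so the low bits of a chain pointer are guaranteed clear, and next_and_idx can pack the next-chain pointer together with the lowest used slot index. A condensed restatement of the deleted encode/decode (helper names here are hypothetical; NRPTE is 2^n - 1 by construction, e.g. 7 with 32-byte cachelines and a 4-byte pte_addr_t, so it doubles as the bit mask):

	struct pte_chain;	/* only used through pointers in this sketch */

	static unsigned long pte_chain_encode_sketch(struct pte_chain *next,
						     int idx)
	{
		/* idx <= NRPTE fits entirely in the alignment bits of 'next' */
		return (unsigned long) next | idx;
	}

	static struct pte_chain *next_of(unsigned long next_and_idx)
	{
		return (struct pte_chain *) (next_and_idx & ~NRPTE);
	}

	static int idx_of(unsigned long next_and_idx)
	{
		return next_and_idx & NRPTE;
	}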
- */ -struct pte_chain * fastcall -page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain) -{ - pte_addr_t pte_paddr = ptep_to_paddr(ptep); - struct pte_chain *cur_pte_chain; - - if (PageReserved(page)) - return pte_chain; - - pte_chain_lock(page); - - if (page->pte.direct == 0) { - page->pte.direct = pte_paddr; - SetPageDirect(page); - inc_page_state(nr_mapped); - goto out; - } - - if (PageDirect(page)) { - /* Convert a direct pointer into a pte_chain */ - ClearPageDirect(page); - pte_chain->ptes[NRPTE-1] = page->pte.direct; - pte_chain->ptes[NRPTE-2] = pte_paddr; - pte_chain->next_and_idx = pte_chain_encode(NULL, NRPTE-2); - page->pte.direct = 0; - page->pte.chain = pte_chain; - pte_chain = NULL; /* We consumed it */ - goto out; - } - - cur_pte_chain = page->pte.chain; - if (cur_pte_chain->ptes[0]) { /* It's full */ - pte_chain->next_and_idx = pte_chain_encode(cur_pte_chain, - NRPTE - 1); - page->pte.chain = pte_chain; - pte_chain->ptes[NRPTE-1] = pte_paddr; - pte_chain = NULL; /* We consumed it */ - goto out; - } - cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr; - cur_pte_chain->next_and_idx--; -out: - pte_chain_unlock(page); - return pte_chain; -} - -/** - * page_remove_rmap - take down reverse mapping to a page - * @page: page to remove mapping from - * @ptep: page table entry to remove - * - * Removes the reverse mapping from the pte_chain of the page, - * after that the caller can clear the page table entry and free - * the page. - * Caller needs to hold the mm->page_table_lock. - */ -void fastcall page_remove_rmap(struct page *page, pte_t *ptep) -{ - pte_addr_t pte_paddr = ptep_to_paddr(ptep); - struct pte_chain *pc; - - if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) - return; - - pte_chain_lock(page); - - if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? */ - - if (PageDirect(page)) { - if (page->pte.direct == pte_paddr) { - page->pte.direct = 0; - ClearPageDirect(page); - goto out; - } - } else { - struct pte_chain *start = page->pte.chain; - struct pte_chain *next; - int victim_i = pte_chain_idx(start); - - for (pc = start; pc; pc = next) { - int i; - - next = pte_chain_next(pc); - if (next) - prefetch(next); - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pa = pc->ptes[i]; - - if (pa != pte_paddr) - continue; - pc->ptes[i] = start->ptes[victim_i]; - start->ptes[victim_i] = 0; - if (victim_i == NRPTE-1) { - /* Emptied a pte_chain */ - page->pte.chain = pte_chain_next(start); - __pte_chain_free(start); - } else { - start->next_and_idx++; - } - goto out; - } - } - } -out: - if (page->pte.direct == 0 && page_test_and_clear_dirty(page)) - set_page_dirty(page); - if (!page_mapped(page)) - dec_page_state(nr_mapped); -out_unlock: - pte_chain_unlock(page); - return; -} - -/** - * try_to_unmap_one - worker function for try_to_unmap - * @page: page to unmap - * @ptep: page table entry to unmap from page - * - * Internal helper function for try_to_unmap, called for each page - * table entry mapping a page. Because locking order here is opposite - * to the locking order used by the page fault path, we use trylocks. 
- * Locking: - * page lock shrink_list(), trylock - * pte_chain_lock shrink_list() - * mm->page_table_lock try_to_unmap_one(), trylock - */ -static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t)); -static int fastcall try_to_unmap_one(struct page * page, pte_addr_t paddr) -{ - pte_t *ptep = rmap_ptep_map(paddr); - unsigned long address = ptep_to_address(ptep); - struct mm_struct * mm = ptep_to_mm(ptep); - struct vm_area_struct * vma; - pte_t pte; - int ret; - - if (!mm) - BUG(); - - /* - * We need the page_table_lock to protect us from page faults, - * munmap, fork, etc... - */ - if (!spin_trylock(&mm->page_table_lock)) { - rmap_ptep_unmap(ptep); - return SWAP_AGAIN; - } - - - /* During mremap, it's possible pages are not in a VMA. */ - vma = find_vma(mm, address); - if (!vma) { - ret = SWAP_FAIL; - goto out_unlock; - } - - /* The page is mlock()d, we cannot swap it out. */ - if (vma->vm_flags & VM_LOCKED) { - ret = SWAP_FAIL; - goto out_unlock; - } - - /* Nuke the page table entry. */ - flush_cache_page(vma, address); - pte = ptep_clear_flush(vma, address, ptep); - - if (PageSwapCache(page)) { - /* - * Store the swap location in the pte. - * See handle_pte_fault() ... - */ - swp_entry_t entry = { .val = page->index }; - swap_duplicate(entry); - set_pte(ptep, swp_entry_to_pte(entry)); - BUG_ON(pte_file(*ptep)); - } else { - unsigned long pgidx; - /* - * If a nonlinear mapping then store the file page offset - * in the pte. - */ - pgidx = (address - vma->vm_start) >> PAGE_SHIFT; - pgidx += vma->vm_pgoff; - pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; - if (page->index != pgidx) { - set_pte(ptep, pgoff_to_pte(page->index)); - BUG_ON(!pte_file(*ptep)); - } - } - - /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pte)) - set_page_dirty(page); - - mm->rss--; - page_cache_release(page); - ret = SWAP_SUCCESS; - -out_unlock: - rmap_ptep_unmap(ptep); - spin_unlock(&mm->page_table_lock); - return ret; -} - -/** - * try_to_unmap - try to remove all page table mappings to a page - * @page: the page to get unmapped - * - * Tries to remove all the page table entries which are mapping this - * page, used in the pageout path. Caller must hold the page lock - * and its pte chain lock. Return values are: - * - * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_AGAIN - we missed a trylock, try again later - * SWAP_FAIL - the page is unswappable - */ -int fastcall try_to_unmap(struct page * page) -{ - struct pte_chain *pc, *next_pc, *start; - int ret = SWAP_SUCCESS; - int victim_i; - - /* This page should not be on the pageout lists. */ - if (PageReserved(page)) - BUG(); - if (!PageLocked(page)) - BUG(); - /* We need backing store to swap out a page. */ - if (!page->mapping) - BUG(); - - if (PageDirect(page)) { - ret = try_to_unmap_one(page, page->pte.direct); - if (ret == SWAP_SUCCESS) { - if (page_test_and_clear_dirty(page)) - set_page_dirty(page); - page->pte.direct = 0; - ClearPageDirect(page); - } - goto out; - } - - start = page->pte.chain; - victim_i = pte_chain_idx(start); - for (pc = start; pc; pc = next_pc) { - int i; - - next_pc = pte_chain_next(pc); - if (next_pc) - prefetch(next_pc); - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pte_paddr = pc->ptes[i]; - - switch (try_to_unmap_one(page, pte_paddr)) { - case SWAP_SUCCESS: - /* - * Release a slot. If we're releasing the - * first pte in the first pte_chain then - * pc->ptes[i] and start->ptes[victim_i] both - * refer to the same thing. It works out. 
- */ - pc->ptes[i] = start->ptes[victim_i]; - start->ptes[victim_i] = 0; - victim_i++; - if (victim_i == NRPTE) { - page->pte.chain = pte_chain_next(start); - __pte_chain_free(start); - start = page->pte.chain; - victim_i = 0; - } else { - start->next_and_idx++; - } - if (page->pte.direct == 0 && - page_test_and_clear_dirty(page)) - set_page_dirty(page); - break; - case SWAP_AGAIN: - /* Skip this pte, remembering status. */ - ret = SWAP_AGAIN; - continue; - case SWAP_FAIL: - ret = SWAP_FAIL; - goto out; - } - } - } -out: - if (!page_mapped(page)) - dec_page_state(nr_mapped); - return ret; -} - -/** - ** No more VM stuff below this comment, only pte_chain helper - ** functions. - **/ - -static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags) -{ - struct pte_chain *pc = p; - - memset(pc, 0, sizeof(*pc)); -} - -DEFINE_PER_CPU(struct pte_chain *, local_pte_chain) = 0; - -/** - * __pte_chain_free - free pte_chain structure - * @pte_chain: pte_chain struct to free - */ -void __pte_chain_free(struct pte_chain *pte_chain) -{ - struct pte_chain **pte_chainp; - - pte_chainp = &get_cpu_var(local_pte_chain); - if (pte_chain->next_and_idx) - pte_chain->next_and_idx = 0; - if (*pte_chainp) - kmem_cache_free(pte_chain_cache, *pte_chainp); - *pte_chainp = pte_chain; - put_cpu_var(local_pte_chain); -} - -/* - * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap(). - * - * The caller of page_add_rmap() must perform the allocation because - * page_add_rmap() is invariably called under spinlock. Often, page_add_rmap() - * will not actually use the pte_chain, because there is space available in one - * of the existing pte_chains which are attached to the page. So the case of - * allocating and then freeing a single pte_chain is specially optimised here, - * with a one-deep per-cpu cache. - */ -struct pte_chain *pte_chain_alloc(int gfp_flags) -{ - struct pte_chain *ret; - struct pte_chain **pte_chainp; - - might_sleep_if(gfp_flags & __GFP_WAIT); - - pte_chainp = &get_cpu_var(local_pte_chain); - if (*pte_chainp) { - ret = *pte_chainp; - *pte_chainp = NULL; - put_cpu_var(local_pte_chain); - } else { - put_cpu_var(local_pte_chain); - ret = kmem_cache_alloc(pte_chain_cache, gfp_flags); - } - return ret; -} - -void __init pte_chain_init(void) -{ - pte_chain_cache = kmem_cache_create( "pte_chain", - sizeof(struct pte_chain), - 0, - SLAB_MUST_HWCACHE_ALIGN, - pte_chain_ctor, - NULL); - - if (!pte_chain_cache) - panic("failed to create pte_chain cache!\n"); -} diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/slab.c xx/mm/slab.c --- 2.6.5-rc2/mm/slab.c 2004-03-21 15:09:25.000000000 +0100 +++ xx/mm/slab.c 2004-03-21 15:21:41.000000000 +0100 @@ -453,10 +453,10 @@ static int slab_break_gfp_order = BREAK_ * global 'mem_map'. These are used to find the slab an obj belongs to. * With kfree(), these are used to find the cache which an obj belongs to. */ -#define SET_PAGE_CACHE(pg,x) ((pg)->list.next = (struct list_head *)(x)) -#define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->list.next) -#define SET_PAGE_SLAB(pg,x) ((pg)->list.prev = (struct list_head *)(x)) -#define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->list.prev) +#define SET_PAGE_CACHE(pg,x) ((pg)->lru.next = (struct list_head *)(x)) +#define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->lru.next) +#define SET_PAGE_SLAB(pg,x) ((pg)->lru.prev = (struct list_head *)(x)) +#define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->lru.prev) /* These are the default caches for kmalloc. 
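The pte_chain_alloc()/__pte_chain_free() pair deleted above implemented a one-deep per-CPU object cache in front of the slab allocator: page_add_rmap() runs under spinlocks, so its caller had to pre-allocate a pte_chain that was usually returned unused, and the cache absorbed that alloc-then-free cycle. A minimal userspace sketch of the same pattern (illustrative only: malloc/free stand in for kmem_cache_alloc/kmem_cache_free, a static slot stands in for the per-CPU variable, and struct obj is invented):

	#include <stdlib.h>

	struct obj { void *payload; };

	static struct obj *cached;	/* one-deep cache: at most one spare */

	static struct obj *obj_alloc(void)
	{
		struct obj *ret = cached;

		if (ret) {		/* fast path: reuse the spare */
			cached = NULL;
			return ret;
		}
		return malloc(sizeof(struct obj));	/* slow path */
	}

	static void obj_free(struct obj *o)
	{
		if (cached)		/* slot occupied: really free one */
			free(cached);
		cached = o;		/* keep the newest object as spare */
	}

The point of keeping the newest object, as the kernel code above does, is that it is the most likely to still be cache-hot when the next allocation comes along.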
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/slab.c xx/mm/slab.c
--- 2.6.5-rc2/mm/slab.c	2004-03-21 15:09:25.000000000 +0100
+++ xx/mm/slab.c	2004-03-21 15:21:41.000000000 +0100
@@ -453,10 +453,10 @@ static int slab_break_gfp_order = BREAK_
  * global 'mem_map'. These are used to find the slab an obj belongs to.
  * With kfree(), these are used to find the cache which an obj belongs to.
  */
-#define	SET_PAGE_CACHE(pg,x)  ((pg)->list.next = (struct list_head *)(x))
-#define	GET_PAGE_CACHE(pg)    ((kmem_cache_t *)(pg)->list.next)
-#define	SET_PAGE_SLAB(pg,x)   ((pg)->list.prev = (struct list_head *)(x))
-#define	GET_PAGE_SLAB(pg)     ((struct slab *)(pg)->list.prev)
+#define	SET_PAGE_CACHE(pg,x)  ((pg)->lru.next = (struct list_head *)(x))
+#define	GET_PAGE_CACHE(pg)    ((kmem_cache_t *)(pg)->lru.next)
+#define	SET_PAGE_SLAB(pg,x)   ((pg)->lru.prev = (struct list_head *)(x))
+#define	GET_PAGE_SLAB(pg)     ((struct slab *)(pg)->lru.prev)
 
 /* These are the default caches for kmalloc. Custom caches can have other sizes. */
 struct cache_sizes malloc_sizes[] = {
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/swap.c xx/mm/swap.c
--- 2.6.5-rc2/mm/swap.c	2004-03-21 15:09:25.000000000 +0100
+++ xx/mm/swap.c	2004-03-21 15:21:40.000000000 +0100
@@ -70,7 +70,7 @@ int rotate_reclaimable_page(struct page
 		list_add_tail(&page->lru, &zone->inactive_list);
 		inc_page_state(pgrotated);
 	}
-	if (!TestClearPageWriteback(page))
+	if (!test_clear_page_writeback(page))
 		BUG();
 	spin_unlock_irqrestore(&zone->lru_lock, flags);
 	return 0;
@@ -353,13 +353,21 @@ void pagevec_strip(struct pagevec *pvec)
  *
  * pagevec_lookup() returns the number of pages which were found.
  */
-unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
-		pgoff_t start, unsigned int nr_pages)
+unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t start, unsigned nr_pages)
 {
 	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
 	return pagevec_count(pvec);
 }
 
+unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t *index, int tag, unsigned nr_pages)
+{
+	pvec->nr = find_get_pages_tag(mapping, index, tag,
+					nr_pages, pvec->pages);
+	return pagevec_count(pvec);
+}
+
 #ifdef CONFIG_SMP
 
 /*
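pagevec_lookup_tag(), added above, is the tagged counterpart of pagevec_lookup(): it gathers up to nr_pages pages whose radix-tree slots carry the given tag, and find_get_pages_tag() advances *index past the last page returned, so repeated calls walk the whole mapping. A sketch of the intended calling pattern (assumptions: a dirty-page tag named PACACHE-style PAGECACHE_TAG_DIRTY as introduced by the tagged-lookup patches, and a hypothetical per-page helper write_out_page()):

	/* Sketch only: visit every page tagged dirty in a mapping. */
	static void write_tagged_pages(struct address_space *mapping)
	{
		struct pagevec pvec;
		pgoff_t index = 0;
		unsigned i;

		pagevec_init(&pvec, 0);
		while (pagevec_lookup_tag(&pvec, mapping, &index,
					PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) {
			for (i = 0; i < pagevec_count(&pvec); i++)
				write_out_page(pvec.pages[i]);	/* hypothetical */
			pagevec_release(&pvec);
		}
	}

Passing index by reference is what lets the loop terminate naturally: when no tagged page remains at or after *index, the lookup returns 0.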
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/swapfile.c xx/mm/swapfile.c
--- 2.6.5-rc2/mm/swapfile.c	2004-03-21 15:09:25.000000000 +0100
+++ xx/mm/swapfile.c	2004-03-21 15:21:42.000000000 +0100
@@ -21,7 +21,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 
@@ -247,16 +247,16 @@ static int exclusive_swap_page(struct pa
 	struct swap_info_struct * p;
 	swp_entry_t entry;
 
-	entry.val = page->index;
+	entry.val = page->private;
 	p = swap_info_get(entry);
 	if (p) {
 		/* Is the only swap cache user the cache itself? */
 		if (p->swap_map[swp_offset(entry)] == 1) {
 			/* Recheck the page count with the pagecache lock held.. */
-			spin_lock(&swapper_space.page_lock);
+			spin_lock_irq(&swapper_space.tree_lock);
 			if (page_count(page) - !!PagePrivate(page) == 2)
 				retval = 1;
-			spin_unlock(&swapper_space.page_lock);
+			spin_unlock_irq(&swapper_space.tree_lock);
 		}
 		swap_info_put(p);
 	}
@@ -315,7 +315,7 @@ int remove_exclusive_swap_page(struct pa
 	if (page_count(page) != 2) /* 2: us + cache */
 		return 0;
 
-	entry.val = page->index;
+	entry.val = page->private;
 	p = swap_info_get(entry);
 	if (!p)
 		return 0;
@@ -324,13 +324,13 @@ int remove_exclusive_swap_page(struct pa
 	retval = 0;
 	if (p->swap_map[swp_offset(entry)] == 1) {
 		/* Recheck the page count with the pagecache lock held.. */
-		spin_lock(&swapper_space.page_lock);
+		spin_lock_irq(&swapper_space.tree_lock);
 		if ((page_count(page) == 2) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
 			SetPageDirty(page);
 			retval = 1;
 		}
-		spin_unlock(&swapper_space.page_lock);
+		spin_unlock_irq(&swapper_space.tree_lock);
 	}
 	swap_info_put(p);
 
@@ -385,19 +385,20 @@ void free_swap_and_cache(swp_entry_t ent
 
 /* vma->vm_mm->page_table_lock is held */
 static void unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
-	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+	swp_entry_t entry, struct page *page)
 {
 	vma->vm_mm->rss++;
 	get_page(page);
 	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
-	*pte_chainp = page_add_rmap(page, dir, *pte_chainp);
+	BUG_ON(!vma->anon_vma);
+	page_add_rmap(page, vma, address, 1);
 	swap_free(entry);
 }
 
 /* vma->vm_mm->page_table_lock is held */
 static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
 	unsigned long address, unsigned long size, unsigned long offset,
-	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+	swp_entry_t entry, struct page *page)
 {
 	pte_t * pte;
 	unsigned long end;
@@ -423,7 +424,7 @@ static int unuse_pmd(struct vm_area_stru
 		 */
 		if (unlikely(pte_same(*pte, swp_pte))) {
 			unuse_pte(vma, offset + address, pte,
-					entry, page, pte_chainp);
+					entry, page);
 			pte_unmap(pte);
 			return 1;
 		}
@@ -437,7 +438,7 @@ static int unuse_pmd(struct vm_area_stru
 /* vma->vm_mm->page_table_lock is held */
 static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
 	unsigned long address, unsigned long size,
-	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+	swp_entry_t entry, struct page *page)
 {
 	pmd_t * pmd;
 	unsigned long offset, end;
@@ -459,7 +460,7 @@ static int unuse_pgd(struct vm_area_stru
 		BUG();
 	do {
 		if (unuse_pmd(vma, pmd, address, end - address,
-				offset, entry, page, pte_chainp))
+				offset, entry, page))
 			return 1;
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
@@ -469,7 +470,7 @@ static int unuse_pgd(struct vm_area_stru
 /* vma->vm_mm->page_table_lock is held */
 static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
-	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+	swp_entry_t entry, struct page *page)
 {
 	unsigned long start = vma->vm_start, end = vma->vm_end;
 
@@ -477,7 +478,7 @@ static int unuse_vma(struct vm_area_stru
 		BUG();
 	do {
 		if (unuse_pgd(vma, pgdir, start, end - start,
-				entry, page, pte_chainp))
+				entry, page))
 			return 1;
 		start = (start + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
@@ -489,11 +490,6 @@ static int unuse_process(struct mm_struc
 	swp_entry_t entry, struct page* page)
 {
 	struct vm_area_struct* vma;
-	struct pte_chain *pte_chain;
-
-	pte_chain = pte_chain_alloc(GFP_KERNEL);
-	if (!pte_chain)
-		return -ENOMEM;
 
 	/*
 	 * Go through process' page directory.
@@ -501,11 +497,10 @@ static int unuse_process(struct mm_struc
 	 */
 	spin_lock(&mm->page_table_lock);
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
-		if (unuse_vma(vma, pgd, entry, page, &pte_chain))
+		if (unuse_vma(vma, pgd, entry, page))
 			break;
 	}
 	spin_unlock(&mm->page_table_lock);
-	pte_chain_free(pte_chain);
 	return 0;
 }
 
@@ -998,7 +993,7 @@ int page_queue_congested(struct page *pa
 	bdi = page->mapping->backing_dev_info;
 	if (PageSwapCache(page)) {
-		swp_entry_t entry = { .val = page->index };
+		swp_entry_t entry = { .val = page->private };
 		struct swap_info_struct *sis;
 
 		sis = get_swap_info_struct(swp_type(entry));
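The recurring page->index to page->private conversion in this file is not a rename for its own sake: with the anonymous rmap changes, index is needed to hold the page's offset within its vma/anon_vma, so the swp_entry_t of a swap-cache page moves into the otherwise unused private field. An accessor in the spirit of these call sites (a sketch, not part of the patch; the name page_swp_entry is invented):

	/* Sketch: read the swap entry of a swap cache page from ->private. */
	static inline swp_entry_t page_swp_entry(struct page *page)
	{
		swp_entry_t entry;

		BUG_ON(!PageSwapCache(page));
		entry.val = page->private;
		return entry;
	}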
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/swap_state.c xx/mm/swap_state.c
--- 2.6.5-rc2/mm/swap_state.c	2003-08-16 15:00:13.000000000 +0200
+++ xx/mm/swap_state.c	2004-03-21 15:21:42.000000000 +0100
@@ -25,11 +25,7 @@ extern struct address_space_operations s
 
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
-	.page_lock	= SPIN_LOCK_UNLOCKED,
-	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
-	.dirty_pages	= LIST_HEAD_INIT(swapper_space.dirty_pages),
-	.io_pages	= LIST_HEAD_INIT(swapper_space.io_pages),
-	.locked_pages	= LIST_HEAD_INIT(swapper_space.locked_pages),
+	.tree_lock	= SPIN_LOCK_UNLOCKED,
 	.a_ops		= &swap_aops,
 	.backing_dev_info = &swap_backing_dev_info,
 	.i_mmap		= LIST_HEAD_INIT(swapper_space.i_mmap),
@@ -63,8 +59,8 @@ static int add_to_swap_cache(struct page
 {
 	int error;
 
-	if (page->mapping)
-		BUG();
+	BUG_ON(page_mapping(page));
+	BUG_ON(PageSwapCache(page));
 	if (!swap_duplicate(entry)) {
 		INC_CACHE_INFO(noent_race);
 		return -ENOENT;
@@ -74,15 +70,14 @@ static int add_to_swap_cache(struct page
 	 * Anon pages are already on the LRU, we don't run lru_cache_add here.
 	 */
 	if (error != 0) {
+		BUG_ON(PageSwapCache(page));
 		swap_free(entry);
 		if (error == -EEXIST)
 			INC_CACHE_INFO(exist_race);
 		return error;
 	}
-	if (!PageLocked(page))
-		BUG();
-	if (!PageSwapCache(page))
-		BUG();
+	BUG_ON(!PageLocked(page));
+	BUG_ON(!PageSwapCache(page));
 	INC_CACHE_INFO(add_total);
 	return 0;
 }
@@ -149,7 +144,7 @@ int add_to_swap(struct page * page)
 	switch (err) {
 	case 0:				/* Success */
 		SetPageUptodate(page);
-		ClearPageDirty(page);
+		__clear_page_dirty(page);
 		set_page_dirty(page);
 		INC_CACHE_INFO(add_total);
 		return 1;
@@ -180,11 +175,11 @@ void delete_from_swap_cache(struct page
 	BUG_ON(PageWriteback(page));
 	BUG_ON(PagePrivate(page));
 
-	entry.val = page->index;
+	entry.val = page->private;
 
-	spin_lock(&swapper_space.page_lock);
+	spin_lock_irq(&swapper_space.tree_lock);
 	__delete_from_swap_cache(page);
-	spin_unlock(&swapper_space.page_lock);
+	spin_unlock_irq(&swapper_space.tree_lock);
 
 	swap_free(entry);
 	page_cache_release(page);
@@ -192,11 +187,14 @@ void delete_from_swap_cache(struct page
 
 int move_to_swap_cache(struct page *page, swp_entry_t entry)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = page_mapping(page);
 	int err;
 
-	spin_lock(&swapper_space.page_lock);
-	spin_lock(&mapping->page_lock);
+	BUG_ON(PageAnon(page));
+	BUG_ON(PageSwapCache(page));
+
+	spin_lock_irq(&swapper_space.tree_lock);
+	spin_lock(&mapping->tree_lock);
 
 	err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
 	if (!err) {
@@ -204,13 +202,12 @@ int move_to_swap_cache(struct page *page
 		___add_to_page_cache(page, &swapper_space, entry.val);
 	}
 
-	spin_unlock(&mapping->page_lock);
-	spin_unlock(&swapper_space.page_lock);
+	spin_unlock(&mapping->tree_lock);
+	spin_unlock_irq(&swapper_space.tree_lock);
 
 	if (!err) {
 		if (!swap_duplicate(entry))
 			BUG();
-		/* shift page from clean_pages to dirty_pages list */
 		BUG_ON(PageDirty(page));
 		set_page_dirty(page);
 		INC_CACHE_INFO(add_total);
@@ -229,10 +226,10 @@ int move_from_swap_cache(struct page *pa
 	BUG_ON(PageWriteback(page));
 	BUG_ON(PagePrivate(page));
 
-	entry.val = page->index;
+	entry.val = page->private;
 
-	spin_lock(&swapper_space.page_lock);
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&swapper_space.tree_lock);
+	spin_lock(&mapping->tree_lock);
 
 	err = radix_tree_insert(&mapping->page_tree, index, page);
 	if (!err) {
@@ -240,13 +237,12 @@ int move_from_swap_cache(struct page *pa
 		___add_to_page_cache(page, mapping, index);
 	}
 
-	spin_unlock(&mapping->page_lock);
-	spin_unlock(&swapper_space.page_lock);
+	spin_unlock(&mapping->tree_lock);
+	spin_unlock_irq(&swapper_space.tree_lock);
 
 	if (!err) {
 		swap_free(entry);
-		/* shift page from clean_pages to dirty_pages list */
-		ClearPageDirty(page);
+		__clear_page_dirty(page);
 		set_page_dirty(page);
 	}
 	return err;
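move_to_swap_cache() and move_from_swap_cache() now take both tree locks in one fixed order, swapper_space.tree_lock outermost with interrupts disabled, apparently because ->tree_lock must be irq-safe once writeback completion (which can run in interrupt context) manipulates radix-tree tags under it; test_clear_page_writeback() in mm/swap.c above points the same way. The nesting discipline in outline (context only, not compilable on its own):

	spin_lock_irq(&swapper_space.tree_lock);	/* outer lock disables irqs */
	spin_lock(&mapping->tree_lock);			/* inner lock nests, irqs already off */

	/* ... move the page between the two radix trees ... */

	spin_unlock(&mapping->tree_lock);
	spin_unlock_irq(&swapper_space.tree_lock);	/* re-enable irqs last */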
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/truncate.c xx/mm/truncate.c
--- 2.6.5-rc2/mm/truncate.c	2004-03-21 15:09:25.000000000 +0100
+++ xx/mm/truncate.c	2004-03-21 15:21:40.000000000 +0100
@@ -62,7 +62,7 @@ truncate_complete_page(struct address_sp
  * This is for invalidate_inode_pages().  That function can be called at
  * any time, and is not supposed to throw away dirty pages.  But pages can
  * be marked dirty at any time too.  So we re-check the dirtiness inside
- * ->page_lock.  That provides exclusion against the __set_page_dirty
+ * ->tree_lock.  That provides exclusion against the __set_page_dirty
  * functions.
  */
 static int
@@ -74,13 +74,13 @@ invalidate_complete_page(struct address_
 	if (PagePrivate(page) && !try_to_release_page(page, 0))
 		return 0;
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	if (PageDirty(page)) {
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		return 0;
 	}
 	__remove_from_page_cache(page);
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	ClearPageUptodate(page);
 	page_cache_release(page);	/* pagecache ref */
 	return 1;
 }
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.5-rc2/mm/vmscan.c xx/mm/vmscan.c
--- 2.6.5-rc2/mm/vmscan.c	2004-03-21 15:09:25.000000000 +0100
+++ xx/mm/vmscan.c	2004-03-21 15:21:42.000000000 +0100
@@ -28,7 +28,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -173,10 +173,10 @@ static int shrink_slab(unsigned long sca
 	return 0;
 }
 
-/* Must be called with page's pte_chain_lock held. */
+/* Must be called with page's page_map_lock held. */
 static inline int page_mapping_inuse(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = page_mapping(page);
 
 	/* Page is in somebody's page tables. */
 	if (page_mapped(page))
@@ -233,7 +233,7 @@ static void handle_write_error(struct ad
 		struct page *page, int error)
 {
 	lock_page(page);
-	if (page->mapping == mapping) {
+	if (page_mapping(page) == mapping) {
 		if (error == -ENOSPC)
 			set_bit(AS_ENOSPC, &mapping->flags);
 		else
@@ -277,15 +277,15 @@ shrink_list(struct list_head *page_list,
 		if (PageWriteback(page))
 			goto keep_locked;
 
-		pte_chain_lock(page);
+		page_map_lock(page);
 		referenced = page_referenced(page);
 		if (referenced && page_mapping_inuse(page)) {
 			/* In active use or really unfreeable.  Activate it. */
-			pte_chain_unlock(page);
+			page_map_unlock(page);
 			goto activate_locked;
 		}
 
-		mapping = page->mapping;
+		mapping = page_mapping(page);
 
 #ifdef CONFIG_SWAP
 		/*
@@ -295,11 +295,11 @@ shrink_list(struct list_head *page_list,
 		 * XXX: implement swap clustering ?
 		 */
 		if (page_mapped(page) && !mapping && !PagePrivate(page)) {
-			pte_chain_unlock(page);
+			page_map_unlock(page);
 			if (!add_to_swap(page))
 				goto activate_locked;
-			pte_chain_lock(page);
-			mapping = page->mapping;
+			page_map_lock(page);
+			mapping = page_mapping(page);
 		}
 #endif /* CONFIG_SWAP */
 
@@ -313,16 +313,16 @@ shrink_list(struct list_head *page_list,
 		if (page_mapped(page) && mapping) {
 			switch (try_to_unmap(page)) {
 			case SWAP_FAIL:
-				page_map_unlock(page);
+				page_map_unlock(page);
 				goto activate_locked;
 			case SWAP_AGAIN:
-				pte_chain_unlock(page);
+				page_map_unlock(page);
 				goto keep_locked;
 			case SWAP_SUCCESS:
 				; /* try to free the page below */
 			}
 		}
-		pte_chain_unlock(page);
+		page_map_unlock(page);
 
 		/*
 		 * If the page is dirty, only perform writeback if that write
@@ -354,8 +354,7 @@ shrink_list(struct list_head *page_list,
 				goto keep_locked;
 			if (!may_write_to_queue(mapping->backing_dev_info))
 				goto keep_locked;
-			spin_lock(&mapping->page_lock);
-			if (test_clear_page_dirty(page)) {
+			if (clear_page_dirty_for_io(page)) {
 				int res;
 				struct writeback_control wbc = {
 					.sync_mode = WB_SYNC_NONE,
@@ -364,9 +363,6 @@ shrink_list(struct list_head *page_list,
 					.for_reclaim = 1,
 				};
 
-				list_move(&page->list, &mapping->locked_pages);
-				spin_unlock(&mapping->page_lock);
-
 				SetPageReclaim(page);
 				res = mapping->a_ops->writepage(page, &wbc);
 				if (res < 0)
@@ -381,7 +377,6 @@ shrink_list(struct list_head *page_list,
 				}
 				goto keep;
 			}
-			spin_unlock(&mapping->page_lock);
 		}
 
 		/*
@@ -415,7 +410,7 @@ shrink_list(struct list_head *page_list,
 		if (!mapping)
 			goto keep_locked;	/* truncate got there first */
 
-		spin_lock(&mapping->page_lock);
+		spin_lock_irq(&mapping->tree_lock);
 
 		/*
 		 * The non-racy check for busy page.  It is critical to check
@@ -423,15 +418,15 @@ shrink_list(struct list_head *page_list,
 		 * not in use by anybody. 	(pagecache + us == 2)
 		 */
 		if (page_count(page) != 2 || PageDirty(page)) {
-			spin_unlock(&mapping->page_lock);
+			spin_unlock_irq(&mapping->tree_lock);
 			goto keep_locked;
 		}
 
 #ifdef CONFIG_SWAP
 		if (PageSwapCache(page)) {
-			swp_entry_t swap = { .val = page->index };
+			swp_entry_t swap = { .val = page->private };
 			__delete_from_swap_cache(page);
-			spin_unlock(&mapping->page_lock);
+			spin_unlock_irq(&mapping->tree_lock);
 			swap_free(swap);
 			__put_page(page);	/* The pagecache ref */
 			goto free_it;
@@ -439,7 +434,7 @@ shrink_list(struct list_head *page_list,
 #endif /* CONFIG_SWAP */
 
 		__remove_from_page_cache(page);
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		__put_page(page);
 
 free_it:
@@ -658,19 +653,19 @@ refill_inactive_zone(struct zone *zone,
 				list_add(&page->lru, &l_active);
 				continue;
 			}
-			pte_chain_lock(page);
+			page_map_lock(page);
 			if (page_referenced(page)) {
-				pte_chain_unlock(page);
+				page_map_unlock(page);
 				list_add(&page->lru, &l_active);
 				continue;
 			}
-			pte_chain_unlock(page);
+			page_map_unlock(page);
 		}
 		/*
 		 * FIXME: need to consider page_count(page) here if/when we
 		 * reap orphaned pages via the LRU (Daniel's locking stuff)
 		 */
-		if (total_swap_pages == 0 && !page->mapping &&
+		if (total_swap_pages == 0 && !page_mapping(page) &&
 				!PagePrivate(page)) {
 			list_add(&page->lru, &l_active);
 			continue;