--- linux-2.6.4-rc1/arch/alpha/kernel/alpha_ksyms.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/alpha/kernel/alpha_ksyms.c 2004-02-29 13:09:03.000000000 -0800 @@ -35,9 +35,6 @@ #include #include -#define __KERNEL_SYSCALLS__ -#include - extern struct hwrpb_struct *hwrpb; extern void dump_thread(struct pt_regs *, struct user *); extern spinlock_t rtc_lock; --- linux-2.6.4-rc1/arch/alpha/kernel/smp.c 2003-10-08 15:07:08.000000000 -0700 +++ 25/arch/alpha/kernel/smp.c 2004-02-29 13:09:04.000000000 -0800 @@ -39,9 +39,6 @@ #include #include -#define __KERNEL_SYSCALLS__ -#include - #include "proto.h" #include "irq_impl.h" --- linux-2.6.4-rc1/arch/arm26/mm/Makefile 2003-06-14 12:18:01.000000000 -0700 +++ 25/arch/arm26/mm/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,12 +1,5 @@ # # Makefile for the linux arm26-specific parts of the memory manager. # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# -# Note 2! The CFLAGS definition is now in the main makefile... - -# Object file lists. 
obj-y := init.o extable.o proc-funcs.o mm-memc.o fault.o --- linux-2.6.4-rc1/arch/arm/common/sa1111-pcibuf.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/arm/common/sa1111-pcibuf.c 2004-02-29 13:07:52.000000000 -0800 @@ -457,8 +457,8 @@ void sa1111_unmap_sg(struct device *dev, local_irq_restore(flags); } -void sa1111_dma_sync_single(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir) +void sa1111_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir) { unsigned long flags; @@ -472,8 +472,44 @@ void sa1111_dma_sync_single(struct devic local_irq_restore(flags); } -void sa1111_dma_sync_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir) +void sa1111_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir) +{ + unsigned long flags; + + dev_dbg(dev, "%s(ptr=%08lx,size=%d,dir=%x)\n", + __func__, dma_addr, size, dir); + + local_irq_save(flags); + + sync_single(dev, dma_addr, size, dir); + + local_irq_restore(flags); +} + +void sa1111_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir) +{ + unsigned long flags; + int i; + + dev_dbg(dev, "%s(sg=%p,nents=%d,dir=%x)\n", + __func__, sg, nents, dir); + + local_irq_save(flags); + + for (i = 0; i < nents; i++, sg++) { + dma_addr_t dma_addr = sg->dma_address; + unsigned int length = sg->length; + + sync_single(dev, dma_addr, length, dir); + } + + local_irq_restore(flags); +} + +void sa1111_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir) { unsigned long flags; int i; @@ -497,8 +533,10 @@ EXPORT_SYMBOL(sa1111_map_single); EXPORT_SYMBOL(sa1111_unmap_single); EXPORT_SYMBOL(sa1111_map_sg); EXPORT_SYMBOL(sa1111_unmap_sg); -EXPORT_SYMBOL(sa1111_dma_sync_single); -EXPORT_SYMBOL(sa1111_dma_sync_sg); +EXPORT_SYMBOL(sa1111_dma_sync_single_for_cpu); 
+EXPORT_SYMBOL(sa1111_dma_sync_single_for_device); +EXPORT_SYMBOL(sa1111_dma_sync_sg_for_cpu); +EXPORT_SYMBOL(sa1111_dma_sync_sg_for_device); /* **************************************** */ --- linux-2.6.4-rc1/arch/arm/kernel/Makefile 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/arm/kernel/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -29,7 +29,7 @@ obj-$(CONFIG_DEBUG_LL) += debug.o extra-y := $(head-y) init_task.o vmlinux.lds.s -# Spell out some dependencies that `make dep' doesn't spot +# Spell out some dependencies that aren't automatically figured out $(obj)/entry-armv.o: $(obj)/entry-header.S include/asm-arm/constants.h $(obj)/entry-common.o: $(obj)/entry-header.S include/asm-arm/constants.h \ $(obj)/calls.S --- linux-2.6.4-rc1/arch/arm/kernel/time.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/arm/kernel/time.c 2004-02-29 13:07:44.000000000 -0800 @@ -178,7 +178,7 @@ static int __init leds_init(void) int ret; ret = sysdev_class_register(&leds_sysclass); if (ret == 0) - ret = sys_device_register(&leds_device); + ret = sysdev_register(&leds_device); return ret; } --- linux-2.6.4-rc1/arch/arm/mach-integrator/integrator_ap.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/arm/mach-integrator/integrator_ap.c 2004-02-29 13:07:44.000000000 -0800 @@ -173,7 +173,7 @@ static int __init irq_init_sysfs(void) { int ret = sysdev_class_register(&irq_class); if (ret == 0) - ret = sys_device_register(&irq_device); + ret = sysdev_register(&irq_device); return ret; } --- linux-2.6.4-rc1/arch/arm/mach-sa1100/irq.c 2003-06-22 12:04:43.000000000 -0700 +++ 25/arch/arm/mach-sa1100/irq.c 2004-02-29 13:07:44.000000000 -0800 @@ -278,7 +278,7 @@ static struct sys_device sa1100irq_devic static int __init sa1100irq_init_devicefs(void) { sysdev_class_register(&sa1100irq_sysclass); - return sys_device_register(&sa1100irq_device); + return sysdev_register(&sa1100irq_device); } device_initcall(sa1100irq_init_devicefs); --- linux-2.6.4-rc1/arch/cris/kernel/process.c 2003-10-08 
15:07:08.000000000 -0700 +++ 25/arch/cris/kernel/process.c 2004-02-29 13:09:04.000000000 -0800 @@ -91,8 +91,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#define __KERNEL_SYSCALLS__ - #include #include #include --- linux-2.6.4-rc1/arch/h8300/kernel/syscalls.S 2003-08-22 19:23:40.000000000 -0700 +++ 25/arch/h8300/kernel/syscalls.S 2004-02-29 13:08:05.000000000 -0800 @@ -116,7 +116,7 @@ SYMBOL_NAME_LABEL(sys_call_table) .long SYMBOL_NAME(sys_ni_syscall) /* old profil syscall holder */ .long SYMBOL_NAME(sys_statfs) .long SYMBOL_NAME(sys_fstatfs) /* 100 */ - .long SYMBOL_NAME(sys_ioperm) + .long SYMBOL_NAME(sys_ni_syscall) /* ioperm for i386 */ .long SYMBOL_NAME(sys_socketcall) .long SYMBOL_NAME(sys_syslog) .long SYMBOL_NAME(sys_setitimer) --- linux-2.6.4-rc1/arch/h8300/kernel/sys_h8300.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/h8300/kernel/sys_h8300.c 2004-02-29 13:08:05.000000000 -0800 @@ -260,11 +260,6 @@ asmlinkage int sys_ipc (uint call, int f return -EINVAL; } -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on) -{ - return -ENOSYS; -} - /* sys_cacheflush -- no support. */ asmlinkage int sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len) --- linux-2.6.4-rc1/arch/h8300/mm/Makefile 2004-02-03 20:42:34.000000000 -0800 +++ 25/arch/h8300/mm/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,5 @@ # # Makefile for the linux m68k-specific parts of the memory manager. # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# -# Note 2! The CFLAGS definition is now in the main makefile... 
obj-y := init.o fault.o memory.o kmap.o --- linux-2.6.4-rc1/arch/h8300/platform/h8300h/aki3068net/Makefile 2003-08-08 22:55:10.000000000 -0700 +++ 25/arch/h8300/platform/h8300h/aki3068net/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the linux kernel. # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# extra-y := crt0_ram.o obj-y := timer.o --- linux-2.6.4-rc1/arch/h8300/platform/h8300h/generic/Makefile 2003-08-08 22:55:10.000000000 -0700 +++ 25/arch/h8300/platform/h8300h/generic/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the linux kernel. # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := timer.o extra-y = crt0_$(MODEL).o --- linux-2.6.4-rc1/arch/h8300/platform/h8300h/h8max/Makefile 2003-08-08 22:55:10.000000000 -0700 +++ 25/arch/h8300/platform/h8300h/h8max/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the linux kernel. # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# extra-y := crt0_ram.o obj-y := timer.o --- linux-2.6.4-rc1/arch/h8300/platform/h8300h/Makefile 2003-06-14 12:18:29.000000000 -0700 +++ 25/arch/h8300/platform/h8300h/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -6,10 +6,6 @@ #VPATH := $(VPATH):$(BOARD) -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# .S.o: $(CC) -D__ASSEMBLY__ $(AFLAGS) -I. 
-c $< -o $*.o --- linux-2.6.4-rc1/arch/h8300/platform/h8s/edosk2674/Makefile 2003-08-08 22:55:10.000000000 -0700 +++ 25/arch/h8300/platform/h8s/edosk2674/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the linux kernel. # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# extra-y := crt0_ram.o obj-y := timer.o --- linux-2.6.4-rc1/arch/h8300/platform/h8s/generic/Makefile 2003-08-08 22:55:10.000000000 -0700 +++ 25/arch/h8300/platform/h8s/generic/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the linux kernel. # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# extra-y = crt0_$(MODEL).o obj-y := timer.o --- linux-2.6.4-rc1/arch/h8300/platform/h8s/Makefile 2003-08-08 22:55:10.000000000 -0700 +++ 25/arch/h8300/platform/h8s/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -6,10 +6,6 @@ #VPATH := $(VPATH):$(BOARD) -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# .S.o: $(CC) -D__ASSEMBLY__ $(AFLAGS) -I. -c $< -o $*.o --- linux-2.6.4-rc1/arch/i386/boot/setup.S 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/boot/setup.S 2004-02-29 13:09:29.000000000 -0800 @@ -164,7 +164,7 @@ cmd_line_ptr: .long 0 # (Header versio # can be located anywhere in # low memory 0x10000 or higher. 
-ramdisk_max: .long MAXMEM-1 # (Header version 0x0203 or later) +ramdisk_max: .long __MAXMEM-1 # (Header version 0x0203 or later) # The highest safe address for # the contents of an initrd --- linux-2.6.4-rc1/arch/i386/boot/tools/build.c 2003-06-14 12:18:21.000000000 -0700 +++ 25/arch/i386/boot/tools/build.c 2004-02-29 13:08:58.000000000 -0800 @@ -150,10 +150,8 @@ int main(int argc, char ** argv) sz = sb.st_size; fprintf (stderr, "System is %d kB\n", sz/1024); sys_size = (sz + 15) / 16; - /* 0x40000*16 = 4.0 MB, reasonable estimate for the current maximum */ - if (sys_size > (is_big_kernel ? 0x40000 : DEF_SYSSIZE)) - die("System is too big. Try using %smodules.", - is_big_kernel ? "" : "bzImage or "); + if (!is_big_kernel && sys_size > DEF_SYSSIZE) + die("System is too big. Try using bzImage or modules."); while (sz > 0) { int l, n; --- linux-2.6.4-rc1/arch/i386/defconfig 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/i386/defconfig 2004-02-29 13:08:58.000000000 -0800 @@ -1195,5 +1195,4 @@ CONFIG_CRC32=y CONFIG_X86_SMP=y CONFIG_X86_HT=y CONFIG_X86_BIOS_REBOOT=y -CONFIG_X86_TRAMPOLINE=y CONFIG_PC=y --- linux-2.6.4-rc1/arch/i386/Kconfig 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/Kconfig 2004-02-29 13:09:29.000000000 -0800 @@ -421,6 +421,54 @@ config X86_OOSTORE depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR default y +config X86_4G + bool "4 GB kernel-space and 4 GB user-space virtual memory support" + help + This option is only useful for systems that have more than 1 GB + of RAM. + + The default kernel VM layout leaves 1 GB of virtual memory for + kernel-space mappings, and 3 GB of VM for user-space applications. + This option ups both the kernel-space VM and the user-space VM to + 4 GB. + + The cost of this option is additional TLB flushes done at + system-entry points that transition from user-mode into kernel-mode. + I.e. system calls and page faults, and IRQs that interrupt user-mode + code. 
There's also additional overhead to kernel operations that copy + memory to/from user-space. The overhead from this is hard to tell and + depends on the workload - it can be anything from no visible overhead + to 20-30% overhead. A good rule of thumb is to count on a runtime + overhead of 20%. + + The upside is the much increased kernel-space VM, which more than + quadruples the maximum amount of RAM supported. Kernels compiled with + this option boot on 64GB of RAM and still have more than 3.1 GB of + 'lowmem' left. Another bonus is that highmem IO bouncing decreases, + if used with drivers that still use bounce-buffers. + + There's also a 33% increase in user-space VM size - database + applications might see a boost from this. + + But the cost of the TLB flushes and the runtime overhead has to be + weighed against the bonuses offered by the larger VM spaces. The + dividing line depends on the actual workload - there might be 4 GB + systems that benefit from this option. Systems with less than 4 GB + of RAM will rarely see a benefit from this option - but it's not + out of the question; the exact circumstances have to be considered. + +config X86_SWITCH_PAGETABLES + def_bool X86_4G + +config X86_4G_VM_LAYOUT + def_bool X86_4G + +config X86_UACCESS_INDIRECT + def_bool X86_4G + +config X86_HIGH_ENTRY + def_bool X86_4G + config HPET_TIMER bool "HPET Timer Support" help @@ -478,6 +526,16 @@ config NR_CPUS This is purely to save memory - each supported CPU adds approximately eight kilobytes to the kernel image. +config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on SMP + default off + help + SMT scheduler support improves the CPU scheduler's decision making + when dealing with Intel Pentium 4 chips with HyperThreading at a + cost of slightly increased overhead in some places. If unsure say + N here. + config PREEMPT bool "Preemptible Kernel" help @@ -552,7 +610,7 @@ config X86_MCE the 386 and 486, so nearly everyone can say Y here. 
config X86_MCE_NONFATAL - bool "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" + tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" depends on X86_MCE help Enabling this feature starts a timer that triggers every 5 seconds which @@ -1068,12 +1126,16 @@ config PCI_GOBIOS PCI-based systems don't have any BIOS at all. Linux can also try to detect the PCI hardware directly without using the BIOS. - With this option, you can specify how Linux should detect the PCI - devices. If you choose "BIOS", the BIOS will be used, if you choose - "Direct", the BIOS won't be used, and if you choose "Any", the - kernel will try the direct access method and falls back to the BIOS - if that doesn't work. If unsure, go with the default, which is - "Any". + With this option, you can specify how Linux should detect the + PCI devices. If you choose "BIOS", the BIOS will be used, + if you choose "Direct", the BIOS won't be used, and if you + choose "MMConfig", then PCI Express MMCONFIG will be used. + If you choose "Any", the kernel will try MMCONFIG, then the + direct access method and falls back to the BIOS if that doesn't + work. If unsure, go with the default, which is "Any". + +config PCI_GOMMCONFIG + bool "MMConfig" config PCI_GODIRECT bool "Direct" @@ -1093,6 +1155,12 @@ config PCI_DIRECT depends on PCI && ((PCI_GODIRECT || PCI_GOANY) || X86_VISWS) default y +config PCI_MMCONFIG + bool + depends on PCI && (PCI_GOMMCONFIG || PCI_GOANY) + select ACPI_BOOT + default y + config PCI_USE_VECTOR bool "Vector-based interrupt indexing (MSI)" depends on X86_LOCAL_APIC && X86_IO_APIC @@ -1231,17 +1299,6 @@ config DEBUG_SLAB allocation as well as poisoning memory on free to catch use of freed memory. -config DEBUG_IOVIRT - bool "Memory mapped I/O debugging" - depends on DEBUG_KERNEL - help - Say Y here to get warned whenever an attempt is made to do I/O on - obviously invalid addresses such as those generated when ioremap() - calls are forgotten. 
Memory mapped I/O will go through an extra - check to catch access to unmapped ISA addresses, an access method - that can still be used by old drivers that are being ported from - 2.0/2.2. - config MAGIC_SYSRQ bool "Magic SysRq key" depends on DEBUG_KERNEL @@ -1273,6 +1330,15 @@ config DEBUG_PAGEALLOC This results in a large slowdown, but helps to find certain types of memory corruptions. +config SPINLINE + bool "Spinlock inlining" + depends on DEBUG_KERNEL + help + This will change spinlocks from out of line to inline, making them + account cost to the callers in readprofile, rather than the lock + itself (as ".text.lock.filename"). This can be helpful for finding + the callers of locks. + config DEBUG_HIGHMEM bool "Highmem debugging" depends on DEBUG_KERNEL && HIGHMEM @@ -1289,20 +1355,208 @@ config DEBUG_INFO Say Y here only if you plan to use gdb to debug the kernel. If you don't debug the kernel, you can say N. +config LOCKMETER + bool "Kernel lock metering" + depends on SMP + help + Say Y to enable kernel lock metering, which adds overhead to SMP locks, + but allows you to see various statistics using the lockstat command. + config DEBUG_SPINLOCK_SLEEP bool "Sleep-inside-spinlock checking" help If you say Y here, various routines which may sleep will become very noisy if they are called with a spinlock held. +config KGDB + bool "Include kgdb kernel debugger" + depends on DEBUG_KERNEL + help + If you say Y here, the system will be compiled with the debug + option (-g) and a debugging stub will be included in the + kernel. This stub communicates with gdb on another (host) + computer via a serial port. The host computer should have + access to the kernel binary file (vmlinux) and a serial port + that is connected to the target machine. Gdb can be made to + configure the serial port or you can use stty and setserial to + do this. See the 'target' command in gdb. This option also + configures in the ability to request a breakpoint early in the + boot process. 
To request the breakpoint just include 'kgdb' + as a boot option when booting the target machine. The system + will then break as soon as it looks at the boot options. This + option also installs a breakpoint in panic and sends any + kernel faults to the debugger. For more information see the + Documentation/i386/kgdb/kgdb.txt file. + +choice + depends on KGDB + prompt "Debug serial port BAUD" + default KGDB_115200BAUD + help + Gdb and the kernel stub need to agree on the baud rate to be + used. Some systems (x86 family at this writing) allow this to + be configured. + +config KGDB_9600BAUD + bool "9600" + +config KGDB_19200BAUD + bool "19200" + +config KGDB_38400BAUD + bool "38400" + +config KGDB_57600BAUD + bool "57600" + +config KGDB_115200BAUD + bool "115200" +endchoice + +config KGDB_PORT + hex "hex I/O port address of the debug serial port" + depends on KGDB + default 3f8 + help + Some systems (x86 family at this writing) allow the port + address to be configured. The number entered is assumed to be + hex, don't put 0x in front of it. The standard addresses are: + COM1 3f8, irq 4 and COM2 2f8, irq 3. Setserial /dev/ttySx + will tell you what you have. It is good to test the serial + connection with a live system before trying to debug. + +config KGDB_IRQ + int "IRQ of the debug serial port" + depends on KGDB + default 4 + help + This is the irq for the debug port. If everything is working + correctly and the kernel has interrupts on, a control C to the + port should cause a break into the kernel debug stub. + +config DEBUG_INFO + bool + depends on KGDB + default y + +config KGDB_MORE + bool "Add any additional compile options" + depends on KGDB + default n + help + Saying yes here turns on the ability to enter additional + compile options. + + +config KGDB_OPTIONS + depends on KGDB_MORE + string "Additional compile arguments" + default "-O1" + help + This option allows you to enter additional compile options for + the whole kernel compile. 
Each platform will have a default + that seems right for it. For example on PPC "-ggdb -O1", and + for i386 "-O1". Note that by configuring KGDB "-g" is already + turned on. In addition, on i386 platforms + "-fomit-frame-pointer" is deleted from the standard compile + options. + +config NO_KGDB_CPUS + int "Number of CPUs" + depends on KGDB && SMP + default NR_CPUS + help + + This option sets the number of cpus for kgdb ONLY. It is used + to prune some internal structures so they look "nice" when + displayed with gdb. This is to overcome possibly larger + numbers that may have been entered above. Enter the real + number to get nice clean kgdb_info displays. + +config KGDB_TS + bool "Enable kgdb time stamp macros?" + depends on KGDB + default n + help + Kgdb event macros allow you to instrument your code with calls + to the kgdb event recording function. The event log may be + examined with gdb at a break point. Turning on this + capability also allows you to choose how many events to + keep. Kgdb always keeps the latest events. + +choice + depends on KGDB_TS + prompt "Max number of time stamps to save?" + default KGDB_TS_128 + +config KGDB_TS_64 + bool "64" + +config KGDB_TS_128 + bool "128" + +config KGDB_TS_256 + bool "256" + +config KGDB_TS_512 + bool "512" + +config KGDB_TS_1024 + bool "1024" + +endchoice + +config STACK_OVERFLOW_TEST + bool "Turn on kernel stack overflow testing?" + depends on KGDB + default n + help + This option enables code in the front line interrupt handlers + to check for kernel stack overflow on interrupts and system + calls. This is part of the kgdb code on x86 systems. + +config KGDB_CONSOLE + bool "Enable serial console thru kgdb port" + depends on KGDB + default n + help + This option enables the command line "console=kgdb" option. + When the system is booted with this option in the command line + all kernel printk output is sent to gdb (as well as to other + consoles). For this to work gdb must be connected. 
For this + reason, this command line option will generate a breakpoint if + gdb has not yet connected. After the gdb continue command is + given all pent up console output will be printed by gdb on the + host machine. Neither this option, nor KGDB require the + serial driver to be configured. + +config KGDB_SYSRQ + bool "Turn on SysRq 'G' command to do a break?" + depends on KGDB + default y + help + This option includes an option in the SysRq code that allows + you to enter SysRq G which generates a breakpoint to the KGDB + stub. This will work if the keyboard is alive and can + interrupt the system. Because of constraints on when the + serial port interrupt can be enabled, this code may allow you + to interrupt the system before the serial port control C is + available. Just say yes here. + config FRAME_POINTER bool "Compile the kernel with frame pointers" + default KGDB help If you say Y here the resulting kernel image will be slightly larger and slower, but it will give very useful debugging information. If you don't debug the kernel, you can say N, but we may not be able to solve problems without frame pointers. 
+config MAGIC_SYSRQ + bool + depends on KGDB_SYSRQ + default y + config X86_FIND_SMP_CONFIG bool depends on X86_LOCAL_APIC || X86_VOYAGER @@ -1336,11 +1590,6 @@ config X86_BIOS_REBOOT depends on !(X86_VISWS || X86_VOYAGER) default y -config X86_TRAMPOLINE - bool - depends on SMP || X86_VISWS - default y - config PC bool depends on X86 && !EMBEDDED --- linux-2.6.4-rc1/arch/i386/kernel/acpi/boot.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/kernel/acpi/boot.c 2004-02-29 13:09:29.000000000 -0800 @@ -43,11 +43,12 @@ #define PREFIX "ACPI: " -int acpi_noirq __initdata = 0; /* skip ACPI IRQ initialization */ +int acpi_noirq __initdata; /* skip ACPI IRQ initialization */ int acpi_ht __initdata = 1; /* enable HT */ int acpi_lapic; int acpi_ioapic; +int acpi_strict; /* -------------------------------------------------------------------------- Boot-time Configuration @@ -96,6 +97,31 @@ char *__acpi_map_table(unsigned long phy } +#ifdef CONFIG_PCI_MMCONFIG +static int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size) +{ + struct acpi_table_mcfg *mcfg; + + if (!phys_addr || !size) + return -EINVAL; + + mcfg = (struct acpi_table_mcfg *) __acpi_map_table(phys_addr, size); + if (!mcfg) { + printk(KERN_WARNING PREFIX "Unable to map MCFG\n"); + return -ENODEV; + } + + if (mcfg->base_reserved) { + printk(KERN_ERR PREFIX "MMCONFIG not in low 4GB of memory\n"); + return -ENODEV; + } + + pci_mmcfg_base_addr = mcfg->base_address; + + return 0; +} +#endif /* CONFIG_PCI_MMCONFIG */ + #ifdef CONFIG_X86_LOCAL_APIC static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; @@ -259,7 +285,7 @@ acpi_parse_nmi_src ( * programs the PIC-mode SCI to Level Trigger. 
* (NO-OP if the BIOS set Level Trigger already) * - * If a PIC-mode SCI is not recogznied or gives spurious IRQ7's + * If a PIC-mode SCI is not recognized or gives spurious IRQ7's * it may require Edge Trigger -- use "acpi_pic_sci=edge" * (NO-OP if the BIOS set Edge Trigger already) * @@ -339,7 +365,7 @@ acpi_scan_rsdp ( * RSDP signature. */ for (offset = 0; offset < length; offset += 16) { - if (strncmp((char *) (start + offset), "RSD PTR ", sig_len)) + if (strncmp((char *) __va(start + offset), "RSD PTR ", sig_len)) continue; return (start + offset); } @@ -429,55 +455,10 @@ acpi_find_rsdp (void) return rsdp_phys; } -/* - * acpi_boot_init() - * called from setup_arch(), always. - * 1. maps ACPI tables for later use - * 2. enumerates lapics - * 3. enumerates io-apics - * - * side effects: - * acpi_lapic = 1 if LAPIC found - * acpi_ioapic = 1 if IOAPIC found - * if (acpi_lapic && acpi_ioapic) smp_found_config = 1; - * if acpi_blacklisted() acpi_disabled = 1; - * acpi_irq_model=... - * ... - * - * return value: (currently ignored) - * 0: success - * !0: failure - */ -int __init -acpi_boot_init (void) +static int acpi_apic_setup(void) { - int result = 0; - - if (acpi_disabled && !acpi_ht) - return 1; - - /* - * The default interrupt routing model is PIC (8259). This gets - * overriden if IOAPICs are enumerated (below). - */ - acpi_irq_model = ACPI_IRQ_MODEL_PIC; - - /* - * Initialize the ACPI boot-time table parser. 
- */ - result = acpi_table_init(); - if (result) { - acpi_disabled = 1; - return result; - } - - result = acpi_blacklisted(); - if (result) { - printk(KERN_WARNING PREFIX "BIOS listed in blacklist, disabling ACPI support\n"); - acpi_disabled = 1; - return result; - } + int result; #ifdef CONFIG_X86_PM_TIMER acpi_table_parse(ACPI_FADT, acpi_parse_fadt); @@ -541,24 +522,17 @@ acpi_boot_init (void) acpi_lapic = 1; -#endif /*CONFIG_X86_LOCAL_APIC*/ +#endif /* CONFIG_X86_LOCAL_APIC */ #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_ACPI_INTERPRETER) /* * I/O APIC - * -------- */ - /* - * ACPI interpreter is required to complete interrupt setup, - * so if it is off, don't enumerate the io-apics with ACPI. - * If MPS is present, it will handle them, - * otherwise the system will stay in PIC mode - */ - if (acpi_disabled || acpi_noirq) { + if (acpi_noirq) { return 1; - } + } /* * if "noapic" boot option, don't look for IO-APICs @@ -573,8 +547,7 @@ acpi_boot_init (void) if (!result) { printk(KERN_ERR PREFIX "No IOAPIC entries present\n"); return -ENODEV; - } - else if (result < 0) { + } else if (result < 0) { printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n"); return result; } @@ -611,9 +584,82 @@ acpi_boot_init (void) } #endif + return 0; +} + +/* + * acpi_boot_init() + * called from setup_arch(), always. + * 1. maps ACPI tables for later use + * 2. enumerates lapics + * 3. enumerates io-apics + * + * side effects: + * acpi_lapic = 1 if LAPIC found + * acpi_ioapic = 1 if IOAPIC found + * if (acpi_lapic && acpi_ioapic) smp_found_config = 1; + * if acpi_blacklisted() acpi_disabled = 1; + * acpi_irq_model=... + * ... + * + * return value: (currently ignored) + * 0: success + * !0: failure + */ + +int __init +acpi_boot_init (void) +{ + int result, error; + + if (acpi_disabled && !acpi_ht) + return 1; + + /* + * The default interrupt routing model is PIC (8259). This gets + * overriden if IOAPICs are enumerated (below). 
+ */ + acpi_irq_model = ACPI_IRQ_MODEL_PIC; + + /* + * Initialize the ACPI boot-time table parser. + */ + result = acpi_table_init(); + if (result) { + acpi_disabled = 1; + return result; + } + + result = acpi_blacklisted(); + if (result) { + printk(KERN_WARNING PREFIX "BIOS listed in blacklist, disabling ACPI support\n"); + acpi_disabled = 1; + return result; + } + + error = acpi_apic_setup(); + +#ifdef CONFIG_PCI_MMCONFIG + result = acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg); + if (result < 0) { + printk(KERN_ERR PREFIX "Error %d parsing MCFG\n", result); + if (!error) + error = result; + } else if (result > 1) { + printk(KERN_WARNING PREFIX "Multiple MCFG tables exist\n"); + } +#endif /* CONFIG_PCI_MMCONFIG */ + #ifdef CONFIG_HPET_TIMER - acpi_table_parse(ACPI_HPET, acpi_parse_hpet); + result = acpi_table_parse(ACPI_HPET, acpi_parse_hpet); + if (result < 0) { + printk(KERN_ERR PREFIX "Error %d parsing HPET\n", result); + if (!error) + error = result; + } else if (result > 1) { + printk(KERN_WARNING PREFIX "Multiple HPET tables exist\n"); + } #endif - return 0; + return error; } --- linux-2.6.4-rc1/arch/i386/kernel/apic.c 2004-02-03 20:42:34.000000000 -0800 +++ 25/arch/i386/kernel/apic.c 2004-02-29 13:07:44.000000000 -0800 @@ -595,7 +595,7 @@ static int __init init_lapic_sysfs(void) error = sysdev_class_register(&lapic_sysclass); if (!error) - error = sys_device_register(&device_lapic); + error = sysdev_register(&device_lapic); return error; } device_initcall(init_lapic_sysfs); --- linux-2.6.4-rc1/arch/i386/kernel/asm-offsets.c 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/kernel/asm-offsets.c 2004-02-29 13:09:29.000000000 -0800 @@ -4,9 +4,11 @@ * to extract and format the required data. 
*/ +#include #include #include #include "sigframe.h" +#include #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -28,4 +30,20 @@ void foo(void) DEFINE(RT_SIGFRAME_sigcontext, offsetof (struct rt_sigframe, uc.uc_mcontext)); + + DEFINE(TI_task, offsetof (struct thread_info, task)); + DEFINE(TI_exec_domain, offsetof (struct thread_info, exec_domain)); + DEFINE(TI_flags, offsetof (struct thread_info, flags)); + DEFINE(TI_preempt_count, offsetof (struct thread_info, preempt_count)); + DEFINE(TI_addr_limit, offsetof (struct thread_info, addr_limit)); + DEFINE(TI_real_stack, offsetof (struct thread_info, real_stack)); + DEFINE(TI_virtual_stack, offsetof (struct thread_info, virtual_stack)); + DEFINE(TI_user_pgd, offsetof (struct thread_info, user_pgd)); + + DEFINE(FIX_ENTRY_TRAMPOLINE_0_addr, + __fix_to_virt(FIX_ENTRY_TRAMPOLINE_0)); + DEFINE(FIX_VSYSCALL_addr, __fix_to_virt(FIX_VSYSCALL)); + DEFINE(PAGE_SIZE_asm, PAGE_SIZE); + DEFINE(task_thread_db7, + offsetof (struct task_struct, thread.debugreg[7])); } --- linux-2.6.4-rc1/arch/i386/kernel/cpu/centaur.c 2003-06-14 12:18:24.000000000 -0700 +++ 25/arch/i386/kernel/cpu/centaur.c 2004-02-29 13:08:31.000000000 -0800 @@ -246,7 +246,15 @@ static void __init winchip2_protect_mcr( lo&=~0x1C0; /* blank bits 8-6 */ wrmsr(MSR_IDT_MCR_CTRL, lo, hi); } -#endif +#endif /* CONFIG_X86_OOSTORE */ + +#define ACE_PRESENT (1 << 6) +#define ACE_ENABLED (1 << 7) +#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */ + +#define RNG_PRESENT (1 << 2) +#define RNG_ENABLED (1 << 3) +#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */ static void __init init_c3(struct cpuinfo_x86 *c) { @@ -254,6 +262,24 @@ static void __init init_c3(struct cpuinf /* Test for Centaur Extended Feature Flags presence */ if (cpuid_eax(0xC0000000) >= 0xC0000001) { + u32 tmp = cpuid_edx(0xC0000001); + + /* enable ACE unit, if present and disabled */ + if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) { + rdmsr (MSR_VIA_FCR, lo, hi); + lo |= 
ACE_FCR; /* enable ACE unit */ + wrmsr (MSR_VIA_FCR, lo, hi); + printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n"); + } + + /* enable RNG unit, if present and disabled */ + if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) { + rdmsr (MSR_VIA_RNG, lo, hi); + lo |= RNG_ENABLE; /* enable RNG unit */ + wrmsr (MSR_VIA_RNG, lo, hi); + printk(KERN_INFO "CPU: Enabled h/w RNG\n"); + } + /* store Centaur Extended Feature Flags as * word 5 of the CPU capability bit array */ --- linux-2.6.4-rc1/arch/i386/kernel/cpu/common.c 2004-01-09 00:04:30.000000000 -0800 +++ 25/arch/i386/kernel/cpu/common.c 2004-02-29 13:09:29.000000000 -0800 @@ -514,12 +514,16 @@ void __init cpu_init (void) set_tss_desc(cpu,t); cpu_gdt_table[cpu][GDT_ENTRY_TSS].b &= 0xfffffdff; load_TR_desc(); - load_LDT(&init_mm.context); + if (cpu) + load_LDT(&init_mm.context); /* Set up doublefault TSS pointer in the GDT */ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); cpu_gdt_table[cpu][GDT_ENTRY_DOUBLEFAULT_TSS].b &= 0xfffffdff; + if (cpu) + trap_init_virtual_GDT(); + /* Clear %fs and %gs. 
*/ asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); --- linux-2.6.4-rc1/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2004-02-29 13:08:14.000000000 -0800 @@ -57,8 +57,7 @@ static int cpufreq_p4_setdc(unsigned int u32 l, h; cpumask_t cpus_allowed, affected_cpu_map; struct cpufreq_freqs freqs; - int hyperthreading = 0; - int sibling = 0; + int j; if (!cpu_online(cpu) || (newstate > DC_DISABLE) || (newstate == DC_RESV)) @@ -68,13 +67,10 @@ static int cpufreq_p4_setdc(unsigned int cpus_allowed = current->cpus_allowed; /* only run on CPU to be set, or on its sibling */ - affected_cpu_map = cpumask_of_cpu(cpu); -#ifdef CONFIG_X86_HT - hyperthreading = ((cpu_has_ht) && (smp_num_siblings == 2)); - if (hyperthreading) { - sibling = cpu_sibling_map[cpu]; - cpu_set(sibling, affected_cpu_map); - } +#ifdef CONFIG_SMP + affected_cpu_map = cpu_sibling_map[cpu]; +#else + affected_cpu_map = cpumask_of_cpu(cpu); #endif set_cpus_allowed(current, affected_cpu_map); BUG_ON(!cpu_isset(smp_processor_id(), affected_cpu_map)); @@ -97,11 +93,11 @@ static int cpufreq_p4_setdc(unsigned int /* notifiers */ freqs.old = stock_freq * l / 8; freqs.new = stock_freq * newstate / 8; - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - if (hyperthreading) { - freqs.cpu = sibling; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + for_each_cpu(j) { + if (cpu_isset(j, affected_cpu_map)) { + freqs.cpu = j; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + } } rdmsr(MSR_IA32_THERM_STATUS, l, h); @@ -132,10 +128,11 @@ static int cpufreq_p4_setdc(unsigned int set_cpus_allowed(current, cpus_allowed); /* notifiers */ - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - if (hyperthreading) { - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + for_each_cpu(j) { + if (cpu_isset(j, affected_cpu_map)) { + freqs.cpu = j; + 
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } } return 0; --- linux-2.6.4-rc1/arch/i386/kernel/cpu/cpufreq/powernow-k8.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/kernel/cpu/cpufreq/powernow-k8.c 2004-02-29 13:09:18.000000000 -0800 @@ -8,6 +8,8 @@ * * Based on the powernow-k7.c module written by Dave Jones. * (C) 2003 Dave Jones on behalf of SuSE Labs + * (C) 2004 Dominik Brodowski + * (C) 2004 Pavel Machek * Licensed under the terms of the GNU GPL License version 2. * Based upon datasheets & sample CPUs kindly provided by AMD. * @@ -34,10 +36,6 @@ #define VERSION "version 1.00.08a" #include "powernow-k8.h" -#ifdef CONFIG_PREEMPT -#warning this driver has not been tested on a preempt system -#endif - static u32 vstable; /* voltage stabalization time, from PSB, units 20 us */ static u32 plllock; /* pll lock time, from PSB, units 1 us */ static u32 numps; /* number of p-states, from PSB */ @@ -636,13 +634,22 @@ find_psb_table(void) return -ENOMEM; } - for (j = 0; j < numps; j++) { - printk(KERN_INFO PFX " %d : fid 0x%x (%d MHz), vid 0x%x\n", j, - pst[j].fid, find_freq_from_fid(pst[j].fid), pst[j].vid); + for (j = 0; j < psb->numpstates; j++) { powernow_table[j].index = pst[j].fid; /* lower 8 bits */ powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */ - powernow_table[j].frequency = find_freq_from_fid(pst[j].fid); } + + /* If you want to override your frequency tables, this + is right place. 
*/ + + for (j = 0; j < numps; j++) { + powernow_table[j].frequency = find_freq_from_fid(powernow_table[j].index & 0xff)*1000; + printk(KERN_INFO PFX " %d : fid 0x%x (%d MHz), vid 0x%x\n", j, + powernow_table[j].index & 0xff, + powernow_table[j].frequency/1000, + powernow_table[j].index >> 8); + } + powernow_table[numps].frequency = CPUFREQ_TABLE_END; powernow_table[numps].index = 0; --- linux-2.6.4-rc1/arch/i386/kernel/cpu/intel.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/kernel/cpu/intel.c 2004-02-29 13:09:29.000000000 -0800 @@ -10,6 +10,7 @@ #include #include #include +#include #include "cpu.h" @@ -19,8 +20,6 @@ #include #endif -extern int trap_init_f00f_bug(void); - #ifdef CONFIG_X86_INTEL_USERCOPY /* * Alignment at which movsl is preferred for bulk memory copies. @@ -165,7 +164,7 @@ static void __init init_intel(struct cpu c->f00f_bug = 1; if ( !f00f_workaround_enabled ) { - trap_init_f00f_bug(); + trap_init_virtual_IDT(); printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); f00f_workaround_enabled = 1; } @@ -250,6 +249,12 @@ static void __init init_intel(struct cpu /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) clear_bit(X86_FEATURE_SEP, c->x86_capability); + /* + * FIXME: SEP is disabled for 4G/4G for now: + */ +#ifdef CONFIG_X86_HIGH_ENTRY + clear_bit(X86_FEATURE_SEP, c->x86_capability); +#endif /* Names for the Pentium II/Celeron processors detectable only by also checking the cache size. 
--- linux-2.6.4-rc1/arch/i386/kernel/cpu/proc.c 2003-08-22 19:23:40.000000000 -0700 +++ 25/arch/i386/kernel/cpu/proc.c 2004-02-29 13:08:31.000000000 -0800 @@ -50,7 +50,7 @@ static int show_cpuinfo(struct seq_file NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ - NULL, NULL, "xstore", NULL, NULL, NULL, NULL, NULL, + NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, --- linux-2.6.4-rc1/arch/i386/kernel/doublefault.c 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/kernel/doublefault.c 2004-02-29 13:09:29.000000000 -0800 @@ -7,12 +7,13 @@ #include #include #include +#include #define DOUBLEFAULT_STACKSIZE (1024) static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; #define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) -#define ptr_ok(x) ((x) > 0xc0000000 && (x) < 0xc1000000) +#define ptr_ok(x) (((x) > __PAGE_OFFSET && (x) < (__PAGE_OFFSET + 0x01000000)) || ((x) >= FIXADDR_START)) static void doublefault_fn(void) { @@ -38,8 +39,8 @@ static void doublefault_fn(void) printk("eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", t->eax, t->ebx, t->ecx, t->edx); - printk("esi = %08lx, edi = %08lx\n", - t->esi, t->edi); + printk("esi = %08lx, edi = %08lx, ebp = %08lx\n", + t->esi, t->edi, t->ebp); } } --- linux-2.6.4-rc1/arch/i386/kernel/entry.S 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/kernel/entry.S 2004-02-29 13:09:30.000000000 -0800 @@ -43,11 +43,25 @@ #include #include #include +#include #include #include +#include #include #include #include "irq_vectors.h" + /* We do not recover from a stack overflow, but at least + * we know it happened and should be able to track it down. 
+ */ +#ifdef CONFIG_STACK_OVERFLOW_TEST +#define STACK_OVERFLOW_TEST \ + testl $7680,%esp; \ + jnz 10f; \ + call stack_overflow; \ +10: +#else +#define STACK_OVERFLOW_TEST +#endif #define nr_syscalls ((syscall_table_size)/4) @@ -87,7 +101,102 @@ TSS_ESP0_OFFSET = (4 - 0x200) #define resume_kernel restore_all #endif -#define SAVE_ALL \ +#ifdef CONFIG_X86_HIGH_ENTRY + +#ifdef CONFIG_X86_SWITCH_PAGETABLES + +#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) +/* + * If task is preempted in __SWITCH_KERNELSPACE, and moved to another cpu, + * __switch_to repoints %esp to the appropriate virtual stack; but %ebp is + * left stale, so we must check whether to repeat the real stack calculation. + */ +#define repeat_if_esp_changed \ + xorl %esp, %ebp; \ + testl $-THREAD_SIZE, %ebp; \ + jnz 0b +#else +#define repeat_if_esp_changed +#endif + +/* clobbers ebx, edx and ebp */ + +#define __SWITCH_KERNELSPACE \ + cmpl $0xff000000, %esp; \ + jb 1f; \ + \ + /* \ + * switch pagetables and load the real stack, \ + * keep the stack offset: \ + */ \ + \ + movl $swapper_pg_dir-__PAGE_OFFSET, %edx; \ + \ + /* GET_THREAD_INFO(%ebp) intermixed */ \ +0: \ + movl %esp, %ebp; \ + movl %esp, %ebx; \ + andl $(-THREAD_SIZE), %ebp; \ + andl $(THREAD_SIZE-1), %ebx; \ + orl TI_real_stack(%ebp), %ebx; \ + repeat_if_esp_changed; \ + \ + movl %edx, %cr3; \ + movl %ebx, %esp; \ +1: + +#endif + + +#define __SWITCH_USERSPACE \ + /* interrupted any of the user return paths? */ \ + \ + movl EIP(%esp), %eax; \ + \ + cmpl $int80_ret_start_marker, %eax; \ + jb 33f; /* nope - continue with sysexit check */\ + cmpl $int80_ret_end_marker, %eax; \ + jb 22f; /* yes - switch to virtual stack */ \ +33: \ + cmpl $sysexit_ret_start_marker, %eax; \ + jb 44f; /* nope - continue with user check */ \ + cmpl $sysexit_ret_end_marker, %eax; \ + jb 22f; /* yes - switch to virtual stack */ \ + /* return to userspace? 
*/ \ +44: \ + movl EFLAGS(%esp),%ecx; \ + movb CS(%esp),%cl; \ + testl $(VM_MASK | 3),%ecx; \ + jz 2f; \ +22: \ + /* \ + * switch to the virtual stack, then switch to \ + * the userspace pagetables. \ + */ \ + \ + GET_THREAD_INFO(%ebp); \ + movl TI_virtual_stack(%ebp), %edx; \ + movl TI_user_pgd(%ebp), %ecx; \ + \ + movl %esp, %ebx; \ + andl $(THREAD_SIZE-1), %ebx; \ + orl %ebx, %edx; \ +int80_ret_start_marker: \ + movl %edx, %esp; \ + movl %ecx, %cr3; \ + \ + __RESTORE_ALL; \ +int80_ret_end_marker: \ +2: + +#else /* !CONFIG_X86_HIGH_ENTRY */ + +#define __SWITCH_KERNELSPACE +#define __SWITCH_USERSPACE + +#endif + +#define __SAVE_ALL \ cld; \ pushl %es; \ pushl %ds; \ @@ -102,7 +211,7 @@ TSS_ESP0_OFFSET = (4 - 0x200) movl %edx, %ds; \ movl %edx, %es; -#define RESTORE_INT_REGS \ +#define __RESTORE_INT_REGS \ popl %ebx; \ popl %ecx; \ popl %edx; \ @@ -111,29 +220,28 @@ TSS_ESP0_OFFSET = (4 - 0x200) popl %ebp; \ popl %eax -#define RESTORE_REGS \ - RESTORE_INT_REGS; \ -1: popl %ds; \ -2: popl %es; \ +#define __RESTORE_REGS \ + __RESTORE_INT_REGS; \ +111: popl %ds; \ +222: popl %es; \ .section .fixup,"ax"; \ -3: movl $0,(%esp); \ - jmp 1b; \ -4: movl $0,(%esp); \ - jmp 2b; \ +444: movl $0,(%esp); \ + jmp 111b; \ +555: movl $0,(%esp); \ + jmp 222b; \ .previous; \ .section __ex_table,"a";\ .align 4; \ - .long 1b,3b; \ - .long 2b,4b; \ + .long 111b,444b;\ + .long 222b,555b;\ .previous - -#define RESTORE_ALL \ - RESTORE_REGS \ +#define __RESTORE_ALL \ + __RESTORE_REGS \ addl $4, %esp; \ -1: iret; \ +333: iret; \ .section .fixup,"ax"; \ -2: sti; \ +666: sti; \ movl $(__USER_DS), %edx; \ movl %edx, %ds; \ movl %edx, %es; \ @@ -142,10 +250,19 @@ TSS_ESP0_OFFSET = (4 - 0x200) .previous; \ .section __ex_table,"a";\ .align 4; \ - .long 1b,2b; \ + .long 333b,666b;\ .previous +#define SAVE_ALL \ + __SAVE_ALL; \ + __SWITCH_KERNELSPACE; \ + STACK_OVERFLOW_TEST; + +#define RESTORE_ALL \ + __SWITCH_USERSPACE; \ + __RESTORE_ALL; +.section .entry.text,"ax" ENTRY(lcall7) pushfl # We get a 
different stack layout with call @@ -163,7 +280,7 @@ do_lcall: movl %edx,EIP(%ebp) # Now we move them to their "normal" places movl %ecx,CS(%ebp) # GET_THREAD_INFO_WITH_ESP(%ebp) # GET_THREAD_INFO - movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution domain + movl TI_exec_domain(%ebp), %edx # Get the execution domain call *4(%edx) # Call the lcall7 handler for the domain addl $4, %esp popl %eax @@ -208,7 +325,7 @@ ENTRY(resume_userspace) cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret - movl TI_FLAGS(%ebp), %ecx + movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done on # int/exception return? jne work_pending @@ -216,18 +333,18 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) - cmpl $0,TI_PRE_COUNT(%ebp) # non-zero preempt_count ? + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_all need_resched: - movl TI_FLAGS(%ebp), %ecx # need_resched set ? + movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl jz restore_all testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all - movl $PREEMPT_ACTIVE,TI_PRE_COUNT(%ebp) + movl $PREEMPT_ACTIVE,TI_preempt_count(%ebp) sti call schedule - movl $0,TI_PRE_COUNT(%ebp) + movl $0,TI_preempt_count(%ebp) cli jmp need_resched #endif @@ -246,37 +363,50 @@ sysenter_past_esp: pushl $(__USER_CS) pushl $SYSENTER_RETURN -/* - * Load the potential sixth argument from user stack. - * Careful about security. 
- */ - cmpl $__PAGE_OFFSET-3,%ebp - jae syscall_fault -1: movl (%ebp),%ebp -.section __ex_table,"a" - .align 4 - .long 1b,syscall_fault -.previous - pushl %eax SAVE_ALL GET_THREAD_INFO(%ebp) cmpl $(nr_syscalls), %eax jae syscall_badsys - testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp) + testb $_TIF_SYSCALL_TRACE,TI_flags(%ebp) jnz syscall_trace_entry call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) cli - movl TI_FLAGS(%ebp), %ecx + movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work + +#ifdef CONFIG_X86_SWITCH_PAGETABLES + + GET_THREAD_INFO(%ebp) + movl TI_virtual_stack(%ebp), %edx + movl TI_user_pgd(%ebp), %ecx + movl %esp, %ebx + andl $(THREAD_SIZE-1), %ebx + orl %ebx, %edx +sysexit_ret_start_marker: + movl %edx, %esp + movl %ecx, %cr3 +#endif + /* + * only ebx is not restored by the userspace sysenter vsyscall + * code, it assumes it to be callee-saved. + */ + movl EBX(%esp), %ebx + /* if something modifies registers it must also disable sysexit */ + movl EIP(%esp), %edx movl OLDESP(%esp), %ecx + sti sysexit +#ifdef CONFIG_X86_SWITCH_PAGETABLES +sysexit_ret_end_marker: + nop +#endif # system call handler stub @@ -287,7 +417,7 @@ ENTRY(system_call) cmpl $(nr_syscalls), %eax jae syscall_badsys # system call tracing in operation - testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp) + testb $_TIF_SYSCALL_TRACE,TI_flags(%ebp) jnz syscall_trace_entry syscall_call: call *sys_call_table(,%eax,4) @@ -296,10 +426,23 @@ syscall_exit: cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret - movl TI_FLAGS(%ebp), %ecx + movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work restore_all: +#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS + movl EFLAGS(%esp), %eax # mix EFLAGS and CS + movb CS(%esp), %al + testl $(VM_MASK | 3), %eax + jz resume_kernelX # returning to kernel or vm86-space + + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? 
+ jz resume_kernelX + + int $3 + +resume_kernelX: +#endif RESTORE_ALL # perform work that needs to be done immediately before resumption @@ -312,7 +455,7 @@ work_resched: cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret - movl TI_FLAGS(%ebp), %ecx + movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? jz restore_all @@ -327,6 +470,22 @@ work_notifysig: # deal with pending s # vm86-space xorl %edx, %edx call do_notify_resume + +#if CONFIG_X86_HIGH_ENTRY + /* + * Reload db7 if necessary: + */ + movl TI_flags(%ebp), %ecx + testb $_TIF_DB7, %cl + jnz work_db7 + + jmp restore_all + +work_db7: + movl TI_task(%ebp), %edx; + movl task_thread_db7(%edx), %edx; + movl %edx, %db7; +#endif jmp restore_all ALIGN @@ -382,7 +541,7 @@ syscall_badsys: */ .data ENTRY(interrupt) -.text +.previous vector=0 ENTRY(irq_entries_start) @@ -392,7 +551,7 @@ ENTRY(irq_entries_start) jmp common_interrupt .data .long 1b -.text +.previous vector=vector+1 .endr @@ -433,12 +592,17 @@ error_code: movl ES(%esp), %edi # get the function address movl %eax, ORIG_EAX(%esp) movl %ecx, ES(%esp) - movl %esp, %edx pushl %esi # push the error code - pushl %edx # push the pt_regs pointer movl $(__USER_DS), %edx movl %edx, %ds movl %edx, %es + +/* clobbers edx, ebx and ebp */ + __SWITCH_KERNELSPACE + + leal 4(%esp), %edx # prepare pt_regs + pushl %edx # push pt_regs + call *%edi addl $8, %esp jmp ret_from_exception @@ -529,7 +693,7 @@ nmi_stack_correct: pushl %edx call do_nmi addl $8, %esp - RESTORE_ALL + jmp restore_all nmi_stack_fixup: FIX_STACK(12,nmi_stack_correct, 1) @@ -606,6 +770,8 @@ ENTRY(spurious_interrupt_bug) pushl $do_spurious_interrupt_bug jmp error_code +.previous + .data ENTRY(sys_call_table) .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ @@ -882,5 +1048,11 @@ ENTRY(sys_call_table) .long sys_utimes .long sys_fadvise64_64 .long 
sys_ni_syscall /* sys_vserver */ + .long sys_mq_open + .long sys_mq_unlink /* 275 */ + .long sys_mq_timedsend + .long sys_mq_timedreceive + .long sys_mq_notify + .long sys_mq_getsetattr syscall_table_size=(.-sys_call_table) --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/arch/i386/kernel/entry_trampoline.c 2004-02-29 13:09:30.000000000 -0800 @@ -0,0 +1,73 @@ +/* + * linux/arch/i386/kernel/entry_trampoline.c + * + * (C) Copyright 2003 Ingo Molnar + * + * This file contains the needed support code for 4GB userspace + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern char __entry_tramp_start, __entry_tramp_end, __start___entry_text; + +void __init init_entry_mappings(void) +{ +#ifdef CONFIG_X86_HIGH_ENTRY + void *tramp; + + /* + * We need a high IDT and GDT for the 4G/4G split: + */ + trap_init_virtual_IDT(); + + __set_fixmap(FIX_ENTRY_TRAMPOLINE_0, __pa((unsigned long)&__entry_tramp_start), PAGE_KERNEL); + __set_fixmap(FIX_ENTRY_TRAMPOLINE_1, __pa((unsigned long)&__entry_tramp_start) + PAGE_SIZE, PAGE_KERNEL); + tramp = (void *)fix_to_virt(FIX_ENTRY_TRAMPOLINE_0); + + printk("mapped 4G/4G trampoline to %p.\n", tramp); + BUG_ON((void *)&__start___entry_text != tramp); + /* + * Virtual kernel stack: + */ + BUG_ON(__kmap_atomic_vaddr(KM_VSTACK0) & (THREAD_SIZE-1)); + BUG_ON(sizeof(struct desc_struct)*NR_CPUS*GDT_ENTRIES > 2*PAGE_SIZE); + BUG_ON((unsigned int)&__entry_tramp_end - (unsigned int)&__entry_tramp_start > 2*PAGE_SIZE); + + /* + * set up the initial thread's virtual stack related + * fields: + */ + current->thread.stack_page0 = virt_to_page((char *)current->thread_info); + current->thread.stack_page1 = virt_to_page((char *)current->thread_info + PAGE_SIZE); + current->thread_info->virtual_stack = (void *)__kmap_atomic_vaddr(KM_VSTACK0); + + __kunmap_atomic_type(KM_VSTACK0); + __kunmap_atomic_type(KM_VSTACK1); + __kmap_atomic(current->thread.stack_page0, KM_VSTACK0); + 
__kmap_atomic(current->thread.stack_page1, KM_VSTACK1); + +#endif + current->thread_info->real_stack = (void *)current->thread_info; + current->thread_info->user_pgd = NULL; + current->thread.esp0 = (unsigned long)current->thread_info->real_stack + THREAD_SIZE; +} + + + +void __init entry_trampoline_setup(void) +{ + /* + * old IRQ entries set up by the boot code will still hang + * around - they are a sign of hw trouble anyway, now they'll + * produce a double fault message. + */ + trap_init_virtual_GDT(); +} --- linux-2.6.4-rc1/arch/i386/kernel/head.S 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/kernel/head.S 2004-02-29 13:08:58.000000000 -0800 @@ -17,7 +17,7 @@ #include #include #include - +#include #define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F @@ -40,49 +40,89 @@ #define X86_VENDOR_ID CPU_PARAMS+36 /* offset dependent on NCAPINTS */ /* - * Initialize page tables + * This is how much memory *in addition to the memory covered up to + * and including _end* we need mapped initially. We need one bit for + * each possible page, but only in low memory, which means + * 2^32/4096/8 = 128K worst case (4G/4G split.) + * + * Modulo rounding, each megabyte assigned here requires a kilobyte of + * memory, which is currently unreclaimed. + * + * This should be a multiple of a page. */ -#define INIT_PAGE_TABLES \ - movl $pg0 - __PAGE_OFFSET, %edi; \ - /* "007" doesn't mean with license to kill, but PRESENT+RW+USER */ \ - movl $007, %eax; \ -2: stosl; \ - add $0x1000, %eax; \ - cmp $empty_zero_page - __PAGE_OFFSET, %edi; \ - jne 2b; +#define INIT_MAP_BEYOND_END (128*1024) + /* - * swapper_pg_dir is the main page directory, address 0x00101000 - * - * On entry, %esi points to the real-mode code as a 32-bit pointer. + * 32-bit kernel entrypoint; only used by the boot CPU. On entry, + * %esi points to the real-mode code as a 32-bit pointer. 
+ * CS and DS must be 4 GB flat segments, but we don't depend on + * any particular GDT layout, because we load our own as soon as we + * can. */ ENTRY(startup_32) -#ifdef CONFIG_X86_VISWS /* - * On SGI Visual Workstations boot CPU starts in protected mode. + * Set segments to known values. */ - orw %bx, %bx - jnz 1f - INIT_PAGE_TABLES - movl $swapper_pg_dir - __PAGE_OFFSET, %eax - movl %eax, %cr3 - lgdt boot_gdt -1: -#endif + cld + lgdt boot_gdt_descr - __PAGE_OFFSET + movl $(__BOOT_DS),%eax + movl %eax,%ds + movl %eax,%es + movl %eax,%fs + movl %eax,%gs /* - * Set segments to known values + * Initialize page tables. This creates a PDE and a set of page + * tables, which are located immediately beyond _end. The variable + * init_pg_tables_end is set up to point to the first "safe" location. + * + * Warning: don't use %esi or the stack in this code. However, %esp + * can be used as a GPR if you really need it... */ +page_pde_offset = (__PAGE_OFFSET >> 20); + + movl $(pg0 - __PAGE_OFFSET), %edi + movl $(swapper_pg_dir - __PAGE_OFFSET), %edx + movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */ +10: + leal 0x007(%edi),%ecx /* Create PDE entry */ + movl %ecx,(%edx) /* Store identity PDE entry */ + movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ + addl $4,%edx + movl $1024, %ecx +11: + stosl + addl $0x1000,%eax + loop 11b + /* End condition: we must map up to and including INIT_MAP_BEYOND_END */ + /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */ + leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp + cmpl %ebp,%eax + jb 10b + movl %edi,(init_pg_tables_end - __PAGE_OFFSET) + +#ifdef CONFIG_SMP + xorl %ebx,%ebx /* This is the boot CPU (BSP) */ + jmp 3f + +/* + * Non-boot CPU entry point; entered from trampoline.S + * We can't lgdt here, because lgdt itself uses a data segment, but + * we know the trampoline has already loaded the boot_gdt_table GDT + * for us. 
+ */ +ENTRY(startup_32_smp) cld movl $(__BOOT_DS),%eax movl %eax,%ds movl %eax,%es movl %eax,%fs movl %eax,%gs -#ifdef CONFIG_SMP - orw %bx,%bx - jz 1f + + xorl %ebx,%ebx + incl %ebx /* This is a secondary processor (AP) */ /* * New page tables may be in 4Mbyte page mode and may @@ -99,37 +139,40 @@ ENTRY(startup_32) * not yet offset PAGE_OFFSET.. */ #define cr4_bits mmu_cr4_features-__PAGE_OFFSET - cmpl $0,cr4_bits - je 3f + movl cr4_bits,%edx + andl %edx,%edx + jz 3f movl %cr4,%eax # Turn on paging options (PSE,PAE,..) - orl cr4_bits,%eax + orl %edx,%eax movl %eax,%cr4 - jmp 3f -1: -#endif - INIT_PAGE_TABLES + +3: +#endif /* CONFIG_SMP */ + /* * Enable paging */ -3: movl $swapper_pg_dir-__PAGE_OFFSET,%eax movl %eax,%cr3 /* set the page table pointer.. */ movl %cr0,%eax orl $0x80000000,%eax movl %eax,%cr0 /* ..and set paging (PG) bit */ - jmp 1f /* flush the prefetch-queue */ -1: - movl $1f,%eax - jmp *%eax /* make sure eip is relocated */ + ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ 1: /* Set up the stack pointer */ lss stack_start,%esp -#ifdef CONFIG_SMP - orw %bx,%bx - jz 1f /* Initial CPU cleans BSS */ +/* + * Initialize eflags. Some BIOS's leave bits like NT set. This would + * confuse the debugger if this code is traced. + * XXX - best to initialize before switching to protected mode. + */ pushl $0 popfl + +#ifdef CONFIG_SMP + andl %ebx,%ebx + jz 1f /* Initial CPU cleans BSS */ jmp checkCPUtype 1: #endif /* CONFIG_SMP */ @@ -142,21 +185,15 @@ ENTRY(startup_32) movl $__bss_start,%edi movl $__bss_stop,%ecx subl %edi,%ecx - rep - stosb + shrl $2,%ecx + rep ; stosl /* * start system 32-bit setup. We need to re-do some of the things done * in 16-bit mode for the "real" operations. */ call setup_idt -/* - * Initialize eflags. Some BIOS's leave bits like NT set. This would - * confuse the debugger if this code is traced. - * XXX - best to initialize before switching to protected mode. 
- */ - pushl $0 - popfl + /* * Copy bootup parameters out of the way. First 2kB of * _empty_zero_page is for boot parameters, second 2kB @@ -273,7 +310,7 @@ is386: movl $2,%ecx # set MP call initialize_secondary jmp L6 1: -#endif +#endif /* CONFIG_SMP */ call start_kernel L6: jmp L6 # main should never return here, but @@ -309,6 +346,8 @@ check_x87: * and the kernel moved to PAGE_OFFSET. Interrupts * are enabled elsewhere, when we can be relatively * sure everything is ok. + * + * Warning: %esi is live across this function. */ setup_idt: lea ignore_int,%edx @@ -332,7 +371,7 @@ ENTRY(stack_start) /* This is the default interrupt "handler" :-) */ int_msg: - .asciz "Unknown interrupt\n" + .asciz "Unknown interrupt or fault at EIP %p %p %p\n" ALIGN ignore_int: cld @@ -344,9 +383,13 @@ ignore_int: movl $(__KERNEL_DS),%eax movl %eax,%ds movl %eax,%es + pushl 16(%esp) + pushl 24(%esp) + pushl 32(%esp) + pushl 40(%esp) pushl $int_msg call printk - popl %eax + addl $(5*4),%esp popl %ds popl %es popl %edx @@ -361,10 +404,17 @@ ignore_int: * segment size, and 32-bit linear address value: */ +.globl boot_gdt_descr .globl idt_descr .globl cpu_gdt_descr ALIGN +# early boot GDT descriptor (must use 1:1 address mapping) + .word 0 # 32 bit align gdt_desc.address +boot_gdt_descr: + .word __BOOT_DS+7 + .long boot_gdt_table - __PAGE_OFFSET + .word 0 # 32-bit align idt_desc.address idt_descr: .word IDT_ENTRIES*8-1 # idt contains 256 entries @@ -379,41 +429,25 @@ cpu_gdt_descr: .fill NR_CPUS-1,8,0 # space for the other GDT descriptors /* - * This is initialized to create an identity-mapping at 0-8M (for bootup - * purposes) and another mapping of the 0-8M area at virtual address - * PAGE_OFFSET. + * swapper_pg_dir is the main page directory, address 0x00101000 + * + * This is initialized to create an identity-mapping at 0 (for bootup + * purposes) and another mapping at virtual address PAGE_OFFSET. 
The + * values put here should be all invalid (zero); the valid + * entries are created dynamically at boot time. + * + * The code creates enough page tables to map 0-_end, the page tables + * themselves, plus INIT_MAP_BEYOND_END bytes; see comment at beginning. */ .org 0x1000 ENTRY(swapper_pg_dir) - .long 0x00102007 - .long 0x00103007 - .fill BOOT_USER_PGD_PTRS-2,4,0 - /* default: 766 entries */ - .long 0x00102007 - .long 0x00103007 - /* default: 254 entries */ - .fill BOOT_KERNEL_PGD_PTRS-2,4,0 + .fill 1024,4,0 -/* - * The page tables are initialized to only 8MB here - the final page - * tables are set up later depending on memory size. - */ .org 0x2000 -ENTRY(pg0) - -.org 0x3000 -ENTRY(pg1) - -/* - * empty_zero_page must immediately follow the page tables ! (The - * initialization loop counts until empty_zero_page) - */ - -.org 0x4000 ENTRY(empty_zero_page) + .fill 4096,1,0 -.org 0x5000 - +.org 0x3000 /* * Real beginning of normal "text" segment */ @@ -428,20 +462,19 @@ ENTRY(_stext) .data /* - * The Global Descriptor Table contains 28 quadwords, per-CPU. - */ -#if defined(CONFIG_SMP) || defined(CONFIG_X86_VISWS) -/* * The boot_gdt_table must mirror the equivalent in setup.S and is - * used only by the trampoline for booting other CPUs + * used only for booting. */ .align L1_CACHE_BYTES ENTRY(boot_gdt_table) .fill GDT_ENTRY_BOOT_CS,8,0 .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ -#endif - .align L1_CACHE_BYTES + +/* + * The Global Descriptor Table contains 28 quadwords, per-CPU. 
+ */ + .align PAGE_SIZE_asm ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ .quad 0x0000000000000000 /* 0x0b reserved */ @@ -488,4 +521,3 @@ ENTRY(cpu_gdt_table) #ifdef CONFIG_SMP .fill (NR_CPUS-1)*GDT_ENTRIES,8,0 /* other CPU's GDT */ #endif - --- linux-2.6.4-rc1/arch/i386/kernel/i386_ksyms.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/i386/kernel/i386_ksyms.c 2004-02-29 13:09:29.000000000 -0800 @@ -88,16 +88,11 @@ EXPORT_SYMBOL(get_cmos_time); EXPORT_SYMBOL(cpu_khz); EXPORT_SYMBOL(apm_info); -#ifdef CONFIG_DEBUG_IOVIRT -EXPORT_SYMBOL(__io_virt_debug); -#endif - EXPORT_SYMBOL_NOVERS(__down_failed); EXPORT_SYMBOL_NOVERS(__down_failed_interruptible); EXPORT_SYMBOL_NOVERS(__down_failed_trylock); EXPORT_SYMBOL_NOVERS(__up_wakeup); /* Networking helper routines. */ -EXPORT_SYMBOL(csum_partial_copy_generic); /* Delay loops */ EXPORT_SYMBOL(__ndelay); EXPORT_SYMBOL(__udelay); @@ -111,13 +106,17 @@ EXPORT_SYMBOL_NOVERS(__get_user_4); EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strstr); +#if !defined(CONFIG_X86_UACCESS_INDIRECT) EXPORT_SYMBOL(strncpy_from_user); -EXPORT_SYMBOL(__strncpy_from_user); +EXPORT_SYMBOL(__direct_strncpy_from_user); EXPORT_SYMBOL(clear_user); EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(__copy_from_user_ll); EXPORT_SYMBOL(__copy_to_user_ll); EXPORT_SYMBOL(strnlen_user); +#else /* CONFIG_X86_UACCESS_INDIRECT */ +EXPORT_SYMBOL(direct_csum_partial_copy_generic); +#endif EXPORT_SYMBOL(dma_alloc_coherent); EXPORT_SYMBOL(dma_free_coherent); --- linux-2.6.4-rc1/arch/i386/kernel/i387.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/i386/kernel/i387.c 2004-02-29 13:09:29.000000000 -0800 @@ -218,6 +218,7 @@ void set_fpu_mxcsr( struct task_struct * static int convert_fxsr_to_user( struct _fpstate __user *buf, struct i387_fxsave_struct *fxsave ) { + struct _fpreg tmp[8]; /* 80 bytes scratch area */ unsigned long env[7]; struct _fpreg __user *to; struct _fpxreg *from; @@ -234,23 +235,25 @@ static int convert_fxsr_to_user( struct if ( 
__copy_to_user( buf, env, 7 * sizeof(unsigned long) ) ) return 1; - to = &buf->_st[0]; + to = tmp; from = (struct _fpxreg *) &fxsave->st_space[0]; for ( i = 0 ; i < 8 ; i++, to++, from++ ) { unsigned long *t = (unsigned long *)to; unsigned long *f = (unsigned long *)from; - if (__put_user(*f, t) || - __put_user(*(f + 1), t + 1) || - __put_user(from->exponent, &to->exponent)) - return 1; + *t = *f; + *(t + 1) = *(f+1); + to->exponent = from->exponent; } + if (copy_to_user(buf->_st, tmp, sizeof(struct _fpreg [8]))) + return 1; return 0; } static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave, struct _fpstate __user *buf ) { + struct _fpreg tmp[8]; /* 80 bytes scratch area */ unsigned long env[7]; struct _fpxreg *to; struct _fpreg __user *from; @@ -258,6 +261,8 @@ static int convert_fxsr_from_user( struc if ( __copy_from_user( env, buf, 7 * sizeof(long) ) ) return 1; + if (copy_from_user(tmp, buf->_st, sizeof(struct _fpreg [8]))) + return 1; fxsave->cwd = (unsigned short)(env[0] & 0xffff); fxsave->swd = (unsigned short)(env[1] & 0xffff); @@ -269,15 +274,14 @@ static int convert_fxsr_from_user( struc fxsave->fos = env[6]; to = (struct _fpxreg *) &fxsave->st_space[0]; - from = &buf->_st[0]; + from = tmp; for ( i = 0 ; i < 8 ; i++, to++, from++ ) { unsigned long *t = (unsigned long *)to; unsigned long *f = (unsigned long *)from; - if (__get_user(*t, f) || - __get_user(*(t + 1), f + 1) || - __get_user(to->exponent, &from->exponent)) - return 1; + *t = *f; + *(t + 1) = *(f + 1); + to->exponent = from->exponent; } return 0; } --- linux-2.6.4-rc1/arch/i386/kernel/i8259.c 2004-01-09 00:04:30.000000000 -0800 +++ 25/arch/i386/kernel/i8259.c 2004-02-29 13:07:44.000000000 -0800 @@ -258,7 +258,7 @@ static int __init i8259A_init_sysfs(void { int error = sysdev_class_register(&i8259_sysdev_class); if (!error) - error = sys_device_register(&device_i8259A); + error = sysdev_register(&device_i8259A); return error; } @@ -401,7 +401,7 @@ static int __init 
init_timer_sysfs(void) { int error = sysdev_class_register(&timer_sysclass); if (!error) - error = sys_device_register(&device_timer); + error = sysdev_register(&device_timer); return error; } --- linux-2.6.4-rc1/arch/i386/kernel/init_task.c 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/kernel/init_task.c 2004-02-29 13:09:29.000000000 -0800 @@ -26,7 +26,7 @@ EXPORT_SYMBOL(init_mm); */ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union) }; /* * Initial task structure. @@ -44,5 +44,5 @@ EXPORT_SYMBOL(init_task); * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -struct tss_struct init_tss[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = INIT_TSS }; +struct tss_struct init_tss[NR_CPUS] __attribute__((__section__(".data.tss"))) = { [0 ... NR_CPUS-1] = INIT_TSS }; --- linux-2.6.4-rc1/arch/i386/kernel/io_apic.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/kernel/io_apic.c 2004-02-29 13:08:14.000000000 -0800 @@ -317,8 +317,7 @@ struct irq_cpu_info { #define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) -#define CPU_TO_PACKAGEINDEX(i) \ - ((physical_balance && i > cpu_sibling_map[i]) ? 
cpu_sibling_map[i] : i) +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) #define MIN_BALANCED_IRQ_INTERVAL (HZ/2) @@ -401,6 +400,7 @@ static void do_irq_balance(void) unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); unsigned long move_this_load = 0; int max_loaded = 0, min_loaded = 0; + int load; unsigned long useful_load_threshold = balanced_irq_interval + 10; int selected_irq; int tmp_loaded, first_attempt = 1; @@ -452,7 +452,7 @@ static void do_irq_balance(void) for (i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) continue; - if (physical_balance && i > cpu_sibling_map[i]) + if (i != CPU_TO_PACKAGEINDEX(i)) continue; if (min_cpu_irq > CPU_IRQ(i)) { min_cpu_irq = CPU_IRQ(i); @@ -471,7 +471,7 @@ tryanothercpu: for (i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) continue; - if (physical_balance && i > cpu_sibling_map[i]) + if (i != CPU_TO_PACKAGEINDEX(i)) continue; if (max_cpu_irq <= CPU_IRQ(i)) continue; @@ -551,9 +551,14 @@ tryanotherirq: * We seek the least loaded sibling by making the comparison * (A+B)/2 vs B */ - if (physical_balance && (CPU_IRQ(min_loaded) >> 1) > - CPU_IRQ(cpu_sibling_map[min_loaded])) - min_loaded = cpu_sibling_map[min_loaded]; + load = CPU_IRQ(min_loaded) >> 1; + for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { + if (load > CPU_IRQ(j)) { + /* This won't change cpu_sibling_map[min_loaded] */ + load = CPU_IRQ(j); + min_loaded = j; + } + } cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]); target_cpu_mask = cpumask_of_cpu(min_loaded); --- linux-2.6.4-rc1/arch/i386/kernel/irq.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/kernel/irq.c 2004-02-29 13:07:59.000000000 -0800 @@ -508,6 +508,8 @@ out: irq_exit(); + kgdb_process_breakpoint(); + return 1; } --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/arch/i386/kernel/kgdb_stub.c 2004-02-29 13:08:01.000000000 -0800 @@ -0,0 +1,2458 @@ +/* + * + * This program is free software; you can redistribute it and/or 
modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +/* + * Copyright (c) 2000 VERITAS Software Corporation. + * + */ +/**************************************************************************** + * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ + * + * Module name: remcom.c $ + * Revision: 1.34 $ + * Date: 91/03/09 12:29:49 $ + * Contributor: Lake Stevens Instrument Division$ + * + * Description: low level support for gdb debugger. $ + * + * Considerations: only works on target hardware $ + * + * Written by: Glenn Engel $ + * Updated by: David Grothe + * Updated by: Robert Walsh + * Updated by: wangdi + * ModuleState: Experimental $ + * + * NOTES: See Below $ + * + * Modified for 386 by Jim Kingdon, Cygnus Support. + * Compatibility with 2.1.xx kernel by David Grothe + * + * Changes to allow auto initialization. All that is needed is that it + * be linked with the kernel and a break point (int 3) be executed. + * The header file defines BREAKPOINT to allow one to do + * this. It should also be possible, once the interrupt system is up, to + * call putDebugChar("+"). Once this is done, the remote debugger should + * get our attention by sending a ^C in a packet. George Anzinger + * + * Integrated into 2.2.5 kernel by Tigran Aivazian + * Added thread support, support for multiple processors, + * support for ia-32(x86) hardware debugging. + * Amit S. Kale ( akale@veritas.com ) + * + * Modified to support debugging over ethernet by Robert Walsh + * and wangdi , based on + * code by San Mehat. + * + * + * To enable debugger support, two things need to happen. 
One, a + * call to set_debug_traps() is necessary in order to allow any breakpoints + * or error conditions to be properly intercepted and reported to gdb. + * Two, a breakpoint needs to be generated to begin communication. This + * is most easily accomplished by a call to breakpoint(). Breakpoint() + * simulates a breakpoint by executing an int 3. + * + ************* + * + * The following gdb commands are supported: + * + * command function Return value + * + * g return the value of the CPU registers hex data or ENN + * G set the value of the CPU registers OK or ENN + * + * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN + * MAA..AA,LLLL: Write LLLL bytes at address AA..AA OK or ENN + * + * c Resume at current address SNN ( signal NN) + * cAA..AA Continue at address AA..AA SNN + * + * s Step one instruction SNN + * sAA..AA Step one instruction from AA..AA SNN + * + * k kill + * + * ? What was the last sigval ? SNN (signal NN) + * + * All commands and responses are sent with a packet which includes a + * checksum. A packet consists of + * + * $<packet info>#<checksum>. + * + * where + * <packet info> :: <characters representing the command or response> + * <checksum> :: < two hex digits computed as modulo 256 sum of <packet info> > + * + * When a packet is received, it is first acknowledged with either '+' or '-'. + * '+' indicates a successful transfer. '-' indicates a failed transfer. 
+ * + * Example: + * + * Host: Reply: + * $m0,10#2a +$00010203040506070809101112131415#42 + * + ****************************************************************************/ +#define KGDB_VERSION "<20030915.1651.33>" +#include +#include +#include /* for strcpy */ +#include +#include +#include +#include +#include /* for linux pt_regs struct */ +#include +#include +#include +#include +#include +#include +#include +#include + +/************************************************************************ + * + * external low-level support routines + */ +typedef void (*Function) (void); /* pointer to a function */ + +/* Thread reference */ +typedef unsigned char threadref[8]; + +extern int tty_putDebugChar(int); /* write a single character */ +extern int tty_getDebugChar(void); /* read and return a single char */ +extern void tty_flushDebugChar(void); /* flush pending characters */ +extern int eth_putDebugChar(int); /* write a single character */ +extern int eth_getDebugChar(void); /* read and return a single char */ +extern void eth_flushDebugChar(void); /* flush pending characters */ + +/************************************************************************/ +/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ +/* at least NUMREGBYTES*2 are needed for register packets */ +/* Longer buffer is needed to list all threads */ +#define BUFMAX 400 + +char *kgdb_version = KGDB_VERSION; + +/* debug > 0 prints ill-formed commands in valid packets & checksum errors */ +int debug_regs = 0; /* set to non-zero to print registers */ + +/* filled in by an external module */ +char *gdb_module_offsets; + +static const char hexchars[] = "0123456789abcdef"; + +/* Number of bytes of registers. */ +#define NUMREGBYTES 64 +/* + * Note that this register image is in a different order than + * the register image that Linux produces at interrupt time. + * + * Linux's register image is defined by struct pt_regs in ptrace.h. 
+ * Just why GDB uses a different order is a historical mystery. + */ +enum regnames { _EAX, /* 0 */ + _ECX, /* 1 */ + _EDX, /* 2 */ + _EBX, /* 3 */ + _ESP, /* 4 */ + _EBP, /* 5 */ + _ESI, /* 6 */ + _EDI, /* 7 */ + _PC /* 8 also known as eip */ , + _PS /* 9 also known as eflags */ , + _CS, /* 10 */ + _SS, /* 11 */ + _DS, /* 12 */ + _ES, /* 13 */ + _FS, /* 14 */ + _GS /* 15 */ +}; + +/*************************** ASSEMBLY CODE MACROS *************************/ +/* + * Put the error code here just in case the user cares. + * Likewise, the vector number here (since GDB only gets the signal + * number through the usual means, and that's not very specific). + * The called_from is the return address so he can tell how we entered kgdb. + * This will allow him to seperate out the various possible entries. + */ +#define REMOTE_DEBUG 0 /* set != to turn on printing (also available in info) */ + +#define PID_MAX PID_MAX_DEFAULT + +#ifdef CONFIG_SMP +void smp_send_nmi_allbutself(void); +#define IF_SMP(x) x +#undef MAX_NO_CPUS +#ifndef CONFIG_NO_KGDB_CPUS +#define CONFIG_NO_KGDB_CPUS 2 +#endif +#if CONFIG_NO_KGDB_CPUS > NR_CPUS +#define MAX_NO_CPUS NR_CPUS +#else +#define MAX_NO_CPUS CONFIG_NO_KGDB_CPUS +#endif +#define hold_init hold_on_sstep: 1, +#define MAX_CPU_MASK (unsigned long)((1LL << MAX_NO_CPUS) - 1LL) +#define NUM_CPUS num_online_cpus() +#else +#define IF_SMP(x) +#define hold_init +#undef MAX_NO_CPUS +#define MAX_NO_CPUS 1 +#define NUM_CPUS 1 +#endif +#define NOCPU (struct task_struct *)0xbad1fbad +/* *INDENT-OFF* */ +struct kgdb_info { + int used_malloc; + void *called_from; + long long entry_tsc; + int errcode; + int vector; + int print_debug_info; +#ifdef CONFIG_SMP + int hold_on_sstep; + struct { + volatile struct task_struct *task; + int pid; + int hold; + struct pt_regs *regs; + } cpus_waiting[MAX_NO_CPUS]; +#endif +} kgdb_info = {hold_init print_debug_info:REMOTE_DEBUG, vector:-1}; + +/* *INDENT-ON* */ + +#define used_m kgdb_info.used_malloc +/* + * This is 
little area we set aside to contain the stack we + * need to build to allow gdb to call functions. We use one + * per cpu to avoid locking issues. We will do all this work + * with interrupts off so that should take care of the protection + * issues. + */ +#define LOOKASIDE_SIZE 200 /* should be more than enough */ +#define MALLOC_MAX 200 /* Max malloc size */ +struct { + unsigned int esp; + int array[LOOKASIDE_SIZE]; +} fn_call_lookaside[MAX_NO_CPUS]; + +static int trap_cpu; +static unsigned int OLD_esp; + +#define END_OF_LOOKASIDE &fn_call_lookaside[trap_cpu].array[LOOKASIDE_SIZE] +#define IF_BIT 0x200 +#define TF_BIT 0x100 + +#define MALLOC_ROUND 8-1 + +static char malloc_array[MALLOC_MAX]; +IF_SMP(static void to_gdb(const char *mess)); +void * +malloc(int size) +{ + + if (size <= (MALLOC_MAX - used_m)) { + int old_used = used_m; + used_m += ((size + MALLOC_ROUND) & (~MALLOC_ROUND)); + return &malloc_array[old_used]; + } else { + return NULL; + } +} + +/* + * I/O dispatch functions... + * Based upon kgdboe, either call the ethernet + * handler or the serial one.. + */ +void +putDebugChar(int c) +{ + if (!kgdboe) { + tty_putDebugChar(c); + } else { + eth_putDebugChar(c); + } +} + +int +getDebugChar(void) +{ + if (!kgdboe) { + return tty_getDebugChar(); + } else { + return eth_getDebugChar(); + } +} + +void +flushDebugChar(void) +{ + if (!kgdboe) { + tty_flushDebugChar(); + } else { + eth_flushDebugChar(); + } +} + +/* + * Gdb calls functions by pushing agruments, including a return address + * on the stack and the adjusting EIP to point to the function. The + * whole assumption in GDB is that we are on a different stack than the + * one the "user" i.e. code that hit the break point, is on. This, of + * course is not true in the kernel. Thus various dodges are needed to + * do the call without directly messing with EIP (which we can not change + * as it is just a location and not a register. 
To adjust it would then + * require that we move everything below EIP up or down as needed. This + * will not work as we may well have stack-relative pointers on the stack + * (such as the pointer to regs, for example). + + * So here is what we do: + * We detect gdb attempting to store into the stack area and instead, store + * into the fn_call_lookaside.array at the same relative location as if it + * were the area ESP pointed at. We also trap ESP modifications + * and use these to adjust fn_call_lookaside.esp. On entry + * fn_call_lookaside.esp will be set to point at the last entry in + * fn_call_lookaside.array. This allows us to check if it has changed, and + * if so, on exit, we add the registers we will use to do the move and a + * trap/interrupt return exit sequence. We then adjust the eflags in the + * regs array (remember we now have a copy in the fn_call_lookaside.array) to + * kill the interrupt bit, AND we change EIP to point at our set up stub. + * As part of the register set up we preset the registers to point at the + * beginning and end of the fn_call_lookaside.array, so all the stub needs to + * do is move words from the array to the stack until ESP= the desired value + * then do the iret. This will then transfer to the desired function with + * all the correct registers. Nifty huh? 
+ */ +extern asmlinkage void fn_call_stub(void); +extern asmlinkage void fn_rtn_stub(void); +/* *INDENT-OFF* */ +__asm__("fn_rtn_stub:\n\t" + "movl %eax,%esp\n\t" + "fn_call_stub:\n\t" + "1:\n\t" + "addl $-4,%ebx\n\t" + "movl (%ebx), %eax\n\t" + "pushl %eax\n\t" + "cmpl %esp,%ecx\n\t" + "jne 1b\n\t" + "popl %eax\n\t" + "popl %ebx\n\t" + "popl %ecx\n\t" + "iret \n\t"); +/* *INDENT-ON* */ +#define gdb_i386vector kgdb_info.vector +#define gdb_i386errcode kgdb_info.errcode +#define waiting_cpus kgdb_info.cpus_waiting +#define remote_debug kgdb_info.print_debug_info +#define hold_cpu(cpu) kgdb_info.cpus_waiting[cpu].hold +/* gdb locks */ + +#ifdef CONFIG_SMP +static int in_kgdb_called; +static spinlock_t waitlocks[MAX_NO_CPUS] = + {[0 ... MAX_NO_CPUS - 1] = SPIN_LOCK_UNLOCKED }; +/* + * The following array has the thread pointer of each of the "other" + * cpus. We make it global so it can be seen by gdb. + */ +volatile int in_kgdb_entry_log[MAX_NO_CPUS]; +volatile struct pt_regs *in_kgdb_here_log[MAX_NO_CPUS]; +/* +static spinlock_t continuelocks[MAX_NO_CPUS]; +*/ +spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +/* waiters on our spinlock plus us */ +static atomic_t spinlock_waiters = ATOMIC_INIT(1); +static int spinlock_count = 0; +static int spinlock_cpu = 0; +/* + * Note we use nested spin locks to account for the case where a break + * point is encountered when calling a function by user direction from + * kgdb. Also there is the memory exception recursion to account for. + * Well, yes, but this lets other cpus thru too. Lets add a + * cpu id to the lock. + */ +#define KGDB_SPIN_LOCK(x) if( spinlock_count == 0 || \ + spinlock_cpu != smp_processor_id()){\ + atomic_inc(&spinlock_waiters); \ + while (! 
spin_trylock(x)) {\ + in_kgdb(®s);\ + }\ + atomic_dec(&spinlock_waiters); \ + spinlock_count = 1; \ + spinlock_cpu = smp_processor_id(); \ + }else{ \ + spinlock_count++; \ + } +#define KGDB_SPIN_UNLOCK(x) if( --spinlock_count == 0) spin_unlock(x) +#else +unsigned kgdb_spinlock = 0; +#define KGDB_SPIN_LOCK(x) --*x +#define KGDB_SPIN_UNLOCK(x) ++*x +#endif + +int +hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return (ch - 'a' + 10); + if ((ch >= '0') && (ch <= '9')) + return (ch - '0'); + if ((ch >= 'A') && (ch <= 'F')) + return (ch - 'A' + 10); + return (-1); +} + +/* scan for the sequence $# */ +void +getpacket(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int i; + int count; + char ch; + + do { + /* wait around for the start character, ignore all other characters */ + while ((ch = (getDebugChar() & 0x7f)) != '$') ; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* now, read until a # or end of buffer is found */ + while (count < BUFMAX) { + ch = getDebugChar() & 0x7f; + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(getDebugChar() & 0x7f) << 4; + xmitcsum += hex(getDebugChar() & 0x7f); + if ((remote_debug) && (checksum != xmitcsum)) { + printk + ("bad checksum. My count = 0x%x, sent=0x%x. buf=%s\n", + checksum, xmitcsum, buffer); + } + + if (checksum != xmitcsum) + putDebugChar('-'); /* failed checksum */ + else { + putDebugChar('+'); /* successful transfer */ + /* if a sequence char is present, reply the sequence ID */ + if (buffer[2] == ':') { + putDebugChar(buffer[0]); + putDebugChar(buffer[1]); + /* remove sequence chars from buffer */ + count = strlen(buffer); + for (i = 3; i <= count; i++) + buffer[i - 3] = buffer[i]; + } + } + } + } while (checksum != xmitcsum); + + if (remote_debug) + printk("R:%s\n", buffer); + flushDebugChar(); +} + +/* send the packet in buffer. 
*/ + +void +putpacket(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* $#. */ + + if (!kgdboe) { + do { + if (remote_debug) + printk("T:%s\n", buffer); + putDebugChar('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + putDebugChar(ch); + checksum += ch; + count += 1; + } + + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + flushDebugChar(); + + } while ((getDebugChar() & 0x7f) != '+'); + } else { + /* + * For udp, we can not transfer too much bytes once. + * We only transfer MAX_SEND_COUNT size bytes each time + */ + +#define MAX_SEND_COUNT 30 + + int send_count = 0, i = 0; + char send_buf[MAX_SEND_COUNT]; + + do { + if (remote_debug) + printk("T:%s\n", buffer); + putDebugChar('$'); + checksum = 0; + count = 0; + send_count = 0; + while ((ch = buffer[count])) { + if (send_count >= MAX_SEND_COUNT) { + for(i = 0; i < MAX_SEND_COUNT; i++) { + putDebugChar(send_buf[i]); + } + flushDebugChar(); + send_count = 0; + } else { + send_buf[send_count] = ch; + checksum += ch; + count ++; + send_count++; + } + } + for(i = 0; i < send_count; i++) + putDebugChar(send_buf[i]); + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + flushDebugChar(); + } while ((getDebugChar() & 0x7f) != '+'); + } +} + +static char remcomInBuffer[BUFMAX]; +static char remcomOutBuffer[BUFMAX]; +static short error; + +void +debug_error(char *format, char *parm) +{ + if (remote_debug) + printk(format, parm); +} + +static void +print_regs(struct pt_regs *regs) +{ + printk("EAX=%08lx ", regs->eax); + printk("EBX=%08lx ", regs->ebx); + printk("ECX=%08lx ", regs->ecx); + printk("EDX=%08lx ", regs->edx); + printk("\n"); + printk("ESI=%08lx ", regs->esi); + printk("EDI=%08lx ", regs->edi); + printk("EBP=%08lx ", regs->ebp); + printk("ESP=%08lx ", (long) ®s->esp); + printk("\n"); + printk(" DS=%08x ", regs->xds); + printk(" ES=%08x ", regs->xes); + printk(" 
SS=%08x ", __KERNEL_DS); + printk(" FL=%08lx ", regs->eflags); + printk("\n"); + printk(" CS=%08x ", regs->xcs); + printk(" IP=%08lx ", regs->eip); +#if 0 + printk(" FS=%08x ", regs->fs); + printk(" GS=%08x ", regs->gs); +#endif + printk("\n"); + +} /* print_regs */ + +#define NEW_esp fn_call_lookaside[trap_cpu].esp + +static void +regs_to_gdb_regs(int *gdb_regs, struct pt_regs *regs) +{ + gdb_regs[_EAX] = regs->eax; + gdb_regs[_EBX] = regs->ebx; + gdb_regs[_ECX] = regs->ecx; + gdb_regs[_EDX] = regs->edx; + gdb_regs[_ESI] = regs->esi; + gdb_regs[_EDI] = regs->edi; + gdb_regs[_EBP] = regs->ebp; + gdb_regs[_DS] = regs->xds; + gdb_regs[_ES] = regs->xes; + gdb_regs[_PS] = regs->eflags; + gdb_regs[_CS] = regs->xcs; + gdb_regs[_PC] = regs->eip; + /* Note, as we are a debugging the kernel, we will always + * trap in kernel code, this means no priviledge change, + * and so the pt_regs structure is not completely valid. In a non + * privilege change trap, only EFLAGS, CS and EIP are put on the stack, + * SS and ESP are not stacked, this means that the last 2 elements of + * pt_regs is not valid (they would normally refer to the user stack) + * also, using regs+1 is no good because you end up will a value that is + * 2 longs (8) too high. This used to cause stepping over functions + * to fail, so my fix is to use the address of regs->esp, which + * should point at the end of the stack frame. Note I have ignored + * completely exceptions that cause an error code to be stacked, such + * as double fault. Stuart Hughes, Zentropix. + * original code: gdb_regs[_ESP] = (int) (regs + 1) ; + + * this is now done on entry and moved to OLD_esp (as well as NEW_esp). 
+ */ + gdb_regs[_ESP] = NEW_esp; + gdb_regs[_SS] = __KERNEL_DS; + gdb_regs[_FS] = 0xFFFF; + gdb_regs[_GS] = 0xFFFF; +} /* regs_to_gdb_regs */ + +static void +gdb_regs_to_regs(int *gdb_regs, struct pt_regs *regs) +{ + regs->eax = gdb_regs[_EAX]; + regs->ebx = gdb_regs[_EBX]; + regs->ecx = gdb_regs[_ECX]; + regs->edx = gdb_regs[_EDX]; + regs->esi = gdb_regs[_ESI]; + regs->edi = gdb_regs[_EDI]; + regs->ebp = gdb_regs[_EBP]; + regs->xds = gdb_regs[_DS]; + regs->xes = gdb_regs[_ES]; + regs->eflags = gdb_regs[_PS]; + regs->xcs = gdb_regs[_CS]; + regs->eip = gdb_regs[_PC]; + NEW_esp = gdb_regs[_ESP]; /* keep the value */ +#if 0 /* can't change these */ + regs->esp = gdb_regs[_ESP]; + regs->xss = gdb_regs[_SS]; + regs->fs = gdb_regs[_FS]; + regs->gs = gdb_regs[_GS]; +#endif + +} /* gdb_regs_to_regs */ +extern void scheduling_functions_start_here(void); +extern void scheduling_functions_end_here(void); +#define first_sched ((unsigned long) scheduling_functions_start_here) +#define last_sched ((unsigned long) scheduling_functions_end_here) + +int thread_list = 0; + +void +get_gdb_regs(struct task_struct *p, struct pt_regs *regs, int *gdb_regs) +{ + unsigned long stack_page; + int count = 0; + IF_SMP(int i); + if (!p || p == current) { + regs_to_gdb_regs(gdb_regs, regs); + return; + } +#ifdef CONFIG_SMP + for (i = 0; i < MAX_NO_CPUS; i++) { + if (p == kgdb_info.cpus_waiting[i].task) { + regs_to_gdb_regs(gdb_regs, + kgdb_info.cpus_waiting[i].regs); + gdb_regs[_ESP] = + (int) &kgdb_info.cpus_waiting[i].regs->esp; + + return; + } + } +#endif + memset(gdb_regs, 0, NUMREGBYTES); + gdb_regs[_ESP] = p->thread.esp; + gdb_regs[_PC] = p->thread.eip; + gdb_regs[_EBP] = *(int *) gdb_regs[_ESP]; + gdb_regs[_EDI] = *(int *) (gdb_regs[_ESP] + 4); + gdb_regs[_ESI] = *(int *) (gdb_regs[_ESP] + 8); + +/* + * This code is to give a more informative notion of where a process + * is waiting. It is used only when the user asks for a thread info + * list. 
If he then switches to the thread, s/he will find the task + * is in schedule, but a back trace should show the same info we come + * up with. This code was shamelessly purloined from process.c. It was + * then enhanced to provide more registers than simply the program + * counter. + */ + + if (!thread_list) { + return; + } + + if (p->state == TASK_RUNNING) + return; + stack_page = (unsigned long) p->thread_info; + if (gdb_regs[_ESP] < stack_page || gdb_regs[_ESP] > + THREAD_SIZE - sizeof(long) + stack_page) + return; + /* include/asm-i386/system.h:switch_to() pushes ebp last. */ + do { + if (gdb_regs[_EBP] < stack_page || + gdb_regs[_EBP] > THREAD_SIZE - 2*sizeof(long) + stack_page) + return; + gdb_regs[_PC] = *(unsigned long *) (gdb_regs[_EBP] + 4); + gdb_regs[_ESP] = gdb_regs[_EBP] + 8; + gdb_regs[_EBP] = *(unsigned long *) gdb_regs[_EBP]; + if (gdb_regs[_PC] < first_sched || gdb_regs[_PC] >= last_sched) + return; + } while (count++ < 16); + return; +} + +/* Indicate to caller of mem2hex or hex2mem that there has been an + error. */ +static volatile int mem_err = 0; +static volatile int mem_err_expected = 0; +static volatile int mem_err_cnt = 0; +static int garbage_loc = -1; + +int +get_char(char *addr) +{ + return *addr; +} + +void +set_char(char *addr, int val, int may_fault) +{ + /* + * This code traps references to the area mapped to the kernel + * stack as given by the regs and, instead, stores to the + * fn_call_lookaside[cpu].array + */ + if (may_fault && + (unsigned int) addr < OLD_esp && + ((unsigned int) addr > (OLD_esp - (unsigned int) LOOKASIDE_SIZE))) { + addr = (char *) END_OF_LOOKASIDE - ((char *) OLD_esp - addr); + } + *addr = val; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* return a pointer to the last char put in buf (null) */ +/* If MAY_FAULT is non-zero, then we should set mem_err in response to + a fault; if zero treat a fault like any other fault in the stub. 
*/ +char * +mem2hex(char *mem, char *buf, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + /* printk("%lx = ", mem) ; */ + + ch = get_char(mem++); + + /* printk("%02x\n", ch & 0xFF) ; */ + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault fetching from addr %lx\n", + (long) (mem - 1)); + *buf = 0; /* truncate buffer */ + return (buf); + } + *buf++ = hexchars[ch >> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (may_fault) + mem_err_expected = 0; + return (buf); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return a pointer to the character AFTER the last byte written */ +/* NOTE: We use the may fault flag to also indicate if the write is to + * the registers (0) or "other" memory (!=0) + */ +char * +hex2mem(char *buf, char *mem, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + ch = hex(*buf++) << 4; + ch = ch + hex(*buf++); + set_char(mem++, ch, may_fault); + + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault storing to addr %lx\n", + (long) (mem - 1)); + return (mem); + } + } + if (may_fault) + mem_err_expected = 0; + return (mem); +} + +/**********************************************/ +/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ +/* RETURN NUMBER OF CHARS PROCESSED */ +/**********************************************/ +int +hexToInt(char **ptr, int *intValue) +{ + int numChars = 0; + int hexValue; + + *intValue = 0; + + while (**ptr) { + hexValue = hex(**ptr); + if (hexValue >= 0) { + *intValue = (*intValue << 4) | hexValue; + numChars++; + } else + break; + + (*ptr)++; + } + + return (numChars); +} + +#define stubhex(h) hex(h) +#ifdef old_thread_list + +static int +stub_unpack_int(char *buff, int fieldlength) +{ + int nibble; + int retval = 0; + + while 
(fieldlength) { + nibble = stubhex(*buff++); + retval |= nibble; + fieldlength--; + if (fieldlength) + retval = retval << 4; + } + return retval; +} +#endif +static char * +pack_hex_byte(char *pkt, int byte) +{ + *pkt++ = hexchars[(byte >> 4) & 0xf]; + *pkt++ = hexchars[(byte & 0xf)]; + return pkt; +} + +#define BUF_THREAD_ID_SIZE 16 + +static char * +pack_threadid(char *pkt, threadref * id) +{ + char *limit; + unsigned char *altid; + + altid = (unsigned char *) id; + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *altid++); + return pkt; +} + +#ifdef old_thread_list +static char * +unpack_byte(char *buf, int *value) +{ + *value = stub_unpack_int(buf, 2); + return buf + 2; +} + +static char * +unpack_threadid(char *inbuf, threadref * id) +{ + char *altref; + char *limit = inbuf + BUF_THREAD_ID_SIZE; + int x, y; + + altref = (char *) id; + + while (inbuf < limit) { + x = stubhex(*inbuf++); + y = stubhex(*inbuf++); + *altref++ = (x << 4) | y; + } + return inbuf; +} +#endif +void +int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; + } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} +int +int_to_hex_v(unsigned char * id, int value) +{ + unsigned char *start = id; + int shift; + int ch; + + for (shift = 28; shift >= 0; shift -= 4) { + if ((ch = (value >> shift) & 0xf) || (id != start)) { + *id = hexchars[ch]; + id++; + } + } + if (id == start) + *id++ = '0'; + return id - start; +} +#ifdef old_thread_list + +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + unsigned char *scan; + + scan = (char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} +#endif +static int +cmp_str(char *s1, char *s2, int count) +{ + while (count--) { + if (*s1++ != *s2++) + return 0; + } + return 1; 
+} + +#if 1 /* this is a hold over from 2.4 where O(1) was "sometimes" */ +extern struct task_struct *kgdb_get_idle(int cpu); +#define idle_task(cpu) kgdb_get_idle(cpu) +#else +#define idle_task(cpu) init_tasks[cpu] +#endif + +extern int kgdb_pid_init_done; + +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + if (pid >= PID_MAX && pid <= (PID_MAX + MAX_NO_CPUS)) { + + return idle_task(pid - PID_MAX); + } else { + /* + * find_task_by_pid is relatively safe all the time + * Other pid functions require lock downs which imply + * that we may be interrupting them (as we get here + * in the middle of most any lock down). + * Still we don't want to call until the table exists! + */ + if (kgdb_pid_init_done){ + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } + } + } + return NULL; +} +/* *INDENT-OFF* */ +struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned addr; +} breakinfo[4] = { {enabled:0}, + {enabled:0}, + {enabled:0}, + {enabled:0}}; +/* *INDENT-ON* */ +unsigned hw_breakpoint_status; +void +correct_hw_break(void) +{ + int breakno; + int correctit; + int breakbit; + unsigned dr7; + + asm volatile ("movl %%db7, %0\n":"=r" (dr7) + :); + /* *INDENT-OFF* */ + do { + unsigned addr0, addr1, addr2, addr3; + asm volatile ("movl %%db0, %0\n" + "movl %%db1, %1\n" + "movl %%db2, %2\n" + "movl %%db3, %3\n" + :"=r" (addr0), "=r"(addr1), + "=r"(addr2), "=r"(addr3) + :); + } while (0); + /* *INDENT-ON* */ + correctit = 0; + for (breakno = 0; breakno < 3; breakno++) { + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + dr7 |= (((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << 16) << + (breakno << 2); + switch (breakno) { + case 0: + asm volatile ("movl %0, %%dr0\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 1: + asm volatile ("movl %0, %%dr1\n"::"r" + 
(breakinfo[breakno].addr)); + break; + + case 2: + asm volatile ("movl %0, %%dr2\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 3: + asm volatile ("movl %0, %%dr3\n"::"r" + (breakinfo[breakno].addr)); + break; + } + } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + } + } + if (correctit) { + asm volatile ("movl %0, %%db7\n"::"r" (dr7)); + } +} + +int +remove_hw_break(unsigned breakno) +{ + if (!breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 0; + return 0; +} + +int +set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr) +{ + if (breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 1; + breakinfo[breakno].type = type; + breakinfo[breakno].len = len; + breakinfo[breakno].addr = addr; + return 0; +} + +#ifdef CONFIG_SMP +static int in_kgdb_console = 0; + +int +in_kgdb(struct pt_regs *regs) +{ + unsigned flags; + int cpu = smp_processor_id(); + in_kgdb_called = 1; + if (!spin_is_locked(&kgdb_spinlock)) { + if (in_kgdb_here_log[cpu] || /* we are holding this cpu */ + in_kgdb_console) { /* or we are doing slow i/o */ + return 1; + } + return 0; + } + + /* As I see it the only reason not to let all cpus spin on + * the same spin_lock is to allow selected ones to proceed. + * This would be a good thing, so we leave it this way. + * Maybe someday.... Done ! + + * in_kgdb() is called from an NMI so we don't pretend + * to have any resources, like printk() for example. + */ + + kgdb_local_irq_save(flags); /* only local here, to avoid hanging */ + /* + * log arival of this cpu + * The NMI keeps on ticking. 
Protect against recurring more
+	 * than once, and ignore the cpu that has the kgdb lock
+	 */
+	in_kgdb_entry_log[cpu]++;
+	in_kgdb_here_log[cpu] = regs;
+	if (cpu == spinlock_cpu || waiting_cpus[cpu].task)
+		goto exit_in_kgdb;
+
+	/*
+	 * For protection of the initialization of the spin locks by kgdb
+	 * it locks the kgdb spinlock before it gets the wait locks set
+	 * up. We wait here for the wait lock to be taken.  If the
+	 * kgdb lock goes away first??  Well, it could be a slow exit
+	 * sequence where the wait lock is removed prior to the kgdb lock
+	 * so if kgdb gets unlocked, we just exit.
+	 */
+
+	while (spin_is_locked(&kgdb_spinlock) &&
+	       !spin_is_locked(waitlocks + cpu)) ;
+	if (!spin_is_locked(&kgdb_spinlock))
+		goto exit_in_kgdb;
+
+	waiting_cpus[cpu].task = current;
+	waiting_cpus[cpu].pid = (current->pid) ? : (PID_MAX + cpu);
+	waiting_cpus[cpu].regs = regs;
+
+	spin_unlock_wait(waitlocks + cpu);
+
+	/*
+	 * log departure of this cpu
+	 */
+	waiting_cpus[cpu].task = 0;
+	waiting_cpus[cpu].pid = 0;
+	waiting_cpus[cpu].regs = 0;
+	correct_hw_break();
+      exit_in_kgdb:
+	in_kgdb_here_log[cpu] = 0;
+	kgdb_local_irq_restore(flags);
+	return 1;
+	/*
+	   spin_unlock(continuelocks + smp_processor_id());
+	 */
+}
+
+/*
+ * Interrupt entry: acknowledge the APIC interrupt, then hand the
+ * address of the by-value register frame to in_kgdb().
+ */
+void
+smp__in_kgdb(struct pt_regs regs)
+{
+	ack_APIC_irq();
+	in_kgdb(&regs);
+}
+#else
+/* Non-SMP build: just report the value of kgdb_spinlock. */
+int
+in_kgdb(struct pt_regs *regs)
+{
+	return (kgdb_spinlock);
+}
+#endif
+
+/*
+ * Write a short description of the current exception into buffer.
+ * Vector 3 gives "Software breakpoint"; any vector other than 1 or 3
+ * gives "Details not available".  For vector 1, DR6 is read: bit 0x4000
+ * gives "Single step", otherwise the lowest set bit among bits 0-3
+ * gives "Hardware breakpoint N", otherwise "Unknown trap".
+ * errorcode is not used.
+ */
+void
+printexceptioninfo(int exceptionNo, int errorcode, char *buffer)
+{
+	unsigned dr6;
+	int i;
+	switch (exceptionNo) {
+	case 1:		/* debug exception */
+		break;
+	case 3:		/* breakpoint */
+		sprintf(buffer, "Software breakpoint");
+		return;
+	default:
+		sprintf(buffer, "Details not available");
+		return;
+	}
+	asm volatile ("movl %%db6, %0\n":"=r" (dr6)
+		      :);
+	if (dr6 & 0x4000) {
+		sprintf(buffer, "Single step");
+		return;
+	}
+	for (i = 0; i < 4; ++i) {
+		if (dr6 & (1 << i)) {
+			sprintf(buffer, "Hardware breakpoint %d", i);
+			return;
+		}
+	}
+	sprintf(buffer, "Unknown trap");
+	return;
+}
+
+/*
+
* This function does all command procesing for interfacing to gdb. + * + * NOTE: The INT nn instruction leaves the state of the interrupt + * enable flag UNCHANGED. That means that when this routine + * is entered via a breakpoint (INT 3) instruction from code + * that has interrupts enabled, then interrupts will STILL BE + * enabled when this routine is entered. The first thing that + * we do here is disable interrupts so as to prevent recursive + * entries and bothersome serial interrupts while we are + * trying to run the serial port in polled mode. + * + * For kernel version 2.1.xx the kgdb_cli() actually gets a spin lock so + * it is always necessary to do a restore_flags before returning + * so as to let go of that lock. + */ +int +kgdb_handle_exception(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs) +{ + struct task_struct *usethread = NULL; + struct task_struct *thread_list_start = 0, *thread = NULL; + int addr, length; + int breakno, breaktype; + char *ptr; + int newPC; + threadref thref; + int threadid; + int thread_min = PID_MAX + MAX_NO_CPUS; +#ifdef old_thread_list + int maxthreads; +#endif + int nothreads; + unsigned long flags; + int gdb_regs[NUMREGBYTES / 4]; + int dr6; + IF_SMP(int entry_state = 0); /* 0, ok, 1, no nmi, 2 sync failed */ +#define NO_NMI 1 +#define NO_SYNC 2 +#define regs (*linux_regs) +#define NUMREGS NUMREGBYTES/4 + /* + * If the entry is not from the kernel then return to the Linux + * trap handler and let it process the interrupt normally. + */ + if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->xcs)) { + printk("ignoring non-kernel exception\n"); + print_regs(®s); + return (0); + } + /* + * If we're using eth mode, set the 'mode' in the netdevice. 
+ */ + + if (kgdboe) + netpoll_set_trap(1); + + kgdb_local_irq_save(flags); + + /* Get kgdb spinlock */ + + KGDB_SPIN_LOCK(&kgdb_spinlock); + rdtscll(kgdb_info.entry_tsc); + /* + * We depend on this spinlock and the NMI watch dog to control the + * other cpus. They will arrive at "in_kgdb()" as a result of the + * NMI and will wait there for the following spin locks to be + * released. + */ +#ifdef CONFIG_SMP + +#if 0 + if (cpu_callout_map & ~MAX_CPU_MASK) { + printk("kgdb : too many cpus, possibly not mapped" + " in contiguous space, change MAX_NO_CPUS" + " in kgdb_stub and make new kernel.\n" + " cpu_callout_map is %lx\n", cpu_callout_map); + goto exit_just_unlock; + } +#endif + if (spinlock_count == 1) { + int time = 0, end_time, dum = 0; + int i; + int cpu_logged_in[MAX_NO_CPUS] = {[0 ... MAX_NO_CPUS - 1] = (0) + }; + if (remote_debug) { + printk("kgdb : cpu %d entry, syncing others\n", + smp_processor_id()); + } + for (i = 0; i < MAX_NO_CPUS; i++) { + /* + * Use trylock as we may already hold the lock if + * we are holding the cpu. Net result is all + * locked. + */ + spin_trylock(&waitlocks[i]); + } + for (i = 0; i < MAX_NO_CPUS; i++) + cpu_logged_in[i] = 0; + /* + * Wait for their arrival. We know the watch dog is active if + * in_kgdb() has ever been called, as it is always called on a + * watchdog tick. + */ + rdtsc(dum, time); + end_time = time + 2; /* Note: we use the High order bits! 
*/ + i = 1; + if (num_online_cpus() > 1) { + int me_in_kgdb = in_kgdb_entry_log[smp_processor_id()]; + smp_send_nmi_allbutself(); + + while (i < num_online_cpus() && time != end_time) { + int j; + for (j = 0; j < MAX_NO_CPUS; j++) { + if (waiting_cpus[j].task && + waiting_cpus[j].task != NOCPU && + !cpu_logged_in[j]) { + i++; + cpu_logged_in[j] = 1; + if (remote_debug) { + printk + ("kgdb : cpu %d arrived at kgdb\n", + j); + } + break; + } else if (!waiting_cpus[j].task && + !cpu_online(j)) { + waiting_cpus[j].task = NOCPU; + cpu_logged_in[j] = 1; + waiting_cpus[j].hold = 1; + break; + } + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + + int wait = 100000; + while (wait--) ; + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + printk + ("kgdb : cpu %d stall" + " in in_kgdb\n", + j); + i++; + cpu_logged_in[j] = 1; + waiting_cpus[j].task = + (struct task_struct + *) 1; + } + } + } + + if (in_kgdb_entry_log[smp_processor_id()] > + (me_in_kgdb + 10)) { + break; + } + + rdtsc(dum, time); + } + if (i < num_online_cpus()) { + printk + ("kgdb : time out, proceeding without sync\n"); +#if 0 + printk("kgdb : Waiting_cpus: 0 = %d, 1 = %d\n", + waiting_cpus[0].task != 0, + waiting_cpus[1].task != 0); + printk("kgdb : Cpu_logged in: 0 = %d, 1 = %d\n", + cpu_logged_in[0], cpu_logged_in[1]); + printk + ("kgdb : in_kgdb_here_log in: 0 = %d, 1 = %d\n", + in_kgdb_here_log[0] != 0, + in_kgdb_here_log[1] != 0); +#endif + entry_state = NO_SYNC; + } else { +#if 0 + int ent = + in_kgdb_entry_log[smp_processor_id()] - + me_in_kgdb; + printk("kgdb : sync after %d entries\n", ent); +#endif + } + } else { + if (remote_debug) { + printk + ("kgdb : %d cpus, but watchdog not active\n" + "proceeding without locking down other cpus\n", + num_online_cpus()); + entry_state = NO_NMI; + } + } + } +#endif + + if (remote_debug) { + unsigned long *lp = (unsigned long *) &linux_regs; + + printk("handle_exception(exceptionVector=%d, " + "signo=%d, err_code=%d, linux_regs=%p)\n", + 
exceptionVector, signo, err_code, linux_regs); + if (debug_regs) { + print_regs(®s); + printk("Stk: %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[0], lp[1], lp[2], lp[3], + lp[4], lp[5], lp[6], lp[7]); + printk(" %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[8], lp[9], lp[10], lp[11], + lp[12], lp[13], lp[14], lp[15]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[16], lp[17], lp[18], lp[19], + lp[20], lp[21], lp[22], lp[23]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[24], lp[25], lp[26], lp[27], + lp[28], lp[29], lp[30], lp[31]); + } + } + + /* Disable hardware debugging while we are in kgdb */ + /* Get the debug register status register */ +/* *INDENT-OFF* */ + __asm__("movl %0,%%db7" + : /* no output */ + :"r"(0)); + + asm volatile ("movl %%db6, %0\n" + :"=r" (hw_breakpoint_status) + :); + +/* *INDENT-ON* */ + switch (exceptionVector) { + case 0: /* divide error */ + case 1: /* debug exception */ + case 2: /* NMI */ + case 3: /* breakpoint */ + case 4: /* overflow */ + case 5: /* bounds check */ + case 6: /* invalid opcode */ + case 7: /* device not available */ + case 8: /* double fault (errcode) */ + case 10: /* invalid TSS (errcode) */ + case 12: /* stack fault (errcode) */ + case 16: /* floating point error */ + case 17: /* alignment check (errcode) */ + default: /* any undocumented */ + break; + case 11: /* segment not present (errcode) */ + case 13: /* general protection (errcode) */ + case 14: /* page fault (special errcode) */ + case 19: /* cache flush denied */ + if (mem_err_expected) { + /* + * This fault occured because of the + * get_char or set_char routines. These + * two routines use either eax of edx to + * indirectly reference the location in + * memory that they are working with. + * For a page fault, when we return the + * instruction will be retried, so we + * have to make sure that these + * registers point to valid memory. 
+			 */
+			mem_err = 1;	/* set mem error flag */
+			mem_err_expected = 0;
+			mem_err_cnt++;	/* helps in debugging */
+			/* make valid address */
+			regs.eax = (long) &garbage_loc;
+			/* make valid address */
+			regs.edx = (long) &garbage_loc;
+			if (remote_debug)
+				printk("Return after memory error: "
+				       "mem_err_cnt=%d\n", mem_err_cnt);
+			if (debug_regs)
+				print_regs(&regs);
+			goto exit_kgdb;
+		}
+		break;
+	}
+	if (remote_debug)
+		printk("kgdb : entered kgdb on cpu %d\n", smp_processor_id());
+
+	gdb_i386vector = exceptionVector;
+	gdb_i386errcode = err_code;
+	kgdb_info.called_from = __builtin_return_address(0);
+#ifdef CONFIG_SMP
+	/*
+	 * OK, we can now communicate, lets tell gdb about the sync.
+	 * but only if we had a problem.
+	 */
+	switch (entry_state) {
+	case NO_NMI:
+		to_gdb("NMI not active, other cpus not stopped\n");
+		break;
+	case NO_SYNC:
+		to_gdb("Some cpus not stopped, see 'kgdb_info' for details\n");
+	default:;
+	}
+
+#endif
+/*
+ * Set up the gdb function call area.
+ */
+	trap_cpu = smp_processor_id();
+	OLD_esp = NEW_esp = (int) (&linux_regs->esp);
+
+      IF_SMP(once_again:)
+	/* reply to host that an exception has occurred */
+	remcomOutBuffer[0] = 'S';
+	remcomOutBuffer[1] = hexchars[signo >> 4];
+	remcomOutBuffer[2] = hexchars[signo % 16];
+	remcomOutBuffer[3] = 0;
+
+	putpacket(remcomOutBuffer);
+
+	while (1 == 1) {
+		error = 0;
+		remcomOutBuffer[0] = 0;
+		getpacket(remcomInBuffer);
+		switch (remcomInBuffer[0]) {
+		case '?':
+			remcomOutBuffer[0] = 'S';
+			remcomOutBuffer[1] = hexchars[signo >> 4];
+			remcomOutBuffer[2] = hexchars[signo % 16];
+			remcomOutBuffer[3] = 0;
+			break;
+		case 'd':
+			remote_debug = !(remote_debug);	/* toggle debug flag */
+			printk("Remote debug %s\n",
+			       remote_debug ? "on" : "off");
+			break;
+		case 'g':	/* return the value of the CPU registers */
+			get_gdb_regs(usethread, &regs, gdb_regs);
+			mem2hex((char *) gdb_regs,
+				remcomOutBuffer, NUMREGBYTES, 0);
+			break;
+		case 'G':	/* set the value of the CPU registers - return OK */
+			hex2mem(&remcomInBuffer[1],
+				(char *) gdb_regs, NUMREGBYTES, 0);
+			/* registers are only written back for the current thread */
+			if (!usethread || usethread == current) {
+				gdb_regs_to_regs(gdb_regs, &regs);
+				strcpy(remcomOutBuffer, "OK");
+			} else {
+				strcpy(remcomOutBuffer, "E00");
+			}
+			break;
+
+		case 'P':{	/* set the value of a single CPU register -
+				   return OK */
+				/*
+				 * For some reason, gdb wants to talk about pseudo
+				 * registers (greater than 15).  These may have
+				 * meaning for ptrace, but for us it is safe to
+				 * ignore them.  We do this by dumping them into
+				 * _GS which we also ignore, but do have memory for.
+				 */
+				int regno;
+
+				ptr = &remcomInBuffer[1];
+				regs_to_gdb_regs(gdb_regs, &regs);
+				if ((!usethread || usethread == current) &&
+				    hexToInt(&ptr, &regno) &&
+				    *ptr++ == '=' && (regno >= 0)) {
+					regno =
+					    (regno >= NUMREGS ? _GS : regno);
+					hex2mem(ptr, (char *) &gdb_regs[regno],
+						4, 0);
+					gdb_regs_to_regs(gdb_regs, &regs);
+					strcpy(remcomOutBuffer, "OK");
+					break;
+				}
+				strcpy(remcomOutBuffer, "E01");
+				break;
+			}
+
+			/* mAA..AA,LLLL  Read LLLL bytes at address AA..AA */
+		case 'm':
+			/* TRY TO READ %x,%x.  IF SUCCEED, SET PTR = 0 */
+			ptr = &remcomInBuffer[1];
+			if (hexToInt(&ptr, &addr) &&
+			    (*(ptr++) == ',') && (hexToInt(&ptr, &length))) {
+				ptr = 0;
+				/*
+				 * hex doubles the byte count
+				 */
+				if (length > (BUFMAX / 2))
+					length = BUFMAX / 2;
+				mem2hex((char *) addr,
+					remcomOutBuffer, length, 1);
+				if (mem_err) {
+					strcpy(remcomOutBuffer, "E03");
+					debug_error("memory fault\n", NULL);
+				}
+			}
+
+			if (ptr) {
+				strcpy(remcomOutBuffer, "E01");
+				debug_error
+				    ("malformed read memory command: %s\n",
+				     remcomInBuffer);
+			}
+			break;
+
+			/* MAA..AA,LLLL:
+			   Write LLLL bytes at address AA.AA return OK */
+		case 'M':
+			/* TRY TO READ '%x,%x:'.
IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr) && + (*(ptr++) == ',') && + (hexToInt(&ptr, &length)) && (*(ptr++) == ':')) { + hex2mem(ptr, (char *) addr, length, 1); + + if (mem_err) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } else { + strcpy(remcomOutBuffer, "OK"); + } + + ptr = 0; + } + if (ptr) { + strcpy(remcomOutBuffer, "E02"); + debug_error + ("malformed write memory command: %s\n", + remcomInBuffer); + } + break; + case 'S': + remcomInBuffer[0] = 's'; + case 'C': + /* Csig;AA..AA where ;AA..AA is optional + * continue with signal + * Since signals are meaning less to us, delete that + * part and then fall into the 'c' code. + */ + ptr = &remcomInBuffer[1]; + length = 2; + while (*ptr && *ptr != ';') { + length++; + ptr++; + } + if (*ptr) { + do { + ptr++; + *(ptr - length++) = *ptr; + } while (*ptr); + } else { + remcomInBuffer[1] = 0; + } + + /* cAA..AA Continue at address AA..AA(optional) */ + /* sAA..AA Step one instruction from AA..AA(optional) */ + /* D detach, reply OK and then continue */ + case 'c': + case 's': + case 'D': + + /* try to read optional parameter, + pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) { + if (remote_debug) + printk("Changing EIP to 0x%x\n", addr); + + regs.eip = addr; + } + + newPC = regs.eip; + + /* clear the trace bit */ + regs.eflags &= 0xfffffeff; + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') + regs.eflags |= 0x100; + + /* detach is a friendly version of continue. 
Note that + debugging is still enabled (e.g hit control C) + */ + if (remcomInBuffer[0] == 'D') { + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + } + + if (remote_debug) { + printk("Resuming execution\n"); + print_regs(®s); + } + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (!(dr6 & 0x4000)) { + for (breakno = 0; breakno < 4; ++breakno) { + if (dr6 & (1 << breakno) && + (breakinfo[breakno].type == 0)) { + /* Set restore flag */ + regs.eflags |= 0x10000; + break; + } + } + } + + if (kgdboe) + netpoll_set_trap(0); + + correct_hw_break(); + asm volatile ("movl %0, %%db6\n"::"r" (0)); + goto exit_kgdb; + + /* kill the program */ + case 'k': /* do nothing */ + break; + + /* query */ + case 'q': + nothreads = 0; + switch (remcomInBuffer[1]) { + case 'f': + threadid = 1; + thread_list = 2; + thread_list_start = (usethread ? : current); + case 's': + if (!cmp_str(&remcomInBuffer[2], + "ThreadInfo", 10)) + break; + + remcomOutBuffer[nothreads++] = 'm'; + for (; threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + nothreads += int_to_hex_v( + &remcomOutBuffer[ + nothreads], + threadid); + if (thread_min > threadid) + thread_min = threadid; + remcomOutBuffer[ + nothreads] = ','; + nothreads++; + if (nothreads > BUFMAX - 10) + break; + } + } + if (remcomOutBuffer[nothreads - 1] == 'm') { + remcomOutBuffer[nothreads - 1] = 'l'; + } else { + nothreads--; + } + remcomOutBuffer[nothreads] = 0; + break; + +#ifdef old_thread_list /* Old thread info request */ + case 'L': + /* List threads */ + thread_list = 2; + thread_list_start = (usethread ? 
: current);
+				unpack_byte(remcomInBuffer + 3, &maxthreads);
+				unpack_threadid(remcomInBuffer + 5, &thref);
+				do {
+					int buf_thread_limit =
+					    (BUFMAX - 22) / BUF_THREAD_ID_SIZE;
+					if (maxthreads > buf_thread_limit) {
+						maxthreads = buf_thread_limit;
+					}
+				} while (0);
+				remcomOutBuffer[0] = 'q';
+				remcomOutBuffer[1] = 'M';
+				remcomOutBuffer[4] = '0';
+				pack_threadid(remcomOutBuffer + 5, &thref);
+
+				threadid = threadref_to_int(&thref);
+				for (nothreads = 0;
+				     nothreads < maxthreads &&
+				     threadid < PID_MAX + MAX_NO_CPUS;
+				     threadid++) {
+					thread = getthread(threadid);
+					if (thread) {
+						int_to_threadref(&thref,
+								 threadid);
+						pack_threadid(remcomOutBuffer +
+							      21 +
+							      nothreads * 16,
+							      &thref);
+						nothreads++;
+						if (thread_min > threadid)
+							thread_min = threadid;
+					}
+				}
+
+				if (threadid == PID_MAX + MAX_NO_CPUS) {
+					remcomOutBuffer[4] = '1';
+				}
+				pack_hex_byte(remcomOutBuffer + 2, nothreads);
+				remcomOutBuffer[21 + nothreads * 16] = '\0';
+				break;
+#endif
+			case 'C':
+				/* Current thread id */
+				remcomOutBuffer[0] = 'Q';
+				remcomOutBuffer[1] = 'C';
+				threadid = current->pid;
+				if (!threadid) {
+					/*
+					 * idle thread
+					 */
+					for (threadid = PID_MAX;
+					     threadid < PID_MAX + MAX_NO_CPUS;
+					     threadid++) {
+						if (current ==
+						    idle_task(threadid -
+							      PID_MAX))
+							break;
+					}
+				}
+				int_to_threadref(&thref, threadid);
+				pack_threadid(remcomOutBuffer + 2, &thref);
+				remcomOutBuffer[18] = '\0';
+				break;
+
+			case 'E':
+				/* Print exception info */
+				printexceptioninfo(exceptionVector,
+						   err_code, remcomOutBuffer);
+				break;
+			case 'T':{
+				char * nptr;
+				/* Thread extra info */
+				if (!cmp_str(&remcomInBuffer[2],
+					     "hreadExtraInfo,", 15)) {
+					break;
+				}
+				ptr = &remcomInBuffer[17];
+				hexToInt(&ptr, &threadid);
+				thread = getthread(threadid);
+				if (!thread) {
+					/*
+					 * getthread() returns NULL for an
+					 * unknown id; reply with an error as
+					 * the 'Hg' case does instead of
+					 * dereferencing thread->comm.
+					 */
+					remcomOutBuffer[0] = 'E';
+					remcomOutBuffer[1] = '\0';
+					break;
+				}
+				nptr = &thread->comm[0];
+				length = 0;
+				ptr = &remcomOutBuffer[0];
+				do {
+					length++;
+					ptr = pack_hex_byte(ptr, *nptr++);
+				} while (*nptr && length < 16);
+				/*
+				 * would like that 16 to be the size of
+				 * task_struct.comm but don't know the
+				 * syntax..
+				 */
+				*ptr = 0;
+			}
+			}
+			break;
+
+			/* task related */
+		case 'H':
+			switch (remcomInBuffer[1]) {
+			case 'g':
+				ptr = &remcomInBuffer[2];
+				hexToInt(&ptr, &threadid);
+				thread = getthread(threadid);
+				if (!thread) {
+					remcomOutBuffer[0] = 'E';
+					remcomOutBuffer[1] = '\0';
+					break;
+				}
+				/*
+				 * Just in case I forget what this is all about,
+				 * the "thread info" command to gdb causes it
+				 * to ask for a thread list.  It then switches
+				 * to each thread and asks for the registers.
+				 * For this (and only this) usage, we want to
+				 * fudge the registers of tasks not on the run
+				 * list (i.e. waiting) to show the routine that
+				 * called schedule. Also, gdb, is a minimalist
+				 * in that if the current thread is the last
+				 * it will not re-read the info when done.
+				 * This means that in this case we must show
+				 * the real registers. So here is how we do it:
+				 * Each entry we keep track of the min
+				 * thread in the list (the last that gdb will)
+				 * get info for.  We also keep track of the
+				 * starting thread.
+				 * "thread_list" is cleared when switching back
+				 * to the min thread if it is was current, or
+				 * if it was not current, thread_list is set
+				 * to 1.  When the switch to current comes,
+				 * if thread_list is 1, clear it, else do
+				 * nothing.
+ */ + usethread = thread; + if ((thread_list == 1) && + (thread == thread_list_start)) { + thread_list = 0; + } + if (thread_list && (threadid == thread_min)) { + if (thread == thread_list_start) { + thread_list = 0; + } else { + thread_list = 1; + } + } + /* follow through */ + case 'c': + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + break; + } + break; + + /* Query thread status */ + case 'T': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (thread) { + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + if (thread_min > threadid) + thread_min = threadid; + } else { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + } + break; + + case 'Y': /* set up a hardware breakpoint */ + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + ptr++; + hexToInt(&ptr, &breaktype); + ptr++; + hexToInt(&ptr, &length); + ptr++; + hexToInt(&ptr, &addr); + if (set_hw_break(breakno & 0x3, + breaktype & 0x3, + length & 0x3, addr) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + /* Remove hardware breakpoint */ + case 'y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + if (remove_hw_break(breakno & 0x3) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + case 'r': /* reboot */ + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + /*to_gdb("Rebooting\n"); */ + /* triplefault no return from here */ + { + static long no_idt[2]; + __asm__ __volatile__("lidt %0"::"m"(no_idt[0])); + BREAKPOINT; + } + + } /* switch */ + + /* reply to the request */ + putpacket(remcomOutBuffer); + } /* while(1==1) */ + /* + * reached by goto only. + */ + exit_kgdb: + /* + * Here is where we set up to trap a gdb function call. NEW_esp + * will be changed if we are trying to do this. 
We handle both + * adding and subtracting, thus allowing gdb to put grung on + * the stack which it removes later. + */ + if (NEW_esp != OLD_esp) { + int *ptr = END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) + ptr -= (OLD_esp - NEW_esp) / sizeof (int); + *--ptr = linux_regs->eflags; + *--ptr = linux_regs->xcs; + *--ptr = linux_regs->eip; + *--ptr = linux_regs->ecx; + *--ptr = linux_regs->ebx; + *--ptr = linux_regs->eax; + linux_regs->ecx = NEW_esp - (sizeof (int) * 6); + linux_regs->ebx = (unsigned int) END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) { + linux_regs->eip = (unsigned int) fn_call_stub; + } else { + linux_regs->eip = (unsigned int) fn_rtn_stub; + linux_regs->eax = NEW_esp; + } + linux_regs->eflags &= ~(IF_BIT | TF_BIT); + } +#ifdef CONFIG_SMP + /* + * Release gdb wait locks + * Sanity check time. Must have at least one cpu to run. Also single + * step must not be done if the current cpu is on hold. + */ + if (spinlock_count == 1) { + int ss_hold = (regs.eflags & 0x100) && kgdb_info.hold_on_sstep; + int cpu_avail = 0; + int i; + + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!cpu_online(i)) + break; + if (!hold_cpu(i)) { + cpu_avail = 1; + } + } + /* + * Early in the bring up there will be NO cpus on line... + */ + if (!cpu_avail && !cpus_empty(cpu_online_map)) { + to_gdb("No cpus unblocked, see 'kgdb_info.hold_cpu'\n"); + goto once_again; + } + if (hold_cpu(smp_processor_id()) && (regs.eflags & 0x100)) { + to_gdb + ("Current cpu must be unblocked to single step\n"); + goto once_again; + } + if (!(ss_hold)) { + int i; + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!hold_cpu(i)) { + spin_unlock(&waitlocks[i]); + } + } + } else { + spin_unlock(&waitlocks[smp_processor_id()]); + } + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + /* + * If this cpu is on hold, this is where we + * do it. Note, the NMI will pull us out of here, + * but will return as the above lock is not held. + * We will stay here till another cpu releases the lock for us. 
+ */ + spin_unlock_wait(waitlocks + smp_processor_id()); + kgdb_local_irq_restore(flags); + return (0); + } +#if 0 +exit_just_unlock: +#endif +#endif + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + kgdb_local_irq_restore(flags); + return (0); +} + +/* this function is used to set up exception handlers for tracing and + * breakpoints. + * This function is not needed as the above line does all that is needed. + * We leave it for backward compatitability... + */ +void +set_debug_traps(void) +{ + /* + * linux_debug_hook is defined in traps.c. We store a pointer + * to our own exception handler into it. + + * But really folks, every hear of labeled common, an old Fortran + * concept. Lots of folks can reference it and it is define if + * anyone does. Only one can initialize it at link time. We do + * this with the hook. See the statement above. No need for any + * executable code and it is ready as soon as the kernel is + * loaded. Very desirable in kernel debugging. + + linux_debug_hook = handle_exception ; + */ + + /* In case GDB is started before us, ack any packets (presumably + "$?#xx") sitting there. + putDebugChar ('+'); + + initialized = 1; + */ +} + +/* This function will generate a breakpoint exception. It is used at the + beginning of a program to sync up with a debugger and can be used + otherwise as a quick means to stop program execution and "break" into + the debugger. */ +/* But really, just use the BREAKPOINT macro. We will handle the int stuff + */ + +#ifdef later +/* + * possibly we should not go thru the traps.c code at all? Someday. 
+ */
+/* Enter the kgdb handler for an int3: exception vector 3, signal 5. */
+void
+do_kgdb_int3(struct pt_regs *regs, long error_code)
+{
+	kgdb_handle_exception(3, 5, error_code, regs);
+	return;
+}
+#endif
+#undef regs
+#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS
+/*
+ * Print the syscall number (orig_eax) and the current preempt_count().
+ * The register frame is taken to start at the address of the first
+ * argument.
+ */
+asmlinkage void
+bad_sys_call_exit(int stuff)
+{
+	struct pt_regs *regs = (struct pt_regs *) &stuff;
+	printk("Sys call %d return with %x preempt_count\n",
+	       (int) regs->orig_eax, preempt_count());
+}
+#endif
+#ifdef CONFIG_STACK_OVERFLOW_TEST
+#include
+/* Break into the debugger if BREAKPOINT exists, else log; never returns. */
+asmlinkage void
+stack_overflow(void)
+{
+#ifdef BREAKPOINT
+	BREAKPOINT;
+#else
+	printk("Kernel stack overflow, looping forever\n");
+#endif
+	while (1) {
+	}
+}
+#endif
+
+#if defined(CONFIG_SMP) || defined(CONFIG_KGDB_CONSOLE)
+char gdbconbuf[BUFMAX];
+
+/*
+ * Send the count bytes at s to gdb as one or more packets.
+ * Each packet is 'O' followed by two hex characters per source byte,
+ * so at most (BUFMAX - 2) / 2 source bytes go into one packet; longer
+ * messages are split across several packets.
+ */
+static void
+kgdb_gdb_message(const char *s, unsigned count)
+{
+	int i;
+	int wcount;
+	char *bufptr;
+	/*
+	 * This takes care of NMI while spinning out chars to gdb
+	 */
+	IF_SMP(in_kgdb_console = 1);
+	gdbconbuf[0] = 'O';
+	while (count > 0) {
+		/*
+		 * Restart right after the 'O' for every packet; otherwise
+		 * the second chunk would be appended to the first and run
+		 * past the end of gdbconbuf.
+		 */
+		bufptr = gdbconbuf + 1;
+		if ((count << 1) > (BUFMAX - 2)) {
+			wcount = (BUFMAX - 2) >> 1;
+		} else {
+			wcount = count;
+		}
+		count -= wcount;
+		for (i = 0; i < wcount; i++) {
+			bufptr = pack_hex_byte(bufptr, s[i]);
+		}
+		*bufptr = '\0';
+		s += wcount;
+
+		putpacket(gdbconbuf);
+
+	}
+	IF_SMP(in_kgdb_console = 0);
+}
+#endif
+#ifdef CONFIG_SMP
+/*
+ * Send the NUL-terminated string s to gdb; the length scan stops once
+ * count has passed BUFMAX.
+ */
+static void
+to_gdb(const char *s)
+{
+	int count = 0;
+	while (s[count] && (count++ < BUFMAX)) ;
+	kgdb_gdb_message(s, count);
+}
+#endif
+#ifdef CONFIG_KGDB_CONSOLE
+#include
+#include
+#include
+#include
+#include
+
+void
+kgdb_console_write(struct console *co, const char *s, unsigned count)
+{
+
+	if (gdb_i386vector == -1) {
+		/*
+		 * We have not yet talked to gdb.  What to do...
+		 * lets break, on continue we can do the write.
+		 * But first tell him whats up. Uh, well no can do,
+		 * as this IS the console.  Oh well...
+		 * We do need to wait or the messages will be lost.
+ * Other option would be to tell the above code to + * ignore this breakpoint and do an auto return, + * but that might confuse gdb. Also this happens + * early enough in boot up that we don't have the traps + * set up yet, so... + */ + breakpoint(); + } + kgdb_gdb_message(s, count); +} + +/* + * ------------------------------------------------------------ + * Serial KGDB driver + * ------------------------------------------------------------ + */ + +static struct console kgdbcons = { + name:"kgdb", + write:kgdb_console_write, +#ifdef CONFIG_KGDB_USER_CONSOLE + device:kgdb_console_device, +#endif + flags:CON_PRINTBUFFER | CON_ENABLED, + index:-1, +}; + +/* + * The trick here is that this file gets linked before printk.o + * That means we get to peer at the console info in the command + * line before it does. If we are up, we register, otherwise, + * do nothing. By returning 0, we allow printk to look also. + */ +static int kgdb_console_enabled; + +int __init +kgdb_console_init(char *str) +{ + if ((strncmp(str, "kgdb", 4) == 0) || (strncmp(str, "gdb", 3) == 0)) { + register_console(&kgdbcons); + kgdb_console_enabled = 1; + } + return 0; /* let others look at the string */ +} + +__setup("console=", kgdb_console_init); + +#ifdef CONFIG_KGDB_USER_CONSOLE +static kdev_t kgdb_console_device(struct console *c); +/* This stuff sort of works, but it knocks out telnet devices + * we are leaving it here in case we (or you) find time to figure it out + * better.. + */ + +/* + * We need a real char device as well for when the console is opened for user + * space activities. + */ + +static int +kgdb_consdev_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static ssize_t +kgdb_consdev_write(struct file *file, const char *buf, + size_t count, loff_t * ppos) +{ + int size, ret = 0; + static char kbuf[128]; + static DECLARE_MUTEX(sem); + + /* We are not reentrant... 
*/ + if (down_interruptible(&sem)) + return -ERESTARTSYS; + + while (count > 0) { + /* need to copy the data from user space */ + size = count; + if (size > sizeof (kbuf)) + size = sizeof (kbuf); + if (copy_from_user(kbuf, buf, size)) { + ret = -EFAULT; + break;; + } + kgdb_console_write(&kgdbcons, kbuf, size); + count -= size; + ret += size; + buf += size; + } + + up(&sem); + + return ret; +} + +struct file_operations kgdb_consdev_fops = { + open:kgdb_consdev_open, + write:kgdb_consdev_write +}; +static kdev_t +kgdb_console_device(struct console *c) +{ + return MKDEV(TTYAUX_MAJOR, 1); +} + +/* + * This routine gets called from the serial stub in the i386/lib + * This is so it is done late in bring up (just before the console open). + */ +void +kgdb_console_finit(void) +{ + if (kgdb_console_enabled) { + char *cptr = cdevname(MKDEV(TTYAUX_MAJOR, 1)); + char *cp = cptr; + while (*cptr && *cptr != '(') + cptr++; + *cptr = 0; + unregister_chrdev(TTYAUX_MAJOR, cp); + register_chrdev(TTYAUX_MAJOR, "kgdb", &kgdb_consdev_fops); + } +} +#endif +#endif +#ifdef CONFIG_KGDB_TS +#include /* time stamp code */ +#include /* in_interrupt */ +#ifdef CONFIG_KGDB_TS_64 +#define DATA_POINTS 64 +#endif +#ifdef CONFIG_KGDB_TS_128 +#define DATA_POINTS 128 +#endif +#ifdef CONFIG_KGDB_TS_256 +#define DATA_POINTS 256 +#endif +#ifdef CONFIG_KGDB_TS_512 +#define DATA_POINTS 512 +#endif +#ifdef CONFIG_KGDB_TS_1024 +#define DATA_POINTS 1024 +#endif +#ifndef DATA_POINTS +#define DATA_POINTS 128 /* must be a power of two */ +#endif +#define INDEX_MASK (DATA_POINTS - 1) +#if (INDEX_MASK & DATA_POINTS) +#error "CONFIG_KGDB_TS_COUNT must be a power of 2" +#endif +struct kgdb_and_then_struct { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + int data0; + int data1; +}; +struct kgdb_and_then_struct2 { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int 
from_ln; + char *in_src; + void *from; + int *with_shpf; + struct task_struct *t1; + struct task_struct *t2; +}; +struct kgdb_and_then_struct kgdb_data[DATA_POINTS]; + +struct kgdb_and_then_struct *kgdb_and_then = &kgdb_data[0]; +int kgdb_and_then_count; + +void +kgdb_tstamp(int line, char *source, int data0, int data1) +{ + static spinlock_t ts_spin = SPIN_LOCK_UNLOCKED; + int flags; + kgdb_local_irq_save(flags); + spin_lock(&ts_spin); + rdtscll(kgdb_and_then->at_time); +#ifdef CONFIG_SMP + kgdb_and_then->on_cpu = smp_processor_id(); +#endif + kgdb_and_then->task = current; + kgdb_and_then->from_ln = line; + kgdb_and_then->in_src = source; + kgdb_and_then->from = __builtin_return_address(0); + kgdb_and_then->with_shpf = (int *) (((flags & IF_BIT) >> 9) | + (preempt_count() << 8)); + kgdb_and_then->data0 = data0; + kgdb_and_then->data1 = data1; + kgdb_and_then = &kgdb_data[++kgdb_and_then_count & INDEX_MASK]; + spin_unlock(&ts_spin); + kgdb_local_irq_restore(flags); +#ifdef CONFIG_PREEMPT + +#endif + return; +} +#endif +typedef int gdb_debug_hook(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs); +gdb_debug_hook *linux_debug_hook = &kgdb_handle_exception; /* histerical reasons... 
*/ + +static int kgdb_need_breakpoint[NR_CPUS]; + +void kgdb_schedule_breakpoint(void) +{ + kgdb_need_breakpoint[smp_processor_id()] = 1; +} + +void kgdb_process_breakpoint(void) +{ + /* + * Handle a breakpoint queued from inside network driver code + * to avoid reentrancy issues + */ + if (kgdb_need_breakpoint[smp_processor_id()]) { + kgdb_need_breakpoint[smp_processor_id()] = 0; + BREAKPOINT; + } +} + --- linux-2.6.4-rc1/arch/i386/kernel/ldt.c 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/kernel/ldt.c 2004-02-29 13:09:29.000000000 -0800 @@ -2,7 +2,7 @@ * linux/kernel/ldt.c * * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar + * Copyright (C) 1999, 2003 Ingo Molnar */ #include @@ -18,6 +18,8 @@ #include #include #include +#include +#include #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ static void flush_ldt(void *null) @@ -29,34 +31,31 @@ static void flush_ldt(void *null) static int alloc_ldt(mm_context_t *pc, int mincount, int reload) { - void *oldldt; - void *newldt; - int oldsize; + int oldsize, newsize, i; if (mincount <= pc->size) return 0; + /* + * LDT got larger - reallocate if necessary. 
+ */ oldsize = pc->size; mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - pc->ldt = newldt; - wmb(); + newsize = mincount*LDT_ENTRY_SIZE; + for (i = 0; i < newsize; i += PAGE_SIZE) { + int nr = i/PAGE_SIZE; + BUG_ON(i >= 64*1024); + if (!pc->ldt_pages[nr]) { + pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER); + if (!pc->ldt_pages[nr]) + return -ENOMEM; + clear_highpage(pc->ldt_pages[nr]); + } + } pc->size = mincount; - wmb(); - if (reload) { #ifdef CONFIG_SMP cpumask_t mask; + preempt_disable(); load_LDT(pc); mask = cpumask_of_cpu(smp_processor_id()); @@ -67,21 +66,20 @@ static int alloc_ldt(mm_context_t *pc, i load_LDT(pc); #endif } - if (oldsize) { - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } return 0; } static inline int copy_ldt(mm_context_t *new, mm_context_t *old) { - int err = alloc_ldt(new, old->size, 0); - if (err < 0) + int i, err, size = old->size, nr_pages = (size*LDT_ENTRY_SIZE + PAGE_SIZE-1)/PAGE_SIZE; + + err = alloc_ldt(new, size, 0); + if (err < 0) { + new->size = 0; return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + } + for (i = 0; i < nr_pages; i++) + copy_user_highpage(new->ldt_pages[i], old->ldt_pages[i], 0); return 0; } @@ -96,6 +94,7 @@ int init_new_context(struct task_struct init_MUTEX(&mm->context.sem); mm->context.size = 0; + memset(mm->context.ldt_pages, 0, sizeof(struct page *) * MAX_LDT_PAGES); old_mm = current->mm; if (old_mm && old_mm->context.size > 0) { down(&old_mm->context.sem); @@ -107,23 +106,21 @@ int init_new_context(struct task_struct /* * No need to lock the MM as we are the last user + * Do not touch the ldt register, we are already 
+ * in the next thread. */ void destroy_context(struct mm_struct *mm) { - if (mm->context.size) { - if (mm == current->active_mm) - clear_LDT(); - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } + int i, nr_pages = (mm->context.size*LDT_ENTRY_SIZE + PAGE_SIZE-1) / PAGE_SIZE; + + for (i = 0; i < nr_pages; i++) + __free_page(mm->context.ldt_pages[i]); + mm->context.size = 0; } static int read_ldt(void __user * ptr, unsigned long bytecount) { - int err; + int err, i; unsigned long size; struct mm_struct * mm = current->mm; @@ -138,8 +135,25 @@ static int read_ldt(void __user * ptr, u size = bytecount; err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; + /* + * This is necessary just in case we got here straight from a + * context-switch where the ptes were set but no tlb flush + * was done yet. We rather avoid doing a TLB flush in the + * context-switch path and do it here instead. + */ + __flush_tlb_global(); + + for (i = 0; i < size; i += PAGE_SIZE) { + int nr = i / PAGE_SIZE, bytes; + char *kaddr = kmap(mm->context.ldt_pages[nr]); + + bytes = size - i; + if (bytes > PAGE_SIZE) + bytes = PAGE_SIZE; + if (copy_to_user(ptr + i, kaddr, bytes)) + err = -EFAULT; + kunmap(mm->context.ldt_pages[nr]); + } up(&mm->context.sem); if (err < 0) return err; @@ -158,7 +172,7 @@ static int read_default_ldt(void __user err = 0; address = &default_ldt[0]; - size = 5*sizeof(struct desc_struct); + size = 5*LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; @@ -200,7 +214,15 @@ static int write_ldt(void __user * ptr, goto out_unlock; } - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); + /* + * No rescheduling allowed from this point to the install. + * + * We do a TLB flush for the same reason as in the read_ldt() path. 
+ */ + preempt_disable(); + __flush_tlb_global(); + lp = (__u32 *) ((ldt_info.entry_number << 3) + + (char *) __kmap_atomic_vaddr(KM_LDT_PAGE0)); /* Allow LDTs to be cleared by the user. */ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { @@ -221,6 +243,7 @@ install: *lp = entry_1; *(lp+1) = entry_2; error = 0; + preempt_enable(); out_unlock: up(&mm->context.sem); @@ -248,3 +271,26 @@ asmlinkage int sys_modify_ldt(int func, } return ret; } + +/* + * load one particular LDT into the current CPU + */ +void load_LDT_nolock(mm_context_t *pc, int cpu) +{ + struct page **pages = pc->ldt_pages; + int count = pc->size; + int nr_pages, i; + + if (likely(!count)) { + pages = &default_ldt_page; + count = 5; + } + nr_pages = (count*LDT_ENTRY_SIZE + PAGE_SIZE-1) / PAGE_SIZE; + + for (i = 0; i < nr_pages; i++) { + __kunmap_atomic_type(KM_LDT_PAGE0 - i); + __kmap_atomic(pages[i], KM_LDT_PAGE0 - i); + } + set_ldt_desc(cpu, (void *)__kmap_atomic_vaddr(KM_LDT_PAGE0), count); + load_LDT_desc(); +} --- linux-2.6.4-rc1/arch/i386/kernel/Makefile 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/kernel/Makefile 2004-02-29 13:09:29.000000000 -0800 @@ -7,19 +7,19 @@ extra-y := head.o init_task.o vmlinux.ld obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ - doublefault.o + doublefault.o entry_trampoline.o obj-y += cpu/ obj-y += timers/ obj-$(CONFIG_ACPI_BOOT) += acpi/ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca.o +obj-$(CONFIG_KGDB) += kgdb_stub.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_APM) += apm.o -obj-$(CONFIG_X86_SMP) += smp.o smpboot.o -obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-$(CONFIG_X86_SMP) += smp.o smpboot.o trampoline.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) 
+= io_apic.o --- linux-2.6.4-rc1/arch/i386/kernel/mpparse.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/i386/kernel/mpparse.c 2004-02-29 13:09:29.000000000 -0800 @@ -668,7 +668,7 @@ void __init get_smp_config (void) * Read the physical hardware table. Anything here will * override the defaults. */ - if (!smp_read_mpc((void *)mpf->mpf_physptr)) { + if (!smp_read_mpc((void *)phys_to_virt(mpf->mpf_physptr))) { smp_found_config = 0; printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); @@ -1156,7 +1156,7 @@ void __init mp_parse_prt (void) continue; } if ((1<irq = acpi_irq_to_vector(irq); continue; --- linux-2.6.4-rc1/arch/i386/kernel/nmi.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/kernel/nmi.c 2004-02-29 13:07:58.000000000 -0800 @@ -31,7 +31,16 @@ #include #include +#ifdef CONFIG_KGDB +#include +#ifdef CONFIG_SMP +unsigned int nmi_watchdog = NMI_IO_APIC; +#else +unsigned int nmi_watchdog = NMI_LOCAL_APIC; +#endif +#else unsigned int nmi_watchdog = NMI_NONE; +#endif static unsigned int nmi_hz = HZ; unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ extern void show_registers(struct pt_regs *regs); @@ -248,7 +257,7 @@ static int __init init_lapic_nmi_sysfs(v error = sysdev_class_register(&nmi_sysclass); if (!error) - error = sys_device_register(&device_lapic_nmi); + error = sysdev_register(&device_lapic_nmi); return error; } /* must come after the local APIC's device_initcall() */ @@ -408,6 +417,9 @@ void touch_nmi_watchdog (void) for (i = 0; i < NR_CPUS; i++) alert_counter[i] = 0; } +#ifdef CONFIG_KGDB +int tune_watchdog = 5*HZ; +#endif void nmi_watchdog_tick (struct pt_regs * regs) { @@ -421,12 +433,24 @@ void nmi_watchdog_tick (struct pt_regs * sum = irq_stat[cpu].apic_timer_irqs; +#ifdef CONFIG_KGDB + if (! in_kgdb(regs) && last_irq_sums[cpu] == sum ) { + +#else if (last_irq_sums[cpu] == sum) { +#endif /* * Ayiee, looks like this CPU is stuck ... 
* wait a few IRQs (5 seconds) before doing the oops ... */ alert_counter[cpu]++; +#ifdef CONFIG_KGDB + if (alert_counter[cpu] == tune_watchdog) { + kgdb_handle_exception(2, SIGPWR, 0, regs); + last_irq_sums[cpu] = sum; + alert_counter[cpu] = 0; + } +#endif if (alert_counter[cpu] == 5*nmi_hz) { spin_lock(&nmi_print_lock); /* --- linux-2.6.4-rc1/arch/i386/kernel/pci-dma.c 2003-06-14 12:18:02.000000000 -0700 +++ 25/arch/i386/kernel/pci-dma.c 2004-02-29 13:07:56.000000000 -0800 @@ -20,8 +20,9 @@ void *dma_alloc_coherent(struct device * /* ignore region specifiers */ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); - if (dev == NULL || (*dev->dma_mask < 0xffffffff)) + if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) gfp |= GFP_DMA; + ret = (void *)__get_free_pages(gfp, get_order(size)); if (ret != NULL) { --- linux-2.6.4-rc1/arch/i386/kernel/process.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/kernel/process.c 2004-02-29 13:09:29.000000000 -0800 @@ -45,6 +45,7 @@ #include #include #include +#include #ifdef CONFIG_MATH_EMULATION #include #endif @@ -302,6 +303,9 @@ void flush_thread(void) struct task_struct *tsk = current; memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); +#ifdef CONFIG_X86_HIGH_ENTRY + clear_thread_flag(TIF_DB7); +#endif memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. @@ -315,9 +319,8 @@ void release_thread(struct task_struct * if (dead_task->mm) { // temporary debugging check if (dead_task->mm->context.size) { - printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", + printk("WARNING: dead process %8s still has LDT? <%d>\n", dead_task->comm, - dead_task->mm->context.ldt, dead_task->mm->context.size); BUG(); } @@ -352,7 +355,17 @@ int copy_thread(int nr, unsigned long cl p->thread.esp = (unsigned long) childregs; p->thread.esp0 = (unsigned long) (childregs+1); + /* + * get the two stack pages, for the virtual stack. 
+ * + * IMPORTANT: this code relies on the fact that the task + * structure is an 8K aligned piece of physical memory. + */ + p->thread.stack_page0 = virt_to_page((unsigned long)p->thread_info); + p->thread.stack_page1 = virt_to_page((unsigned long)p->thread_info + PAGE_SIZE); + p->thread.eip = (unsigned long) ret_from_fork; + p->thread_info->real_stack = p->thread_info; savesegment(fs,p->thread.fs); savesegment(gs,p->thread.gs); @@ -493,7 +506,7 @@ int dump_task_regs(struct task_struct *t * the task-switch, and shows up in ret_from_fork in entry.S, * for example. */ -struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; @@ -504,10 +517,41 @@ struct task_struct * __switch_to(struct __unlazy_fpu(prev_p); +#ifdef CONFIG_X86_HIGH_ENTRY + /* + * Set the ptes of the virtual stack. (NOTE: a one-page TLB flush is + * needed because otherwise NMIs could interrupt the + * user-return code with a virtual stack and stale TLBs.) + */ + __kunmap_atomic_type(KM_VSTACK0); + __kunmap_atomic_type(KM_VSTACK1); + __kmap_atomic(next->stack_page0, KM_VSTACK0); + __kmap_atomic(next->stack_page1, KM_VSTACK1); + + /* + * NOTE: here we rely on the task being the stack as well + */ + next_p->thread_info->virtual_stack = + (void *)__kmap_atomic_vaddr(KM_VSTACK0); + +#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) + /* + * If next was preempted on entry from userspace to kernel, + * and now it's on a different cpu, we need to adjust %esp. + * This assumes that entry.S does not copy %esp while on the + * virtual stack (with interrupts enabled): which is so, + * except within __SWITCH_KERNELSPACE itself. 
+ */ + if (unlikely(next->esp >= TASK_SIZE)) { + next->esp &= THREAD_SIZE - 1; + next->esp |= (unsigned long) next_p->thread_info->virtual_stack; + } +#endif +#endif /* * Reload esp0, LDT and the page table pointer: */ - load_esp0(tss, next); + load_virtual_esp0(tss, next_p); /* * Load the per-thread Thread-Local Storage descriptor. --- linux-2.6.4-rc1/arch/i386/kernel/reboot.c 2004-01-09 00:04:30.000000000 -0800 +++ 25/arch/i386/kernel/reboot.c 2004-02-29 13:09:29.000000000 -0800 @@ -155,12 +155,11 @@ void machine_real_restart(unsigned char CMOS_WRITE(0x00, 0x8f); spin_unlock_irqrestore(&rtc_lock, flags); - /* Remap the kernel at virtual address zero, as well as offset zero - from the kernel segment. This assumes the kernel segment starts at - virtual address PAGE_OFFSET. */ - - memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS); + /* + * Remap the first 16 MB of RAM (which includes the kernel image) + * at virtual address zero: + */ + setup_identity_mappings(swapper_pg_dir, 0, 16*1024*1024); /* * Use `swapper_pg_dir' as our page directory. --- linux-2.6.4-rc1/arch/i386/kernel/setup.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/kernel/setup.c 2004-02-29 13:08:58.000000000 -0800 @@ -50,6 +50,11 @@ #include "setup_arch_pre.h" #include "mach_resources.h" +/* This value is set up by the early boot code to point to the value + immediately after the boot time page tables. It contains a *physical* + address, and must not be in the .bss segment! 
*/ +unsigned long init_pg_tables_end __initdata = ~0UL; + int disable_pse __initdata = 0; static inline char * __init machine_specific_memory_setup(void); @@ -115,7 +120,6 @@ extern void early_cpu_init(void); extern void dmi_scan_machine(void); extern void generic_apic_probe(char *); extern int root_mountflags; -extern char _end[]; unsigned long saved_videomode; @@ -569,6 +573,11 @@ static void __init parse_cmdline_early ( acpi_disabled = 0; } + /* acpi=strict disables out-of-spec workarounds */ + else if (!memcmp(from, "acpi=strict", 11)) { + acpi_strict = 1; + } + /* Limit ACPI just to boot-time to enable HT */ else if (!memcmp(from, "acpi=ht", 7)) { acpi_ht = 1; @@ -785,7 +794,7 @@ static unsigned long __init setup_memory * partially used pages are not usable - thus * we are rounding upwards: */ - start_pfn = PFN_UP(__pa(_end)); + start_pfn = PFN_UP(init_pg_tables_end); find_max_pfn(); @@ -1097,7 +1106,7 @@ void __init setup_arch(char **cmdline_p) init_mm.start_code = (unsigned long) _text; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; - init_mm.brk = (unsigned long) _end; + init_mm.brk = init_pg_tables_end + PAGE_OFFSET; code_resource.start = virt_to_phys(_text); code_resource.end = virt_to_phys(_etext)-1; --- linux-2.6.4-rc1/arch/i386/kernel/signal.c 2003-11-23 19:03:00.000000000 -0800 +++ 25/arch/i386/kernel/signal.c 2004-02-29 13:09:29.000000000 -0800 @@ -128,28 +128,29 @@ sys_sigaltstack(const stack_t __user *us */ static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax) +restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *__sc, int *peax) { - unsigned int err = 0; + struct sigcontext scratch; /* 88 bytes of scratch area */ /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; -#define COPY(x) err |= __get_user(regs->x, &sc->x) + if (copy_from_user(&scratch, __sc, sizeof(scratch))) + 
return -EFAULT; + +#define COPY(x) regs->x = scratch.x #define COPY_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ + { unsigned short tmp = scratch.seg; \ regs->x##seg = tmp; } #define COPY_SEG_STRICT(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ + { unsigned short tmp = scratch.seg; \ regs->x##seg = tmp|3; } #define GET_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ + { unsigned short tmp = scratch.seg; \ loadsegment(seg,tmp); } GET_SEG(gs); @@ -168,27 +169,23 @@ restore_sigcontext(struct pt_regs *regs, COPY_SEG_STRICT(ss); { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); + unsigned int tmpflags = scratch.eflags; regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); regs->orig_eax = -1; /* disable syscall checks */ } { - struct _fpstate __user * buf; - err |= __get_user(buf, &sc->fpstate); + struct _fpstate * buf = scratch.fpstate; if (buf) { if (verify_area(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387(buf); + return -EFAULT; + if (restore_i387(buf)) + return -EFAULT; } } - err |= __get_user(*peax, &sc->eax); - return err; - -badframe: - return 1; + *peax = scratch.eax; + return 0; } asmlinkage int sys_sigreturn(unsigned long __unused) @@ -266,46 +263,47 @@ badframe: */ static int -setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, +setup_sigcontext(struct sigcontext __user *__sc, struct _fpstate __user *fpstate, struct pt_regs *regs, unsigned long mask) { - int tmp, err = 0; + struct sigcontext sc; /* 88 bytes of scratch area */ + int tmp; tmp = 0; __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); - err |= __put_user(tmp, (unsigned int *)&sc->gs); + *(unsigned int *)&sc.gs = tmp; __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); - err |= __put_user(tmp, (unsigned int *)&sc->fs); - - err |= __put_user(regs->xes, (unsigned int *)&sc->es); - err |= __put_user(regs->xds, (unsigned int *)&sc->ds); - err |= 
__put_user(regs->edi, &sc->edi); - err |= __put_user(regs->esi, &sc->esi); - err |= __put_user(regs->ebp, &sc->ebp); - err |= __put_user(regs->esp, &sc->esp); - err |= __put_user(regs->ebx, &sc->ebx); - err |= __put_user(regs->edx, &sc->edx); - err |= __put_user(regs->ecx, &sc->ecx); - err |= __put_user(regs->eax, &sc->eax); - err |= __put_user(current->thread.trap_no, &sc->trapno); - err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user(regs->eip, &sc->eip); - err |= __put_user(regs->xcs, (unsigned int *)&sc->cs); - err |= __put_user(regs->eflags, &sc->eflags); - err |= __put_user(regs->esp, &sc->esp_at_signal); - err |= __put_user(regs->xss, (unsigned int *)&sc->ss); + *(unsigned int *)&sc.fs = tmp; + *(unsigned int *)&sc.es = regs->xes; + *(unsigned int *)&sc.ds = regs->xds; + sc.edi = regs->edi; + sc.esi = regs->esi; + sc.ebp = regs->ebp; + sc.esp = regs->esp; + sc.ebx = regs->ebx; + sc.edx = regs->edx; + sc.ecx = regs->ecx; + sc.eax = regs->eax; + sc.trapno = current->thread.trap_no; + sc.err = current->thread.error_code; + sc.eip = regs->eip; + *(unsigned int *)&sc.cs = regs->xcs; + sc.eflags = regs->eflags; + sc.esp_at_signal = regs->esp; + *(unsigned int *)&sc.ss = regs->xss; tmp = save_i387(fpstate); if (tmp < 0) - err = 1; - else - err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate); + return 1; + sc.fpstate = tmp ? fpstate : NULL; /* non-iBCS2 extensions.. */ - err |= __put_user(mask, &sc->oldmask); - err |= __put_user(current->thread.cr2, &sc->cr2); + sc.oldmask = mask; + sc.cr2 = current->thread.cr2; - return err; + if (copy_to_user(__sc, &sc, sizeof(sc))) + return 1; + return 0; } /* @@ -443,7 +441,7 @@ static void setup_rt_frame(int sig, stru /* Create the ucontext. 
*/ err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + err |= __put_user(current->sas_ss_sp, (unsigned long *)&frame->uc.uc_stack.ss_sp); err |= __put_user(sas_ss_flags(regs->esp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); @@ -551,7 +549,7 @@ handle_signal(unsigned long sig, siginfo * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -int do_signal(struct pt_regs *regs, sigset_t *oldset) +int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset) { siginfo_t info; int signr; --- linux-2.6.4-rc1/arch/i386/kernel/smpboot.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/kernel/smpboot.c 2004-02-29 13:08:17.000000000 -0800 @@ -39,6 +39,7 @@ #include #include +#include #include #include #include @@ -934,7 +935,7 @@ static int boot_cpu_logical_apicid; /* Where the IO area was mapped on multiquad, always 0 otherwise */ void *xquad_portio; -int cpu_sibling_map[NR_CPUS] __cacheline_aligned; +cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; static void __init smp_boot_cpus(unsigned int max_cpus) { @@ -953,6 +954,8 @@ static void __init smp_boot_cpus(unsigne current_thread_info()->cpu = 0; smp_tune_scheduling(); + cpus_clear(cpu_sibling_map[0]); + cpu_set(0, cpu_sibling_map[0]); /* * If we couldn't find an SMP configuration at boot time, @@ -1079,32 +1082,34 @@ static void __init smp_boot_cpus(unsigne Dprintk("Boot done.\n"); /* - * If Hyper-Threading is avaialble, construct cpu_sibling_map[], so - * that we can tell the sibling CPU efficiently. + * construct cpu_sibling_map[], so that we can tell sibling CPUs + * efficiently. 
*/ - if (cpu_has_ht && smp_num_siblings > 1) { - for (cpu = 0; cpu < NR_CPUS; cpu++) - cpu_sibling_map[cpu] = NO_PROC_ID; - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - int i; - if (!cpu_isset(cpu, cpu_callout_map)) - continue; + for (cpu = 0; cpu < NR_CPUS; cpu++) + cpus_clear(cpu_sibling_map[cpu]); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + int siblings = 0; + int i; + if (!cpu_isset(cpu, cpu_callout_map)) + continue; + if (smp_num_siblings > 1) { for (i = 0; i < NR_CPUS; i++) { - if (i == cpu || !cpu_isset(i, cpu_callout_map)) + if (!cpu_isset(i, cpu_callout_map)) continue; if (phys_proc_id[cpu] == phys_proc_id[i]) { - cpu_sibling_map[cpu] = i; - printk("cpu_sibling_map[%d] = %d\n", cpu, cpu_sibling_map[cpu]); - break; + siblings++; + cpu_set(i, cpu_sibling_map[cpu]); } } - if (cpu_sibling_map[cpu] == NO_PROC_ID) { - smp_num_siblings = 1; - printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu); - } + } else { + siblings++; + cpu_set(cpu, cpu_sibling_map[cpu]); } + + if (siblings != smp_num_siblings) + printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings); } smpboot_setup_io_apic(); @@ -1118,6 +1123,216 @@ static void __init smp_boot_cpus(unsigne synchronize_tsc_bp(); } +#ifdef CONFIG_SCHED_SMT +#ifdef CONFIG_NUMA +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static DEFINE_PER_CPU(struct sched_domain, node_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + struct sched_domain *node_domain = &per_cpu(node_domains, i); + int node = cpu_to_node(i); + cpumask_t nodemask = 
node_to_cpumask(node); + + *cpu_domain = SD_SIBLING_INIT; + cpu_domain->span = cpu_sibling_map[i]; + + *phys_domain = SD_CPU_INIT; + phys_domain->span = nodemask; + + *node_domain = SD_NODE_INIT; + node_domain->span = cpu_possible_map; + } + + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + int j; + first_cpu = last_cpu = NULL; + + if (i != first_cpu(cpu_domain->span)) + continue; + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpu->cpumask = CPU_MASK_NONE; + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + for (i = 0; i < MAX_NUMNODES; i++) { + int j; + cpumask_t nodemask; + struct sched_group *node = &sched_group_nodes[i]; + cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + first_cpu = last_cpu = NULL; + /* Set up physical groups */ + for_each_cpu_mask(j, nodemask) { + struct sched_domain *cpu_domain = cpu_sched_domain(j); + struct sched_group *cpu = &sched_group_phys[j]; + + if (j != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* + * Make each extra sibling increase power by 10% of + * the basic CPU. This is very arbitrary. 
+ */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + node->cpu_power += cpu->cpu_power; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + /* Set up nodes */ + first_cpu = last_cpu = NULL; + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *cpu = &sched_group_nodes[i]; + cpumask_t nodemask; + cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + cpu->cpumask = nodemask; + /* ->cpu_power already setup */ + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + int node = cpu_to_node(i); + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + struct sched_domain *node_domain = &per_cpu(node_domains, i); + struct sched_group *cpu_group = &sched_group_cpus[i]; + struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)]; + struct sched_group *node_group = &sched_group_nodes[node]; + + cpu_domain->parent = phys_domain; + phys_domain->parent = node_domain; + + node_domain->groups = node_group; + phys_domain->groups = phys_group; + cpu_domain->groups = cpu_group; + } +} +#else /* CONFIG_NUMA */ +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + + *cpu_domain = SD_SIBLING_INIT; + cpu_domain->span = cpu_sibling_map[i]; + + *phys_domain = SD_CPU_INIT; + phys_domain->span = cpu_possible_map; + } + + /* Set 
up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + int j; + first_cpu = last_cpu = NULL; + + if (i != first_cpu(cpu_domain->span)) + continue; + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + first_cpu = last_cpu = NULL; + /* Set up physical groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_group *cpu = &sched_group_phys[i]; + + if (i != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* See SMT+NUMA setup for comment */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + struct sched_group *cpu_group = &sched_group_cpus[i]; + struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)]; + cpu_domain->parent = phys_domain; + phys_domain->groups = phys_group; + cpu_domain->groups = cpu_group; + } +} +#endif /* CONFIG_NUMA */ +#endif /* CONFIG_SCHED_SMT */ + /* These are wrappers to interface to the new boot process. Someone who understands all this stuff should rewrite it properly. 
--RR 15/Jul/02 */ void __init smp_prepare_cpus(unsigned int max_cpus) --- linux-2.6.4-rc1/arch/i386/kernel/smp.c 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/kernel/smp.c 2004-02-29 13:09:29.000000000 -0800 @@ -327,10 +327,12 @@ asmlinkage void smp_invalidate_interrupt if (flush_mm == cpu_tlbstate[cpu].active_mm) { if (cpu_tlbstate[cpu].state == TLBSTATE_OK) { +#ifndef CONFIG_X86_SWITCH_PAGETABLES if (flush_va == FLUSH_ALL) local_flush_tlb(); else __flush_tlb_one(flush_va); +#endif } else leave_mm(cpu); } @@ -396,21 +398,6 @@ static void flush_tlb_others(cpumask_t c spin_unlock(&tlbstate_lock); } -void flush_tlb_current_task(void) -{ - struct mm_struct *mm = current->mm; - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - local_flush_tlb(); - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); - preempt_enable(); -} - void flush_tlb_mm (struct mm_struct * mm) { cpumask_t cpu_mask; @@ -442,7 +429,10 @@ void flush_tlb_page(struct vm_area_struc if (current->active_mm == mm) { if(current->mm) - __flush_tlb_one(va); +#ifndef CONFIG_X86_SWITCH_PAGETABLES + __flush_tlb_one(va) +#endif + ; else leave_mm(smp_processor_id()); } @@ -466,7 +456,17 @@ void flush_tlb_all(void) { on_each_cpu(do_flush_tlb_all, 0, 1, 1); } - +#ifdef CONFIG_KGDB +/* + * By using the NMI code instead of a vector we just sneak thru the + * word generator coming out with just what we want. AND it does + * not matter if clustered_apic_mode is set or not. + */ +void smp_send_nmi_allbutself(void) +{ + send_IPI_allbutself(APIC_DM_NMI); +} +#endif /* * this function sends a 'reschedule' IPI to another CPU. 
* it goes straight through and wastes no time serializing --- linux-2.6.4-rc1/arch/i386/kernel/sysenter.c 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/kernel/sysenter.c 2004-02-29 13:09:29.000000000 -0800 @@ -18,13 +18,18 @@ #include #include #include +#include extern asmlinkage void sysenter_entry(void); void enable_sep_cpu(void *info) { int cpu = get_cpu(); +#ifdef CONFIG_X86_HIGH_ENTRY + struct tss_struct *tss = (struct tss_struct *) __fix_to_virt(FIX_TSS_0) + cpu; +#else struct tss_struct *tss = init_tss + cpu; +#endif tss->ss1 = __KERNEL_CS; tss->esp1 = sizeof(struct tss_struct) + (unsigned long) tss; --- linux-2.6.4-rc1/arch/i386/kernel/time.c 2004-02-03 20:42:34.000000000 -0800 +++ 25/arch/i386/kernel/time.c 2004-02-29 13:07:44.000000000 -0800 @@ -346,7 +346,7 @@ static int time_init_device(void) { int error = sysdev_class_register(&pit_sysclass); if (!error) - error = sys_device_register(&device_i8253); + error = sysdev_register(&device_i8253); return error; } --- linux-2.6.4-rc1/arch/i386/kernel/trampoline.S 2003-06-14 12:18:07.000000000 -0700 +++ 25/arch/i386/kernel/trampoline.S 2004-02-29 13:08:58.000000000 -0800 @@ -23,9 +23,13 @@ * and IP is zero. Thus, data addresses need to be absolute * (no relocation) and are taken with regard to r_base. * - * If you work on this file, check the object module with objdump - * --full-contents --reloc to make sure there are no relocation - * entries except for the gdt one.. + * If you work on this file, check the object module with + * objdump --reloc to make sure there are no relocation + * entries except for: + * + * TYPE VALUE + * R_386_32 startup_32_smp + * R_386_32 boot_gdt_table */ #include @@ -42,7 +46,6 @@ r_base = . mov %cs, %ax # Code and data in the same place mov %ax, %ds - mov $1, %bx # Flag an SMP trampoline cli # We should be safe anyway movl $0xA5A5A5A5, trampoline_data - r_base @@ -54,22 +57,18 @@ r_base = . 
xor %ax, %ax inc %ax # protected mode (PE) bit lmsw %ax # into protected mode - jmp flush_instr -flush_instr: - ljmpl $__BOOT_CS, $0x00100000 - # jump to startup_32 in arch/i386/kernel/head.S - -boot_idt: - .word 0 # idt limit = 0 - .word 0, 0 # idt base = 0L + # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S + ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET) -# -# NOTE: here we actually use CPU#0's GDT - but that is OK, we reload -# the proper GDT shortly after booting up the secondary CPUs. -# -ENTRY(boot_gdt) + # These need to be in the same 64K segment as the above; + # hence we don't use the boot_gdt_descr defined in head.S +boot_gdt: .word __BOOT_DS + 7 # gdt limit - .long boot_gdt_table-__PAGE_OFFSET # gdt base = gdt (first SMP CPU) + .long boot_gdt_table-__PAGE_OFFSET # gdt base + +boot_idt: + .word 0 # idt limit = 0 + .long 0 # idt base = 0L .globl trampoline_end trampoline_end: --- linux-2.6.4-rc1/arch/i386/kernel/traps.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/kernel/traps.c 2004-02-29 13:09:29.000000000 -0800 @@ -54,12 +54,8 @@ #include "mach_traps.h" -asmlinkage int system_call(void); -asmlinkage void lcall7(void); -asmlinkage void lcall27(void); - -struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 } }; +struct desc_struct default_ldt[] __attribute__((__section__(".data.default_ldt"))) = { { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } }; +struct page *default_ldt_page; /* Do we ignore FPU interrupts ? */ char ignore_fpu_irq = 0; @@ -91,6 +87,41 @@ asmlinkage void alignment_check(void); asmlinkage void spurious_interrupt_bug(void); asmlinkage void machine_check(void); +#ifdef CONFIG_KGDB +extern void sysenter_entry(void); +#include +#include +void set_intr_gate(unsigned int n, void *addr); +static void set_intr_usr_gate(unsigned int n, void *addr); +/* + * Should be able to call this breakpoint() very early in + * bring up. Just hard code the call where needed. 
+ * The breakpoint() code is here because set_?_gate() functions + * are local (static) to trap.c. They need be done only once, + * but it does not hurt to do them over. + */ +void breakpoint(void) +{ + init_entry_mappings(); + set_intr_usr_gate(3,&int3); /* disable ints on trap */ + set_intr_gate(1,&debug); + set_intr_gate(14,&page_fault); + + BREAKPOINT; +} +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ + { \ + if (!user_mode(regs) ) \ + { \ + kgdb_handle_exception(trapnr, signr, error_code, regs); \ + after; \ + } else if ((trapnr == 3) && (regs->eflags &0x200)) local_irq_enable(); \ + } +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + + static int kstack_depth_to_print = 24; void show_trace(struct task_struct *task, unsigned long * stack) @@ -175,8 +206,9 @@ void show_registers(struct pt_regs *regs ss = regs->xss & 0xffff; } print_modules(); - printk("CPU: %d\nEIP: %04x:[<%08lx>] %s\nEFLAGS: %08lx\n", - smp_processor_id(), 0xffff & regs->xcs, regs->eip, print_tainted(), regs->eflags); + printk("CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\nEFLAGS: %08lx\n", + smp_processor_id(), 0xffff & regs->xcs, + regs->eip, print_tainted(), regs->eflags); print_symbol("EIP is at %s\n", regs->eip); printk("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", @@ -192,23 +224,27 @@ void show_registers(struct pt_regs *regs * time of the fault.. 
*/ if (in_kernel) { + u8 *eip; printk("\nStack: "); show_stack(NULL, (unsigned long*)esp); printk("Code: "); - if(regs->eip < PAGE_OFFSET) - goto bad; - for(i=0;i<20;i++) - { - unsigned char c; - if(__get_user(c, &((unsigned char*)regs->eip)[i])) { -bad: + eip = (u8 *)regs->eip - 43; + for (i = 0; i < 64; i++, eip++) { + unsigned char c = 0xff; + + if ((user_mode(regs) && get_user(c, eip)) || + (!user_mode(regs) && __direct_get_user(c, eip))) { + printk(" Bad EIP value."); break; } - printk("%02x ", c); + if (eip == (u8 *)regs->eip) + printk("<%02x> ", c); + else + printk("%02x ", c); } } printk("\n"); @@ -276,6 +312,15 @@ void die(const char * str, struct pt_reg #endif if (nl) printk("\n"); +#ifdef CONFIG_KGDB + /* This is about the only place we want to go to kgdb even if in + * user mode. But we must go in via a trap so within kgdb we will + * always be in kernel mode. + */ + if (user_mode(regs)) + BREAKPOINT; +#endif + CHK_REMOTE_DEBUG(0,SIGTRAP,err,regs,) show_registers(regs); bust_spinlocks(0); spin_unlock_irq(&die_lock); @@ -345,6 +390,7 @@ static inline void do_trap(int trapnr, i #define DO_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,)\ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -362,7 +408,9 @@ asmlinkage void do_##name(struct pt_regs #define DO_VM86_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr, signr, error_code,regs, return)\ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ + return; \ } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ @@ -409,8 +457,10 @@ gp_in_vm86: return; gp_in_kernel: - if (!fixup_exception(regs)) + if (!fixup_exception(regs)){ + CHK_REMOTE_DEBUG(13,SIGSEGV,error_code,regs,) die("general protection fault", regs, error_code); + } } static void mem_parity_error(unsigned char reason, struct 
pt_regs * regs) @@ -549,10 +599,18 @@ asmlinkage void do_debug(struct pt_regs if (regs->eflags & X86_EFLAGS_IF) local_irq_enable(); - /* Mask out spurious debug traps due to lazy DR7 setting */ + /* + * Mask out spurious debug traps due to lazy DR7 setting or + * due to 4G/4G kernel mode: + */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { if (!tsk->thread.debugreg[7]) goto clear_dr7; + if (!user_mode(regs)) { + // restore upon return-to-userspace: + set_thread_flag(TIF_DB7); + goto clear_dr7; + } } if (regs->eflags & VM_MASK) @@ -572,8 +630,18 @@ asmlinkage void do_debug(struct pt_regs * allowing programs to debug themselves without the ptrace() * interface. */ +#ifdef CONFIG_KGDB + /* + * I think this is the only "real" case of a TF in the kernel + * that really belongs to user space. Others are + * "Ours all ours!" + */ + if (((regs->xcs & 3) == 0) && ((void *)regs->eip == sysenter_entry)) + goto clear_TF_reenable; +#else if ((regs->xcs & 3) == 0) goto clear_TF_reenable; +#endif if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) goto clear_TF; } @@ -585,6 +653,17 @@ asmlinkage void do_debug(struct pt_regs info.si_errno = 0; info.si_code = TRAP_BRKPT; +#ifdef CONFIG_KGDB + /* + * If this is a kernel mode trap, we need to reset db7 to allow us + * to continue sanely ALSO skip the signal delivery + */ + if ((regs->xcs & 3) == 0) + goto clear_dr7; + + /* if not kernel, allow ints but only if they were on */ + if ( regs->eflags & 0x200) local_irq_enable(); +#endif /* If this is a kernel mode trap, save the user PC on entry to * the kernel, that's what the debugger can make sense of. 
*/ @@ -599,6 +678,7 @@ clear_dr7: __asm__("movl %0,%%db7" : /* no output */ : "r" (0)); + CHK_REMOTE_DEBUG(1,SIGTRAP,error_code,regs,) return; debug_vm86: @@ -794,19 +874,53 @@ asmlinkage void math_emulate(long arg) #endif /* CONFIG_MATH_EMULATION */ -#ifdef CONFIG_X86_F00F_BUG -void __init trap_init_f00f_bug(void) +void __init trap_init_virtual_IDT(void) { - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); - /* - * Update the IDT descriptor and reload the IDT so that - * it uses the read-only mapped virtual address. + * "idt" is magic - it overlaps the idt_descr + * variable so that updating idt will automatically + * update the idt descriptor.. */ - idt_descr.address = fix_to_virt(FIX_F00F_IDT); + __set_fixmap(FIX_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + idt_descr.address = __fix_to_virt(FIX_IDT); + __asm__ __volatile__("lidt %0" : : "m" (idt_descr)); } -#endif + +void __init trap_init_virtual_GDT(void) +{ + int cpu = smp_processor_id(); + struct Xgt_desc_struct *gdt_desc = cpu_gdt_descr + cpu; + struct Xgt_desc_struct tmp_desc = {0, 0}; + struct tss_struct * t; + + __asm__ __volatile__("sgdt %0": "=m" (tmp_desc): :"memory"); + +#ifdef CONFIG_X86_HIGH_ENTRY + if (!cpu) { + __set_fixmap(FIX_GDT_0, __pa(cpu_gdt_table), PAGE_KERNEL); + __set_fixmap(FIX_GDT_1, __pa(cpu_gdt_table) + PAGE_SIZE, PAGE_KERNEL); + __set_fixmap(FIX_TSS_0, __pa(init_tss), PAGE_KERNEL); + __set_fixmap(FIX_TSS_1, __pa(init_tss) + 1*PAGE_SIZE, PAGE_KERNEL); + __set_fixmap(FIX_TSS_2, __pa(init_tss) + 2*PAGE_SIZE, PAGE_KERNEL); + __set_fixmap(FIX_TSS_3, __pa(init_tss) + 3*PAGE_SIZE, PAGE_KERNEL); + } + + gdt_desc->address = __fix_to_virt(FIX_GDT_0) + sizeof(cpu_gdt_table[0]) * cpu; +#else + gdt_desc->address = (unsigned long)cpu_gdt_table[cpu]; +#endif + __asm__ __volatile__("lgdt %0": "=m" (*gdt_desc)); + +#ifdef CONFIG_X86_HIGH_ENTRY + t = (struct tss_struct *) __fix_to_virt(FIX_TSS_0) + cpu; +#else + t = init_tss + cpu; +#endif + set_tss_desc(cpu, t); + 
cpu_gdt_table[cpu][GDT_ENTRY_TSS].b &= 0xfffffdff; + load_TR_desc(); +} #define _set_gate(gate_addr,type,dpl,addr,seg) \ do { \ @@ -833,20 +947,26 @@ void set_intr_gate(unsigned int n, void _set_gate(idt_table+n,14,0,addr,__KERNEL_CS); } -static void __init set_trap_gate(unsigned int n, void *addr) +void __init set_trap_gate(unsigned int n, void *addr) { _set_gate(idt_table+n,15,0,addr,__KERNEL_CS); } -static void __init set_system_gate(unsigned int n, void *addr) +void __init set_system_gate(unsigned int n, void *addr) { _set_gate(idt_table+n,15,3,addr,__KERNEL_CS); } -static void __init set_call_gate(void *a, void *addr) +void __init set_call_gate(void *a, void *addr) { _set_gate(a,12,3,addr,__KERNEL_CS); } +#ifdef CONFIG_KGDB +void set_intr_usr_gate(unsigned int n, void *addr) +{ + _set_gate(idt_table+n,14,3,addr,__KERNEL_CS); +} +#endif static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) { @@ -865,11 +985,16 @@ void __init trap_init(void) #ifdef CONFIG_X86_LOCAL_APIC init_apic_mappings(); #endif + init_entry_mappings(); set_trap_gate(0,&divide_error); set_intr_gate(1,&debug); set_intr_gate(2,&nmi); +#ifndef CONFIG_KGDB set_system_gate(3,&int3); /* int3-5 can be called from all */ +#else + set_intr_usr_gate(3,&int3); /* int3-5 can be called from all */ +#endif set_system_gate(4,&overflow); set_system_gate(5,&bounds); set_trap_gate(6,&invalid_op); --- linux-2.6.4-rc1/arch/i386/kernel/vm86.c 2004-01-09 00:04:30.000000000 -0800 +++ 25/arch/i386/kernel/vm86.c 2004-02-29 13:09:29.000000000 -0800 @@ -95,7 +95,7 @@ #define VM86_REGS_SIZE2 (sizeof(struct kernel_vm86_regs) - VM86_REGS_SIZE1) struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); -struct pt_regs * save_v86_state(struct kernel_vm86_regs * regs) +struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) { struct tss_struct *tss; struct pt_regs *ret; @@ -125,7 +125,7 @@ struct pt_regs * save_v86_state(struct k tss = init_tss + get_cpu(); current->thread.esp0
= current->thread.saved_esp0; current->thread.sysenter_cs = __KERNEL_CS; - load_esp0(tss, &current->thread); + load_virtual_esp0(tss, current); current->thread.saved_esp0 = 0; put_cpu(); @@ -305,7 +305,7 @@ static void do_sys_vm86(struct kernel_vm tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; - load_esp0(tss, &tsk->thread); + load_virtual_esp0(tss, tsk); put_cpu(); tsk->thread.screen_bitmap = info->screen_bitmap; --- linux-2.6.4-rc1/arch/i386/kernel/vmlinux.lds.S 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/kernel/vmlinux.lds.S 2004-02-29 13:09:29.000000000 -0800 @@ -3,6 +3,9 @@ */ #include +#include +#include +#include OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) @@ -10,7 +13,7 @@ ENTRY(startup_32) jiffies = jiffies_64; SECTIONS { - . = 0xC0000000 + 0x100000; + . = __PAGE_OFFSET + 0x100000; /* read-only */ _text = .; /* Text and read-only data */ .text : { @@ -19,6 +22,19 @@ SECTIONS *(.gnu.warning) } = 0x9090 +#ifdef CONFIG_X86_4G + . = ALIGN(PAGE_SIZE_asm); + __entry_tramp_start = .; + . = FIX_ENTRY_TRAMPOLINE_0_addr; + __start___entry_text = .; + .entry.text : AT (__entry_tramp_start) { *(.entry.text) } + __entry_tramp_end = __entry_tramp_start + SIZEOF(.entry.text); + . = __entry_tramp_end; + . = ALIGN(PAGE_SIZE_asm); +#else + .entry.text : { *(.entry.text) } +#endif + _etext = .; /* End of text section */ . = ALIGN(16); /* Exception table */ @@ -34,15 +50,12 @@ SECTIONS CONSTRUCTORS } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE_asm); __nosave_begin = .; .data_nosave : { *(.data.nosave) } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE_asm); __nosave_end = .; - . = ALIGN(4096); - .data.page_aligned : { *(.data.idt) } - . = ALIGN(32); .data.cacheline_aligned : { *(.data.cacheline_aligned) } @@ -52,7 +65,7 @@ SECTIONS .data.init_task : { *(.data.init_task) } /* will be freed after init */ - . = ALIGN(4096); /* Init code and data */ + .
= ALIGN(PAGE_SIZE_asm); /* Init code and data */ __init_begin = .; .init.text : { _sinittext = .; @@ -91,7 +104,7 @@ SECTIONS from .altinstructions and .eh_frame */ .exit.text : { *(.exit.text) } .exit.data : { *(.exit.data) } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE_asm); __initramfs_start = .; .init.ramfs : { *(.init.ramfs) } __initramfs_end = .; @@ -99,16 +112,33 @@ SECTIONS __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE_asm); __init_end = .; /* freed after init ends here */ - + + . = ALIGN(PAGE_SIZE_asm); + .data.page_aligned_tss : { *(.data.tss) } + + . = ALIGN(PAGE_SIZE_asm); + .data.page_aligned_default_ldt : { *(.data.default_ldt) } + + . = ALIGN(PAGE_SIZE_asm); + .data.page_aligned_idt : { *(.data.idt) } + + . = ALIGN(PAGE_SIZE_asm); + .data.page_aligned_gdt : { *(.data.gdt) } + __bss_start = .; /* BSS */ .bss : { *(.bss) } + . = ALIGN(4); __bss_stop = .; _end = . ; + /* This is where the kernel creates the early boot page tables */ + . = ALIGN(4096); + pg0 = .; + /* Sections to be discarded */ /DISCARD/ : { *(.exitcall.exit) @@ -122,4 +152,6 @@ SECTIONS .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } + + } --- linux-2.6.4-rc1/arch/i386/kernel/vsyscall.lds 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/kernel/vsyscall.lds 2004-02-29 13:09:29.000000000 -0800 @@ -5,7 +5,7 @@ */ /* This must match . 
*/ -VSYSCALL_BASE = 0xffffe000; +VSYSCALL_BASE = 0xffffd000; SECTIONS { --- linux-2.6.4-rc1/arch/i386/kernel/vsyscall-sysenter.S 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/kernel/vsyscall-sysenter.S 2004-02-29 13:09:29.000000000 -0800 @@ -7,6 +7,11 @@ .type __kernel_vsyscall,@function __kernel_vsyscall: .LSTART_vsyscall: + cmpl $192, %eax + jne 1f + int $0x80 + ret +1: push %ecx .Lpush_ecx: push %edx --- linux-2.6.4-rc1/arch/i386/lib/checksum.S 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/lib/checksum.S 2004-02-29 13:09:29.000000000 -0800 @@ -280,14 +280,14 @@ unsigned int csum_partial_copy_generic ( .previous .align 4 -.globl csum_partial_copy_generic +.globl direct_csum_partial_copy_generic #ifndef CONFIG_X86_USE_PPRO_CHECKSUM #define ARGBASE 16 #define FP 12 -csum_partial_copy_generic: +direct_csum_partial_copy_generic: subl $4,%esp pushl %edi pushl %esi @@ -422,7 +422,7 @@ DST( movb %cl, (%edi) ) #define ARGBASE 12 -csum_partial_copy_generic: +direct_csum_partial_copy_generic: pushl %ebx pushl %edi pushl %esi --- linux-2.6.4-rc1/arch/i386/lib/dec_and_lock.c 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/lib/dec_and_lock.c 2004-02-29 13:09:27.000000000 -0800 @@ -10,6 +10,7 @@ #include #include +#ifndef ATOMIC_DEC_AND_LOCK int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { int counter; @@ -38,3 +39,5 @@ slow_path: spin_unlock(lock); return 0; } +#endif + --- linux-2.6.4-rc1/arch/i386/lib/getuser.S 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/lib/getuser.S 2004-02-29 13:09:29.000000000 -0800 @@ -9,6 +9,7 @@ * return value. 
*/ #include +#include /* @@ -28,7 +29,7 @@ .globl __get_user_1 __get_user_1: GET_THREAD_INFO(%edx) - cmpl TI_ADDR_LIMIT(%edx),%eax + cmpl TI_addr_limit(%edx),%eax jae bad_get_user 1: movzbl (%eax),%edx xorl %eax,%eax @@ -40,7 +41,7 @@ __get_user_2: addl $1,%eax jc bad_get_user GET_THREAD_INFO(%edx) - cmpl TI_ADDR_LIMIT(%edx),%eax + cmpl TI_addr_limit(%edx),%eax jae bad_get_user 2: movzwl -1(%eax),%edx xorl %eax,%eax @@ -52,7 +53,7 @@ __get_user_4: addl $3,%eax jc bad_get_user GET_THREAD_INFO(%edx) - cmpl TI_ADDR_LIMIT(%edx),%eax + cmpl TI_addr_limit(%edx),%eax jae bad_get_user 3: movl -3(%eax),%edx xorl %eax,%eax --- linux-2.6.4-rc1/arch/i386/lib/iodebug.c 2003-06-14 12:18:33.000000000 -0700 +++ /dev/null 2002-08-30 16:31:37.000000000 -0700 @@ -1,11 +0,0 @@ -#include - -void * __io_virt_debug(unsigned long x, const char *file, int line) -{ - if (x < PAGE_OFFSET) { - printk("io mapaddr 0x%05lx not valid at %s:%d!\n", x, file, line); - return __va(x); - } - return (void *)x; -} - --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/arch/i386/lib/kgdb_serial.c 2004-02-29 13:07:59.000000000 -0800 @@ -0,0 +1,499 @@ +/* + * Serial interface GDB stub + * + * Written (hacked together) by David Grothe (dave@gcom.com) + * Modified to allow invokation early in boot see also + * kgdb.h for instructions by George Anzinger(george@mvista.com) + * Modified to handle debugging over ethernet by Robert Walsh + * and wangdi , based on + * code by San Mehat. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_KGDB_USER_CONSOLE +extern void kgdb_console_finit(void); +#endif +#define PRNT_off +#define TEST_EXISTANCE +#ifdef PRNT +#define dbprintk(s) printk s +#else +#define dbprintk(s) +#endif +#define TEST_INTERRUPT_off +#ifdef TEST_INTERRUPT +#define intprintk(s) printk s +#else +#define intprintk(s) +#endif + +#define IRQ_T(info) ((info->flags & ASYNC_SHARE_IRQ) ? SA_SHIRQ : SA_INTERRUPT) + +#define GDB_BUF_SIZE 512 /* power of 2, please */ + +static char gdb_buf[GDB_BUF_SIZE]; +static int gdb_buf_in_inx; +static atomic_t gdb_buf_in_cnt; +static int gdb_buf_out_inx; + +struct async_struct *gdb_async_info; +static int gdb_async_irq; + +#define outb_px(a,b) outb_p(b,a) + +static void program_uart(struct async_struct *info); +static void write_char(struct async_struct *info, int chr); +/* + * Get a byte from the hardware data buffer and return it + */ +static int +read_data_bfr(struct async_struct *info) +{ + char it = inb_p(info->port + UART_LSR); + + if (it & UART_LSR_DR) + return (inb_p(info->port + UART_RX)); + /* + * If we have a framing error assume somebody messed with + * our uart. Reprogram it and send '-' both ways... + */ + if (it & 0xc) { + program_uart(info); + write_char(info, '-'); + return ('-'); + } + return (-1); + +} /* read_data_bfr */ + +/* + * Get a char if available, return -1 if nothing available. + * Empty the receive buffer first, then look at the interface hardware. + + * Locking here is a bit of a problem. We MUST not lock out communication + * if we are trying to talk to gdb about a kgdb entry. ON the other hand + * we can loose chars in the console pass thru if we don't lock. 
It is also + * possible that we could hold the lock or be waiting for it when kgdb + * NEEDS to talk. Since kgdb locks down the world, it does not need locks. + * We do, of course have possible issues with interrupting a uart operation, + * but we will just depend on the uart status to help keep that straight. + + */ +static spinlock_t uart_interrupt_lock = SPIN_LOCK_UNLOCKED; +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +#endif + +static int +read_char(struct async_struct *info) +{ + int chr; + unsigned long flags; + local_irq_save(flags); +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_lock(&uart_interrupt_lock); + } +#endif + if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ + chr = gdb_buf[gdb_buf_out_inx++]; + gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); + atomic_dec(&gdb_buf_in_cnt); + } else { + chr = read_data_bfr(info); + } +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_unlock(&uart_interrupt_lock); + } +#endif + local_irq_restore(flags); + return (chr); +} + +/* + * Wait until the interface can accept a char, then write it. + */ +static void +write_char(struct async_struct *info, int chr) +{ + while (!(inb_p(info->port + UART_LSR) & UART_LSR_THRE)) ; + + outb_p(chr, info->port + UART_TX); + +} /* write_char */ + +/* + * Mostly we don't need a spinlock, but since the console goes + * thru here with interrutps on, well, we need to catch those + * chars. + */ +/* + * This is the receiver interrupt routine for the GDB stub. + * It will receive a limited number of characters of input + * from the gdb host machine and save them up in a buffer. + * + * When the gdb stub routine tty_getDebugChar() is called it + * draws characters out of the buffer until it is empty and + * then reads directly from the serial port. 
+ * + * We do not attempt to write chars from the interrupt routine + * since the stubs do all of that via tty_putDebugChar() which + * writes one byte after waiting for the interface to become + * ready. + * + * The debug stubs like to run with interrupts disabled since, + * after all, they run as a consequence of a breakpoint in + * the kernel. + * + * Perhaps someone who knows more about the tty driver than I + * care to learn can make this work for any low level serial + * driver. + */ +static irqreturn_t +gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct async_struct *info; + unsigned long flags; + + info = gdb_async_info; + if (!info || !info->tty || irq != gdb_async_irq) + return IRQ_NONE; + + local_irq_save(flags); + spin_lock(&uart_interrupt_lock); + do { + int chr = read_data_bfr(info); + intprintk(("Debug char on int: %x hex\n", chr)); + if (chr < 0) + continue; + + if (chr == 3) { /* Ctrl-C means remote interrupt */ + BREAKPOINT; + continue; + } + + if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { + /* buffer overflow tosses early char */ + read_char(info); + } + gdb_buf[gdb_buf_in_inx++] = chr; + gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); + } while (inb_p(info->port + UART_IIR) & UART_IIR_RDI); + spin_unlock(&uart_interrupt_lock); + local_irq_restore(flags); + return IRQ_HANDLED; +} /* gdb_interrupt */ + +/* + * Just a NULL routine for testing. + */ +void +gdb_null(void) +{ +} /* gdb_null */ + +/* These structure are filled in with values defined in asm/kgdb_local.h + */ +static struct serial_state state = SB_STATE; +static struct async_struct local_info = SB_INFO; +static int ok_to_enable_ints = 0; +static void kgdb_enable_ints_now(void); + +extern char *kgdb_version; +/* + * Hook an IRQ for KGDB. + * + * This routine is called from tty_putDebugChar, below. 
+ */ +static int ints_disabled = 1; +int +gdb_hook_interrupt(struct async_struct *info, int verb) +{ + struct serial_state *state = info->state; + unsigned long flags; + int port; +#ifdef TEST_EXISTANCE + int scratch, scratch2; +#endif + + /* The above fails if memory managment is not set up yet. + * Rather than fail the set up, just keep track of the fact + * and pick up the interrupt thing later. + */ + gdb_async_info = info; + port = gdb_async_info->port; + gdb_async_irq = state->irq; + if (verb) { + printk("kgdb %s : port =%x, IRQ=%d, divisor =%d\n", + kgdb_version, + port, + gdb_async_irq, gdb_async_info->state->custom_divisor); + } + local_irq_save(flags); +#ifdef TEST_EXISTANCE + /* Existance test */ + /* Should not need all this, but just in case.... */ + + scratch = inb_p(port + UART_IER); + outb_px(port + UART_IER, 0); + outb_px(0xff, 0x080); + scratch2 = inb_p(port + UART_IER); + outb_px(port + UART_IER, scratch); + if (scratch2) { + printk + ("gdb_hook_interrupt: Could not clear IER, not a UART!\n"); + local_irq_restore(flags); + return 1; /* We failed; there's nothing here */ + } + scratch2 = inb_p(port + UART_LCR); + outb_px(port + UART_LCR, 0xBF); /* set up for StarTech test */ + outb_px(port + UART_EFR, 0); /* EFR is the same as FCR */ + outb_px(port + UART_LCR, 0); + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO); + scratch = inb_p(port + UART_IIR) >> 6; + if (scratch == 1) { + printk("gdb_hook_interrupt: Undefined UART type!" + " Not a UART! \n"); + local_irq_restore(flags); + return 1; + } else { + dbprintk(("gdb_hook_interrupt: UART type " + "is %d where 0=16450, 2=16550 3=16550A\n", scratch)); + } + scratch = inb_p(port + UART_MCR); + outb_px(port + UART_MCR, UART_MCR_LOOP | scratch); + outb_px(port + UART_MCR, UART_MCR_LOOP | 0x0A); + scratch2 = inb_p(port + UART_MSR) & 0xF0; + outb_px(port + UART_MCR, scratch); + if (scratch2 != 0x90) { + printk("gdb_hook_interrupt: " + "Loop back test failed! 
Not a UART!\n"); + local_irq_restore(flags); + return scratch2 + 1000; /* force 0 to fail */ + } +#endif /* test existance */ + program_uart(info); + local_irq_restore(flags); + + return (0); + +} /* gdb_hook_interrupt */ + +static void +program_uart(struct async_struct *info) +{ + int port = info->port; + + (void) inb_p(port + UART_RX); + outb_px(port + UART_IER, 0); + + (void) inb_p(port + UART_RX); /* serial driver comments say */ + (void) inb_p(port + UART_IIR); /* this clears the interrupt regs */ + (void) inb_p(port + UART_MSR); + outb_px(port + UART_LCR, UART_LCR_WLEN8 | UART_LCR_DLAB); + outb_px(port + UART_DLL, info->state->custom_divisor & 0xff); /* LS */ + outb_px(port + UART_DLM, info->state->custom_divisor >> 8); /* MS */ + outb_px(port + UART_MCR, info->MCR); + + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1 | UART_FCR_CLEAR_XMIT | UART_FCR_CLEAR_RCVR); /* set fcr */ + outb_px(port + UART_LCR, UART_LCR_WLEN8); /* reset DLAB */ + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1); /* set fcr */ + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + return; +} + +/* + * tty_getDebugChar + * + * This is a GDB stub routine. It waits for a character from the + * serial interface and then returns it. If there is no serial + * interface connection then it returns a bogus value which will + * almost certainly cause the system to hang. 
In the + */ +int kgdb_in_isr = 0; +int kgdb_in_lsr = 0; +extern spinlock_t kgdb_spinlock; + +/* Caller takes needed protections */ + +int +tty_getDebugChar(void) +{ + volatile int chr, dum, time, end_time; + + dbprintk(("tty_getDebugChar(port %x): ", gdb_async_info->port)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + /* + * This trick says if we wait a very long time and get + * no char, return the -1 and let the upper level deal + * with it. + */ + rdtsc(dum, time); + end_time = time + 2; + while (((chr = read_char(gdb_async_info)) == -1) && + (end_time - time) > 0) { + rdtsc(dum, time); + }; + /* + * This covers our butts if some other code messes with + * our uart, hay, it happens :o) + */ + if (chr == -1) + program_uart(gdb_async_info); + + dbprintk(("%c\n", chr > ' ' && chr < 0x7F ? chr : ' ')); + return (chr); + +} /* tty_getDebugChar */ + +static int count = 3; +static spinlock_t one_at_atime = SPIN_LOCK_UNLOCKED; + +static int __init +kgdb_enable_ints(void) +{ + if (kgdboe) { + return 0; + } + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 1); + } + ok_to_enable_ints = 1; + kgdb_enable_ints_now(); +#ifdef CONFIG_KGDB_USER_CONSOLE + kgdb_console_finit(); +#endif + return 0; +} + +#ifdef CONFIG_SERIAL_8250 +void shutdown_for_kgdb(struct async_struct *gdb_async_info); +#endif + +#ifdef CONFIG_DISCONTIGMEM +static inline int kgdb_mem_init_done(void) +{ + return highmem_start_page != NULL; +} +#else +static inline int kgdb_mem_init_done(void) +{ + return max_mapnr != 0; +} +#endif + +static void +kgdb_enable_ints_now(void) +{ + if (!spin_trylock(&one_at_atime)) + return; + if (!ints_disabled) + goto exit; + if (kgdb_mem_init_done() && + ints_disabled) { /* don't try till mem init */ +#ifdef CONFIG_SERIAL_8250 + /* + * The ifdef here allows the system to be configured + * without the serial driver. 
+ * Don't make it a module, however, it will steal the port + */ + shutdown_for_kgdb(gdb_async_info); +#endif + ints_disabled = request_irq(gdb_async_info->state->irq, + gdb_interrupt, + IRQ_T(gdb_async_info), + "KGDB-stub", NULL); + intprintk(("KGDB: request_irq returned %d\n", ints_disabled)); + } + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + exit: + spin_unlock(&one_at_atime); +} + +/* + * tty_putDebugChar + * + * This is a GDB stub routine. It waits until the interface is ready + * to transmit a char and then sends it. If there is no serial + * interface connection then it simply returns to its caller, having + * pretended to send the char. Caller takes needed protections. + */ +void +tty_putDebugChar(int chr) +{ + dbprintk(("tty_putDebugChar(port %x): chr=%02x '%c', ints_on=%d\n", + gdb_async_info->port, + chr, + chr > ' ' && chr < 0x7F ? chr : ' ', ints_disabled ? 0 : 1)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + + write_char(gdb_async_info, chr); /* this routine will wait */ + count = (chr == '#') ? 0 : count + 1; + if ((count == 2)) { /* try to enable after */ + if (ints_disabled & ok_to_enable_ints) + kgdb_enable_ints_now(); /* try to enable after */ + + /* We do this a lot because, well we really want to get these + * interrupts. The serial driver will clear these bits when it + * initializes the chip. Every thing else it does is ok, + * but this. + */ + if (!ints_disabled) { + outb_px(gdb_async_info->port + UART_IER, + gdb_async_info->IER); + } + } + +} /* tty_putDebugChar */ + +/* + * This does nothing for the serial port, since it doesn't buffer. 
+ */ + +void tty_flushDebugChar(void) +{ +} + +module_init(kgdb_enable_ints); --- linux-2.6.4-rc1/arch/i386/lib/Makefile 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/lib/Makefile 2004-02-29 13:08:50.000000000 -0800 @@ -9,4 +9,4 @@ lib-y = checksum.o delay.o \ lib-$(CONFIG_X86_USE_3DNOW) += mmx.o lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o -lib-$(CONFIG_DEBUG_IOVIRT) += iodebug.o +lib-$(CONFIG_KGDB) += kgdb_serial.o --- linux-2.6.4-rc1/arch/i386/lib/usercopy.c 2004-01-09 00:04:30.000000000 -0800 +++ 25/arch/i386/lib/usercopy.c 2004-02-29 13:09:29.000000000 -0800 @@ -76,7 +76,7 @@ do { \ * and returns @count. */ long -__strncpy_from_user(char *dst, const char __user *src, long count) +__direct_strncpy_from_user(char *dst, const char __user *src, long count) { long res; __do_strncpy_from_user(dst, src, count, res); @@ -102,7 +102,7 @@ __strncpy_from_user(char *dst, const cha * and returns @count. */ long -strncpy_from_user(char *dst, const char __user *src, long count) +direct_strncpy_from_user(char *dst, const char __user *src, long count) { long res = -EFAULT; if (access_ok(VERIFY_READ, src, 1)) @@ -147,7 +147,7 @@ do { \ * On success, this will be zero. */ unsigned long -clear_user(void __user *to, unsigned long n) +direct_clear_user(void __user *to, unsigned long n) { might_sleep(); if (access_ok(VERIFY_WRITE, to, n)) @@ -167,7 +167,7 @@ clear_user(void __user *to, unsigned lon * On success, this will be zero. */ unsigned long -__clear_user(void __user *to, unsigned long n) +__direct_clear_user(void __user *to, unsigned long n) { __do_clear_user(to, n); return n; @@ -184,7 +184,7 @@ __clear_user(void __user *to, unsigned l * On exception, returns 0. * If the string is too long, returns a value greater than @n. 
*/ -long strnlen_user(const char __user *s, long n) +long direct_strnlen_user(const char __user *s, long n) { unsigned long mask = -__addr_ok(s); unsigned long res, tmp; @@ -575,3 +575,4 @@ unsigned long __copy_from_user_ll(void * n = __copy_user_zeroing_intel(to, (const void *) from, n); return n; } + --- linux-2.6.4-rc1/arch/i386/mach-voyager/Makefile 2003-06-14 12:17:58.000000000 -0700 +++ 25/arch/i386/mach-voyager/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,11 +1,6 @@ # # Makefile for the linux kernel. # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# -# Note 2! The CFLAGS definitions are now in the main makefile... EXTRA_CFLAGS += -I../kernel obj-y := setup.o voyager_basic.o voyager_thread.o --- linux-2.6.4-rc1/arch/i386/Makefile 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/Makefile 2004-02-29 13:08:19.000000000 -0800 @@ -19,7 +19,7 @@ LDFLAGS := -m elf_i386 OBJCOPYFLAGS := -O binary -R .note -R .comment -S LDFLAGS_vmlinux := -CFLAGS += -pipe +CFLAGS += -pipe -msoft-float # prevent gcc from keeping the stack 16 byte aligned CFLAGS += $(call check_gcc,-mpreferred-stack-boundary=2,) @@ -97,6 +97,9 @@ mcore-$(CONFIG_X86_ES7000) := mach-es700 # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default +mflags-$(CONFIG_KGDB) += -gdwarf-2 +mflags-$(CONFIG_KGDB_MORE) += $(shell echo $(CONFIG_KGDB_OPTIONS) | sed -e 's/"//g') + head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o libs-y += arch/i386/lib/ --- linux-2.6.4-rc1/arch/i386/math-emu/fpu_system.h 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/math-emu/fpu_system.h 2004-02-29 13:09:29.000000000 -0800 @@ -15,6 +15,7 @@ #include #include #include +#include /* This sets the pointer FPU_info to point to the argument part of the stack frame of math_emulate() */ @@ -22,7 +23,7 @@ /* s is always from a cpu register, and the cpu 
does bounds checking * during register load --> no further bounds checks needed */ -#define LDT_DESCRIPTOR(s) (((struct desc_struct *)current->mm->context.ldt)[(s) >> 3]) +#define LDT_DESCRIPTOR(s) (((struct desc_struct *)__kmap_atomic_vaddr(KM_LDT_PAGE0))[(s) >> 3]) #define SEG_D_SIZE(x) ((x).b & (3 << 21)) #define SEG_G_BIT(x) ((x).b & (1 << 23)) #define SEG_GRANULARITY(x) (((x).b & (1 << 23)) ? 4096 : 1) --- linux-2.6.4-rc1/arch/i386/mm/discontig.c 2003-09-27 18:57:43.000000000 -0700 +++ 25/arch/i386/mm/discontig.c 2004-02-29 13:08:58.000000000 -0800 @@ -66,7 +66,7 @@ extern void find_max_pfn(void); extern void one_highpage_init(struct page *, int, int); extern struct e820map e820; -extern char _end; +extern unsigned long init_pg_tables_end; extern unsigned long highend_pfn, highstart_pfn; extern unsigned long max_low_pfn; extern unsigned long totalram_pages; @@ -237,7 +237,7 @@ unsigned long __init setup_memory(void) reserve_pages = calculate_numa_remap_pages(); /* partially used pages are not usable - thus round upwards */ - system_start_pfn = min_low_pfn = PFN_UP(__pa(&_end)); + system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); find_max_pfn(); system_max_low_pfn = max_low_pfn = find_max_low_pfn(); --- linux-2.6.4-rc1/arch/i386/mm/fault.c 2003-12-17 21:20:01.000000000 -0800 +++ 25/arch/i386/mm/fault.c 2004-02-29 13:09:29.000000000 -0800 @@ -27,6 +27,7 @@ #include #include #include +#include extern void die(const char *,struct pt_regs *,long); @@ -104,8 +105,17 @@ static inline unsigned long get_segment_ if (seg & (1<<2)) { /* Must lock the LDT while reading it. */ down(&current->mm->context.sem); +#if 1 + /* horrible hack for 4/4 disabled kernels.
+ I'm not quite sure what the TLB flush is good for, + it's mindlessly copied from the read_ldt code */ + __flush_tlb_global(); + desc = kmap(current->mm->context.ldt_pages[(seg&~7)/PAGE_SIZE]); + desc = (void *)desc + ((seg & ~7) % PAGE_SIZE); +#else desc = current->mm->context.ldt; desc = (void *)desc + (seg & ~7); +#endif } else { /* Must disable preemption while reading the GDT. */ desc = (u32 *)&cpu_gdt_table[get_cpu()]; @@ -118,6 +128,9 @@ static inline unsigned long get_segment_ (desc[1] & 0xff000000); if (seg & (1<<2)) { +#if 1 + kunmap((void *)((unsigned long)desc & PAGE_MASK)); +#endif up(¤t->mm->context.sem); } else put_cpu(); @@ -243,6 +256,19 @@ asmlinkage void do_page_fault(struct pt_ * (error_code & 4) == 0, and that the fault was not a * protection error (error_code & 1) == 0. */ +#ifdef CONFIG_X86_4G + /* + * On 4/4 all kernels faults are either bugs, vmalloc or prefetch + */ + if (unlikely((regs->xcs & 3) == 0)) { + if (error_code & 3) + goto bad_area_nosemaphore; + + /* If it's vm86 fall through */ + if (!(regs->eflags & VM_MASK)) + goto vmalloc_fault; + } +#else if (unlikely(address >= TASK_SIZE)) { if (!(error_code & 5)) goto vmalloc_fault; @@ -252,6 +278,7 @@ asmlinkage void do_page_fault(struct pt_ */ goto bad_area_nosemaphore; } +#endif mm = tsk->mm; @@ -403,6 +430,12 @@ no_context: * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. 
*/ +#ifdef CONFIG_KGDB + if (!user_mode(regs)){ + kgdb_handle_exception(14,SIGBUS, error_code, regs); + return; + } +#endif bust_spinlocks(1); --- linux-2.6.4-rc1/arch/i386/mm/hugetlbpage.c 2004-01-09 00:04:30.000000000 -0800 +++ 25/arch/i386/mm/hugetlbpage.c 2004-02-29 13:08:35.000000000 -0800 @@ -61,6 +61,27 @@ static struct page *alloc_fresh_huge_pag static void free_huge_page(struct page *page); +#ifdef CONFIG_NUMA + +static inline void huge_inc_rss(struct mm_struct *mm, struct page *page) +{ + mm->rss += (HPAGE_SIZE / PAGE_SIZE); + mm->pernode_rss[page_nodenum(page)] += (HPAGE_SIZE / PAGE_SIZE); +} + +static inline void huge_dec_rss(struct mm_struct *mm, struct page *page) +{ + mm->rss -= (HPAGE_SIZE / PAGE_SIZE); + mm->pernode_rss[page_nodenum(page)] -= (HPAGE_SIZE / PAGE_SIZE); +} + +#else /* !CONFIG_NUMA */ + +#define huge_inc_rss(mm, page) ((mm)->rss += (HPAGE_SIZE / PAGE_SIZE)) +#define huge_dec_rss(mm, page) ((mm)->rss -= (HPAGE_SIZE / PAGE_SIZE)) + +#endif /* CONFIG_NUMA */ + static struct page *alloc_hugetlb_page(void) { int i; @@ -105,7 +126,7 @@ static void set_huge_pte(struct mm_struc { pte_t entry; - mm->rss += (HPAGE_SIZE / PAGE_SIZE); + huge_inc_rss(mm, page); if (write_access) { entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); @@ -145,7 +166,7 @@ int copy_hugetlb_page_range(struct mm_st ptepage = pte_page(entry); get_page(ptepage); set_pte(dst_pte, entry); - dst->rss += (HPAGE_SIZE / PAGE_SIZE); + huge_inc_rss(dst, ptepage); addr += HPAGE_SIZE; } return 0; @@ -314,8 +335,8 @@ void unmap_hugepage_range(struct vm_area page = pte_page(*pte); huge_page_release(page); pte_clear(pte); + huge_dec_rss(mm, page); } - mm->rss -= (end - start) >> PAGE_SHIFT; flush_tlb_range(vma, start, end); } --- linux-2.6.4-rc1/arch/i386/mm/init.c 2004-01-09 00:04:30.000000000 -0800 +++ 25/arch/i386/mm/init.c 2004-02-29 13:09:29.000000000 -0800 @@ -40,125 +40,13 @@ #include #include #include +#include DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 
unsigned long highstart_pfn, highend_pfn; static int do_test_wp_bit(void); -/* - * Creates a middle page table and puts a pointer to it in the - * given global directory entry. This only returns the gd entry - * in non-PAE compilation mode, since the middle layer is folded. - */ -static pmd_t * __init one_md_table_init(pgd_t *pgd) -{ - pmd_t *pmd_table; - -#ifdef CONFIG_X86_PAE - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - if (pmd_table != pmd_offset(pgd, 0)) - BUG(); -#else - pmd_table = pmd_offset(pgd, 0); -#endif - - return pmd_table; -} - -/* - * Create a page table and place a pointer to it in a middle page - * directory entry. - */ -static pte_t * __init one_page_table_init(pmd_t *pmd) -{ - if (pmd_none(*pmd)) { - pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); - if (page_table != pte_offset_kernel(pmd, 0)) - BUG(); - - return page_table; - } - - return pte_offset_kernel(pmd, 0); -} - -/* - * This function initializes a certain range of kernel virtual memory - * with new bootmem page tables, everywhere page tables are missing in - * the given range. - */ - -/* - * NOTE: The pagetables are allocated contiguous on the physical space - * so we can cache the place of the first one and move around without - * checking the pgd every time. 
- */ -static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) -{ - pgd_t *pgd; - pmd_t *pmd; - int pgd_idx, pmd_idx; - unsigned long vaddr; - - vaddr = start; - pgd_idx = pgd_index(vaddr); - pmd_idx = pmd_index(vaddr); - pgd = pgd_base + pgd_idx; - - for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { - if (pgd_none(*pgd)) - one_md_table_init(pgd); - - pmd = pmd_offset(pgd, vaddr); - for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { - if (pmd_none(*pmd)) - one_page_table_init(pmd); - - vaddr += PMD_SIZE; - } - pmd_idx = 0; - } -} - -/* - * This maps the physical memory to kernel virtual address space, a total - * of max_low_pfn pages, by creating page tables starting from address - * PAGE_OFFSET. - */ -static void __init kernel_physical_mapping_init(pgd_t *pgd_base) -{ - unsigned long pfn; - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - int pgd_idx, pmd_idx, pte_ofs; - - pgd_idx = pgd_index(PAGE_OFFSET); - pgd = pgd_base + pgd_idx; - pfn = 0; - - for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { - pmd = one_md_table_init(pgd); - if (pfn >= max_low_pfn) - continue; - for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { - /* Map with big pages if possible, otherwise create normal page tables. 
*/ - if (cpu_has_pse) { - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); - pfn += PTRS_PER_PTE; - } else { - pte = one_page_table_init(pmd); - - for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); - } - } - } -} - static inline int page_kills_ppro(unsigned long pagenr) { if (pagenr >= 0x70000 && pagenr <= 0x7003F) @@ -206,11 +94,8 @@ static inline int page_is_ram(unsigned l return 0; } -#ifdef CONFIG_HIGHMEM pte_t *kmap_pte; -pgprot_t kmap_prot; -EXPORT_SYMBOL(kmap_prot); EXPORT_SYMBOL(kmap_pte); #define kmap_get_fixmap_pte(vaddr) \ @@ -218,29 +103,7 @@ EXPORT_SYMBOL(kmap_pte); void __init kmap_init(void) { - unsigned long kmap_vstart; - - /* cache the first kmap pte */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; -} - -void __init permanent_kmaps_init(pgd_t *pgd_base) -{ - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - unsigned long vaddr; - - vaddr = PKMAP_BASE; - page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); - - pgd = swapper_pg_dir + pgd_index(vaddr); - pmd = pmd_offset(pgd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; + kmap_pte = kmap_get_fixmap_pte(__fix_to_virt(FIX_KMAP_BEGIN)); } void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) @@ -255,6 +118,8 @@ void __init one_highpage_init(struct pag SetPageReserved(page); } +#ifdef CONFIG_HIGHMEM + #ifndef CONFIG_DISCONTIGMEM void __init set_highmem_pages_init(int bad_ppro) { @@ -266,12 +131,9 @@ void __init set_highmem_pages_init(int b #else extern void set_highmem_pages_init(int); #endif /* !CONFIG_DISCONTIGMEM */ - #else -#define kmap_init() do { } while (0) -#define permanent_kmaps_init(pgd_base) do { } while (0) -#define set_highmem_pages_init(bad_ppro) do { } while (0) -#endif /* CONFIG_HIGHMEM */ +# define set_highmem_pages_init(bad_ppro) do { } while (0) +#endif unsigned long 
__PAGE_KERNEL = _PAGE_KERNEL; @@ -281,30 +143,125 @@ unsigned long __PAGE_KERNEL = _PAGE_KERN extern void __init remap_numa_kva(void); #endif -static void __init pagetable_init (void) +static __init void prepare_pagetables(pgd_t *pgd_base, unsigned long address) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_base + pgd_index(address); + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) { + pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); + } +} + +static void __init fixrange_init (unsigned long start, unsigned long end, pgd_t *pgd_base) { unsigned long vaddr; - pgd_t *pgd_base = swapper_pg_dir; + for (vaddr = start; vaddr != end; vaddr += PAGE_SIZE) + prepare_pagetables(pgd_base, vaddr); +} + +void setup_identity_mappings(pgd_t *pgd_base, unsigned long start, unsigned long end) +{ + unsigned long vaddr; + pgd_t *pgd; + int i, j, k; + pmd_t *pmd; + pte_t *pte, *pte_base; + + pgd = pgd_base; + + for (i = 0; i < PTRS_PER_PGD; pgd++, i++) { + vaddr = i*PGDIR_SIZE; + if (end && (vaddr >= end)) + break; + pmd = pmd_offset(pgd, 0); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE; + if (end && (vaddr >= end)) + break; + if (vaddr < start) + continue; + if (cpu_has_pse) { + unsigned long __pe; + + set_in_cr4(X86_CR4_PSE); + boot_cpu_data.wp_works_ok = 1; + __pe = _KERNPG_TABLE + _PAGE_PSE + vaddr - start; + /* Make it "global" too if supported */ + if (cpu_has_pge) { + set_in_cr4(X86_CR4_PGE); +#if !defined(CONFIG_X86_SWITCH_PAGETABLES) + __pe += _PAGE_GLOBAL; + __PAGE_KERNEL |= _PAGE_GLOBAL; +#endif + } + set_pmd(pmd, __pmd(__pe)); + continue; + } + if (!pmd_present(*pmd)) + pte_base = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + else + pte_base = (pte_t *) page_address(pmd_page(*pmd)); + pte = pte_base; + for (k = 0; k < PTRS_PER_PTE; pte++, k++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE + k*PAGE_SIZE; + if (end && (vaddr >= end)) + break; + if (vaddr < start) + 
continue; + *pte = mk_pte_phys(vaddr-start, PAGE_KERNEL); + } + set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base))); + } + } +} + +static void __init pagetable_init (void) +{ + unsigned long vaddr, end; + pgd_t *pgd_base; #ifdef CONFIG_X86_PAE int i; - /* Init entries of the first-level page table to the zero page */ - for (i = 0; i < PTRS_PER_PGD; i++) - set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); #endif - /* Enable PSE if available */ - if (cpu_has_pse) { - set_in_cr4(X86_CR4_PSE); - } + /* + * This can be zero as well - no problem, in that case we exit + * the loops anyway due to the PTRS_PER_* conditions. + */ + end = (unsigned long)__va(max_low_pfn*PAGE_SIZE); - /* Enable PGE if available */ - if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); - __PAGE_KERNEL |= _PAGE_GLOBAL; + pgd_base = swapper_pg_dir; +#ifdef CONFIG_X86_PAE + /* + * It causes too many problems if there's no proper pmd set up + * for all 4 entries of the PGD - so we allocate all of them. + * PAE systems will not miss this extra 4-8K anyway ... + */ + for (i = 0; i < PTRS_PER_PGD; i++) { + pmd_t *pmd = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + set_pgd(pgd_base + i, __pgd(__pa(pmd) + 0x1)); } +#endif + /* + * Set up lowmem-sized identity mappings at PAGE_OFFSET: + */ + setup_identity_mappings(pgd_base, PAGE_OFFSET, end); - kernel_physical_mapping_init(pgd_base); + /* + * Add flat-mode identity-mappings - SMP needs it when + * starting up on an AP from real-mode. (In the non-PAE + * case we already have these mappings through head.S.) + * All user-space mappings are explicitly cleared after + * SMP startup. 
+ */ +#if defined(CONFIG_SMP) && defined(CONFIG_X86_PAE) + setup_identity_mappings(pgd_base, 0, 16*1024*1024); +#endif remap_numa_kva(); /* @@ -312,38 +269,64 @@ static void __init pagetable_init (void) * created - mappings will be set by set_fixmap(): */ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - page_table_range_init(vaddr, 0, pgd_base); + fixrange_init(vaddr, 0, pgd_base); - permanent_kmaps_init(pgd_base); +#ifdef CONFIG_HIGHMEM + { + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; -#ifdef CONFIG_X86_PAE - /* - * Add low memory identity-mappings - SMP needs it when - * starting up on an AP from real-mode. In the non-PAE - * case we already have these mappings through head.S. - * All user-space mappings are explicitly cleared after - * SMP startup. - */ - pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; + /* + * Permanent kmaps: + */ + vaddr = PKMAP_BASE; + fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); + + pgd = swapper_pg_dir + pgd_index(vaddr); + pmd = pmd_offset(pgd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + pkmap_page_table = pte; + } #endif } -void zap_low_mappings (void) +/* + * Clear kernel pagetables in a PMD_SIZE-aligned range. + */ +static void clear_mappings(pgd_t *pgd_base, unsigned long start, unsigned long end) { - int i; + unsigned long vaddr; + pgd_t *pgd; + pmd_t *pmd; + int i, j; + + pgd = pgd_base; + + for (i = 0; i < PTRS_PER_PGD; pgd++, i++) { + vaddr = i*PGDIR_SIZE; + if (end && (vaddr >= end)) + break; + pmd = pmd_offset(pgd, 0); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE; + if (end && (vaddr >= end)) + break; + if (vaddr < start) + continue; + pmd_clear(pmd); + } + } + flush_tlb_all(); +} + +void zap_low_mappings(void) +{ + printk("zapping low mappings.\n"); /* * Zap initial low-memory mappings. - * - * Note that "pgd_clear()" doesn't do it for - * us, because pgd_clear() is a no-op on i386. 
*/ - for (i = 0; i < USER_PTRS_PER_PGD; i++) -#ifdef CONFIG_X86_PAE - set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); -#else - set_pgd(swapper_pg_dir+i, __pgd(0)); -#endif - flush_tlb_all(); + clear_mappings(swapper_pg_dir, 0, 16*1024*1024); } #ifndef CONFIG_DISCONTIGMEM @@ -393,7 +376,15 @@ void __init paging_init(void) set_in_cr4(X86_CR4_PAE); #endif __flush_tlb_all(); - + /* + * Subtle. SMP is doing it's boot stuff late (because it has to + * fork idle threads) - but it also needs low mappings for the + * protected-mode entry to work. We zap these entries only after + * the WP-bit has been tested. + */ +#ifndef CONFIG_SMP + zap_low_mappings(); +#endif kmap_init(); zone_sizes_init(); } @@ -515,22 +506,18 @@ void __init mem_init(void) if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); - /* - * Subtle. SMP is doing it's boot stuff late (because it has to - * fork idle threads) - but it also needs low mappings for the - * protected-mode entry to work. We zap these entries only after - * the WP-bit has been tested. 
- */ -#ifndef CONFIG_SMP - zap_low_mappings(); -#endif + entry_trampoline_setup(); + default_ldt_page = virt_to_page(default_ldt); + load_LDT(&init_mm.context); } -kmem_cache_t *pgd_cache; -kmem_cache_t *pmd_cache; +kmem_cache_t *pgd_cache, *pmd_cache, *kpmd_cache; void __init pgtable_cache_init(void) { + void (*ctor)(void *, kmem_cache_t *, unsigned long); + void (*dtor)(void *, kmem_cache_t *, unsigned long); + if (PTRS_PER_PMD > 1) { pmd_cache = kmem_cache_create("pmd", PTRS_PER_PMD*sizeof(pmd_t), @@ -540,13 +527,36 @@ void __init pgtable_cache_init(void) NULL); if (!pmd_cache) panic("pgtable_cache_init(): cannot create pmd cache"); + + if (TASK_SIZE > PAGE_OFFSET) { + kpmd_cache = kmem_cache_create("kpmd", + PTRS_PER_PMD*sizeof(pmd_t), + 0, + SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, + kpmd_ctor, + NULL); + if (!kpmd_cache) + panic("pgtable_cache_init(): " + "cannot create kpmd cache"); + } } + + if (PTRS_PER_PMD == 1 || TASK_SIZE <= PAGE_OFFSET) + ctor = pgd_ctor; + else + ctor = NULL; + + if (PTRS_PER_PMD == 1 && TASK_SIZE <= PAGE_OFFSET) + dtor = pgd_dtor; + else + dtor = NULL; + pgd_cache = kmem_cache_create("pgd", PTRS_PER_PGD*sizeof(pgd_t), 0, SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, - pgd_ctor, - PTRS_PER_PMD == 1 ? 
pgd_dtor : NULL); + ctor, + dtor); if (!pgd_cache) panic("pgtable_cache_init(): Cannot create pgd cache"); } --- linux-2.6.4-rc1/arch/i386/mm/pgtable.c 2003-11-09 16:45:05.000000000 -0800 +++ 25/arch/i386/mm/pgtable.c 2004-02-29 13:09:29.000000000 -0800 @@ -21,6 +21,7 @@ #include #include #include +#include void show_mem(void) { @@ -157,11 +158,20 @@ void pmd_ctor(void *pmd, kmem_cache_t *c memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); } +void kpmd_ctor(void *__pmd, kmem_cache_t *cache, unsigned long flags) +{ + pmd_t *kpmd, *pmd; + kpmd = pmd_offset(&swapper_pg_dir[PTRS_PER_PGD-1], + (PTRS_PER_PMD - NR_SHARED_PMDS)*PMD_SIZE); + pmd = (pmd_t *)__pmd + (PTRS_PER_PMD - NR_SHARED_PMDS); + + memset(__pmd, 0, (PTRS_PER_PMD - NR_SHARED_PMDS)*sizeof(pmd_t)); + memcpy(pmd, kpmd, NR_SHARED_PMDS*sizeof(pmd_t)); +} + /* - * List of all pgd's needed for non-PAE so it can invalidate entries - * in both cached and uncached pgd's; not needed for PAE since the - * kernel pmd is shared. If PAE were not to share the pmd a similar - * tactic would be needed. This is essentially codepath-based locking + * List of all pgd's needed so it can invalidate entries in both cached + * and uncached pgd's. This is essentially codepath-based locking * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. @@ -170,30 +180,60 @@ void pmd_ctor(void *pmd, kmem_cache_t *c * could be used. The locking scheme was chosen on the basis of * manfred's recommendations and having no core impact whatsoever. * -- wli + * + * The entire issue goes away when XKVA is configured. */ spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED; LIST_HEAD(pgd_list); -void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) +/* + * This is not that hard to figure out. + * (a) PTRS_PER_PMD == 1 means non-PAE. + * (b) PTRS_PER_PMD > 1 means PAE. 
+ * (c) TASK_SIZE > PAGE_OFFSET means XKVA. + * (d) TASK_SIZE <= PAGE_OFFSET means non-XKVA. + * + * Do *NOT* back out the preconstruction like the patch I'm cleaning + * up after this very instant did, or at all, for that matter. + * This is never called when PTRS_PER_PMD > 1 && TASK_SIZE > PAGE_OFFSET. + * -- wli + */ +void pgd_ctor(void *__pgd, kmem_cache_t *cache, unsigned long unused) { + pgd_t *pgd = (pgd_t *)__pgd; unsigned long flags; - if (PTRS_PER_PMD == 1) - spin_lock_irqsave(&pgd_lock, flags); + if (PTRS_PER_PMD == 1) { + if (TASK_SIZE <= PAGE_OFFSET) + spin_lock_irqsave(&pgd_lock, flags); + else + memcpy(&pgd[PTRS_PER_PGD - NR_SHARED_PMDS], + &swapper_pg_dir[PTRS_PER_PGD - NR_SHARED_PMDS], + NR_SHARED_PMDS * sizeof(pgd_t)); + } - memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + if (TASK_SIZE <= PAGE_OFFSET) + memcpy(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); if (PTRS_PER_PMD > 1) return; - list_add(&virt_to_page(pgd)->lru, &pgd_list); - spin_unlock_irqrestore(&pgd_lock, flags); - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + if (TASK_SIZE > PAGE_OFFSET) + memset(pgd, 0, (PTRS_PER_PGD - NR_SHARED_PMDS)*sizeof(pgd_t)); + else { + list_add(&virt_to_page(pgd)->lru, &pgd_list); + spin_unlock_irqrestore(&pgd_lock, flags); + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + } } -/* never called when PTRS_PER_PMD > 1 */ +/* + * Never called when PTRS_PER_PMD > 1 || TASK_SIZE > PAGE_OFFSET + * for with PAE we would list_del() multiple times, and for non-PAE + * with XKVA all the AGP pgd shootdown code is unnecessary. 
+ */ void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) { unsigned long flags; /* can be called from interrupt context */ @@ -203,6 +243,12 @@ void pgd_dtor(void *pgd, kmem_cache_t *c spin_unlock_irqrestore(&pgd_lock, flags); } +/* + * See the comments above pgd_ctor() wrt. preconstruction. + * Do *NOT* memcpy() here. If you do, you back out important + * anti- cache pollution code. + * + */ pgd_t *pgd_alloc(struct mm_struct *mm) { int i; @@ -211,15 +257,33 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (PTRS_PER_PMD == 1 || !pgd) return pgd; + /* + * In the 4G userspace case alias the top 16 MB virtual + * memory range into the user mappings as well (these + * include the trampoline and CPU data structures). + */ for (i = 0; i < USER_PTRS_PER_PGD; ++i) { - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + kmem_cache_t *cache; + pmd_t *pmd; + + if (TASK_SIZE > PAGE_OFFSET && i == USER_PTRS_PER_PGD - 1) + cache = kpmd_cache; + else + cache = pmd_cache; + + pmd = kmem_cache_alloc(cache, GFP_KERNEL); if (!pmd) goto out_oom; set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd)))); } - return pgd; + return pgd; out_oom: + /* + * we don't have to handle the kpmd_cache here, since it's the + * last allocation, and has either nothing to free or when it + * succeeds the whole operation succeeds. 
+ */ for (i--; i >= 0; i--) kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); kmem_cache_free(pgd_cache, pgd); @@ -230,10 +294,29 @@ void pgd_free(pgd_t *pgd) { int i; - /* in the PAE case user pgd entries are overwritten before usage */ - if (PTRS_PER_PMD > 1) - for (i = 0; i < USER_PTRS_PER_PGD; ++i) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); /* in the non-PAE case, clear_page_tables() clears user pgd entries */ + if (PTRS_PER_PMD == 1) + goto out_free; + + /* in the PAE case user pgd entries are overwritten before usage */ + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + kmem_cache_t *cache; + pmd_t *pmd = __va(pgd_val(pgd[i]) - 1); + + /* + * only userspace pmd's are cleared for us + * by mm/memory.c; it's a slab cache invariant + * that we must separate the kernel pmd slab + * all times, else we'll have bad pmd's. + */ + if (TASK_SIZE > PAGE_OFFSET && i == USER_PTRS_PER_PGD - 1) + cache = kpmd_cache; + else + cache = pmd_cache; + + kmem_cache_free(cache, pmd); + } +out_free: kmem_cache_free(pgd_cache, pgd); } + --- linux-2.6.4-rc1/arch/i386/oprofile/nmi_int.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/i386/oprofile/nmi_int.c 2004-02-29 13:07:44.000000000 -0800 @@ -65,14 +65,14 @@ static int __init init_driverfs(void) { int error; if (!(error = sysdev_class_register(&oprofile_sysclass))) - error = sys_device_register(&device_oprofile); + error = sysdev_register(&device_oprofile); return error; } static void __exit exit_driverfs(void) { - sys_device_unregister(&device_oprofile); + sysdev_unregister(&device_oprofile); sysdev_class_unregister(&oprofile_sysclass); } --- linux-2.6.4-rc1/arch/i386/oprofile/op_model_p4.c 2003-08-22 19:23:40.000000000 -0700 +++ 25/arch/i386/oprofile/op_model_p4.c 2004-02-29 13:08:14.000000000 -0800 @@ -382,11 +382,8 @@ static struct p4_event_binding p4_events static unsigned int get_stagger(void) { #ifdef CONFIG_SMP - int cpu; - if (smp_num_siblings > 1) { - cpu = smp_processor_id(); - return 
(cpu_sibling_map[cpu] > cpu) ? 0 : 1; - } + int cpu = smp_processor_id(); + return (cpu != first_cpu(cpu_sibling_map[cpu])); #endif return 0; } --- linux-2.6.4-rc1/arch/i386/pci/common.c 2004-02-03 20:42:34.000000000 -0800 +++ 25/arch/i386/pci/common.c 2004-02-29 13:08:34.000000000 -0800 @@ -20,7 +20,8 @@ extern void pcibios_sort(void); #endif -unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2; +unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | + PCI_PROBE_MMCONF; int pcibios_last_bus = -1; struct pci_bus *pci_root_bus = NULL; @@ -198,6 +199,12 @@ char * __devinit pcibios_setup(char *st return NULL; } #endif +#ifdef CONFIG_PCI_MMCONFIG + else if (!strcmp(str, "nommconf")) { + pci_probe &= ~PCI_PROBE_MMCONF; + return NULL; + } +#endif else if (!strcmp(str, "noacpi")) { acpi_noirq_set(); return NULL; --- linux-2.6.4-rc1/arch/i386/pci/Makefile 2003-07-02 14:53:12.000000000 -0700 +++ 25/arch/i386/pci/Makefile 2004-02-29 13:08:34.000000000 -0800 @@ -1,6 +1,7 @@ obj-y := i386.o obj-$(CONFIG_PCI_BIOS) += pcbios.o +obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o obj-$(CONFIG_PCI_DIRECT) += direct.o pci-y := fixup.o --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/arch/i386/pci/mmconfig.c 2004-02-29 13:08:34.000000000 -0800 @@ -0,0 +1,109 @@ +/* + * mmconfig.c - Low-level direct PCI config space access via MMCONFIG + */ + +#include +#include +#include "pci.h" + +/* The physical address of the MMCONFIG aperture. Set from ACPI tables. 
*/ +u32 pci_mmcfg_base_addr; + +#define mmcfg_virt_addr (fix_to_virt(FIX_PCIE_MCFG)) + +/* The base address of the last MMCONFIG device accessed */ +static u32 mmcfg_last_accessed_device; + +/* + * Functions for accessing PCI configuration space with MMCONFIG accesses + */ + +static inline void pci_exp_set_dev_base(int bus, int devfn) +{ + u32 dev_base = pci_mmcfg_base_addr | (bus << 20) | (devfn << 12); + if (dev_base != mmcfg_last_accessed_device) { + mmcfg_last_accessed_device = dev_base; + set_fixmap(FIX_PCIE_MCFG, dev_base); + } +} + +static int pci_mmcfg_read(int seg, int bus, int devfn, int reg, int len, u32 *value) +{ + unsigned long flags; + + if (!value || (bus > 255) || (devfn > 255) || (reg > 4095)) + return -EINVAL; + + spin_lock_irqsave(&pci_config_lock, flags); + + pci_exp_set_dev_base(bus, devfn); + + switch (len) { + case 1: + *value = readb(mmcfg_virt_addr + reg); + break; + case 2: + *value = readw(mmcfg_virt_addr + reg); + break; + case 4: + *value = readl(mmcfg_virt_addr + reg); + break; + } + + spin_unlock_irqrestore(&pci_config_lock, flags); + + return 0; +} + +static int pci_mmcfg_write(int seg, int bus, int devfn, int reg, int len, u32 value) +{ + unsigned long flags; + + if ((bus > 255) || (devfn > 255) || (reg > 4095)) + return -EINVAL; + + spin_lock_irqsave(&pci_config_lock, flags); + + pci_exp_set_dev_base(bus, devfn); + + switch (len) { + case 1: + writeb(value, mmcfg_virt_addr + reg); + break; + case 2: + writew(value, mmcfg_virt_addr + reg); + break; + case 4: + writel(value, mmcfg_virt_addr + reg); + break; + } + + /* Dummy read to flush PCI write */ + readl(mmcfg_virt_addr); + + spin_unlock_irqrestore(&pci_config_lock, flags); + + return 0; +} + +static struct pci_raw_ops pci_mmcfg = { + .read = pci_mmcfg_read, + .write = pci_mmcfg_write, +}; + +static int __init pci_mmcfg_init(void) +{ + if ((pci_probe & PCI_PROBE_MMCONF) == 0) + goto out; + if (!pci_mmcfg_base_addr) + goto out; + + printk(KERN_INFO "PCI: Using MMCONFIG\n"); + 
raw_pci_ops = &pci_mmcfg; + pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; + + out: + return 0; +} + +arch_initcall(pci_mmcfg_init); --- linux-2.6.4-rc1/arch/i386/pci/pci.h 2004-02-03 20:42:34.000000000 -0800 +++ 25/arch/i386/pci/pci.h 2004-02-29 13:08:34.000000000 -0800 @@ -15,6 +15,9 @@ #define PCI_PROBE_BIOS 0x0001 #define PCI_PROBE_CONF1 0x0002 #define PCI_PROBE_CONF2 0x0004 +#define PCI_PROBE_MMCONF 0x0008 +#define PCI_PROBE_MASK 0x000f + #define PCI_NO_SORT 0x0100 #define PCI_BIOS_SORT 0x0200 #define PCI_NO_CHECKS 0x0400 --- linux-2.6.4-rc1/arch/ia64/ia32/sys_ia32.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/ia64/ia32/sys_ia32.c 2004-02-29 13:07:56.000000000 -0800 @@ -1023,143 +1023,6 @@ sys32_writev (int fd, struct compat_iove return ret; } -/* - * sys32_ipc() is the de-multiplexer for the SysV IPC calls in 32bit emulation.. - * - * This is really horribly ugly. - */ - -struct msgbuf32 { s32 mtype; char mtext[1]; }; - -struct ipc_perm32 { - key_t key; - compat_uid_t uid; - compat_gid_t gid; - compat_uid_t cuid; - compat_gid_t cgid; - compat_mode_t mode; - unsigned short seq; -}; - -struct ipc64_perm32 { - key_t key; - compat_uid32_t uid; - compat_gid32_t gid; - compat_uid32_t cuid; - compat_gid32_t cgid; - compat_mode_t mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - unsigned int unused1; - unsigned int unused2; -}; - -struct semid_ds32 { - struct ipc_perm32 sem_perm; /* permissions .. see ipc.h */ - compat_time_t sem_otime; /* last semop time */ - compat_time_t sem_ctime; /* last change time */ - u32 sem_base; /* ptr to first semaphore in array */ - u32 sem_pending; /* pending operations to be processed */ - u32 sem_pending_last; /* last pending operation */ - u32 undo; /* undo requests on this array */ - unsigned short sem_nsems; /* no. 
of semaphores in array */ -}; - -struct semid64_ds32 { - struct ipc64_perm32 sem_perm; - compat_time_t sem_otime; - unsigned int __unused1; - compat_time_t sem_ctime; - unsigned int __unused2; - unsigned int sem_nsems; - unsigned int __unused3; - unsigned int __unused4; -}; - -struct msqid_ds32 { - struct ipc_perm32 msg_perm; - u32 msg_first; - u32 msg_last; - compat_time_t msg_stime; - compat_time_t msg_rtime; - compat_time_t msg_ctime; - u32 wwait; - u32 rwait; - unsigned short msg_cbytes; - unsigned short msg_qnum; - unsigned short msg_qbytes; - compat_ipc_pid_t msg_lspid; - compat_ipc_pid_t msg_lrpid; -}; - -struct msqid64_ds32 { - struct ipc64_perm32 msg_perm; - compat_time_t msg_stime; - unsigned int __unused1; - compat_time_t msg_rtime; - unsigned int __unused2; - compat_time_t msg_ctime; - unsigned int __unused3; - unsigned int msg_cbytes; - unsigned int msg_qnum; - unsigned int msg_qbytes; - compat_pid_t msg_lspid; - compat_pid_t msg_lrpid; - unsigned int __unused4; - unsigned int __unused5; -}; - -struct shmid_ds32 { - struct ipc_perm32 shm_perm; - int shm_segsz; - compat_time_t shm_atime; - compat_time_t shm_dtime; - compat_time_t shm_ctime; - compat_ipc_pid_t shm_cpid; - compat_ipc_pid_t shm_lpid; - unsigned short shm_nattch; -}; - -struct shmid64_ds32 { - struct ipc64_perm32 shm_perm; - compat_size_t shm_segsz; - compat_time_t shm_atime; - unsigned int __unused1; - compat_time_t shm_dtime; - unsigned int __unused2; - compat_time_t shm_ctime; - unsigned int __unused3; - compat_pid_t shm_cpid; - compat_pid_t shm_lpid; - unsigned int shm_nattch; - unsigned int __unused4; - unsigned int __unused5; -}; - -struct shminfo64_32 { - unsigned int shmmax; - unsigned int shmmin; - unsigned int shmmni; - unsigned int shmseg; - unsigned int shmall; - unsigned int __unused1; - unsigned int __unused2; - unsigned int __unused3; - unsigned int __unused4; -}; - -struct shm_info32 { - int used_ids; - u32 shm_tot, shm_rss, shm_swp; - u32 swap_attempts, swap_successes; -}; 
- -struct ipc_kludge { - u32 msgp; - s32 msgtyp; -}; - #define SEMOP 1 #define SEMGET 2 #define SEMCTL 3 @@ -1173,454 +1036,6 @@ struct ipc_kludge { #define SHMGET 23 #define SHMCTL 24 -#define IPCOP_MASK(__x) (1UL << (__x)) - -static int -ipc_parse_version32 (int *cmd) -{ - if (*cmd & IPC_64) { - *cmd ^= IPC_64; - return IPC_64; - } else { - return IPC_OLD; - } -} - -static int -semctl32 (int first, int second, int third, void *uptr) -{ - union semun fourth; - u32 pad; - int err = 0, err2; - struct semid64_ds s; - mm_segment_t old_fs; - int version = ipc_parse_version32(&third); - - if (!uptr) - return -EINVAL; - if (get_user(pad, (u32 *)uptr)) - return -EFAULT; - if (third == SETVAL) - fourth.val = (int)pad; - else - fourth.__pad = (void *)A(pad); - switch (third) { - default: - err = -EINVAL; - break; - - case IPC_INFO: - case IPC_RMID: - case IPC_SET: - case SEM_INFO: - case GETVAL: - case GETPID: - case GETNCNT: - case GETZCNT: - case GETALL: - case SETVAL: - case SETALL: - err = sys_semctl(first, second, third, fourth); - break; - - case IPC_STAT: - case SEM_STAT: - fourth.__pad = &s; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_semctl(first, second, third, fourth); - set_fs(old_fs); - - if (version == IPC_64) { - struct semid64_ds32 *usp64 = (struct semid64_ds32 *) A(pad); - - if (!access_ok(VERIFY_WRITE, usp64, sizeof(*usp64))) { - err = -EFAULT; - break; - } - err2 = __put_user(s.sem_perm.key, &usp64->sem_perm.key); - err2 |= __put_user(s.sem_perm.uid, &usp64->sem_perm.uid); - err2 |= __put_user(s.sem_perm.gid, &usp64->sem_perm.gid); - err2 |= __put_user(s.sem_perm.cuid, &usp64->sem_perm.cuid); - err2 |= __put_user(s.sem_perm.cgid, &usp64->sem_perm.cgid); - err2 |= __put_user(s.sem_perm.mode, &usp64->sem_perm.mode); - err2 |= __put_user(s.sem_perm.seq, &usp64->sem_perm.seq); - err2 |= __put_user(s.sem_otime, &usp64->sem_otime); - err2 |= __put_user(s.sem_ctime, &usp64->sem_ctime); - err2 |= __put_user(s.sem_nsems, &usp64->sem_nsems); - } else { - 
struct semid_ds32 *usp32 = (struct semid_ds32 *) A(pad); - - if (!access_ok(VERIFY_WRITE, usp32, sizeof(*usp32))) { - err = -EFAULT; - break; - } - err2 = __put_user(s.sem_perm.key, &usp32->sem_perm.key); - err2 |= __put_user(s.sem_perm.uid, &usp32->sem_perm.uid); - err2 |= __put_user(s.sem_perm.gid, &usp32->sem_perm.gid); - err2 |= __put_user(s.sem_perm.cuid, &usp32->sem_perm.cuid); - err2 |= __put_user(s.sem_perm.cgid, &usp32->sem_perm.cgid); - err2 |= __put_user(s.sem_perm.mode, &usp32->sem_perm.mode); - err2 |= __put_user(s.sem_perm.seq, &usp32->sem_perm.seq); - err2 |= __put_user(s.sem_otime, &usp32->sem_otime); - err2 |= __put_user(s.sem_ctime, &usp32->sem_ctime); - err2 |= __put_user(s.sem_nsems, &usp32->sem_nsems); - } - if (err2) - err = -EFAULT; - break; - } - return err; -} - -static int -do_sys32_msgsnd (int first, int second, int third, void *uptr) -{ - struct msgbuf *p = kmalloc(second + sizeof(struct msgbuf), GFP_USER); - struct msgbuf32 *up = (struct msgbuf32 *)uptr; - mm_segment_t old_fs; - int err; - - if (!p) - return -ENOMEM; - err = get_user(p->mtype, &up->mtype); - err |= copy_from_user(p->mtext, &up->mtext, second); - if (err) - goto out; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_msgsnd(first, p, second, third); - set_fs(old_fs); - out: - kfree(p); - return err; -} - -static int -do_sys32_msgrcv (int first, int second, int msgtyp, int third, int version, void *uptr) -{ - struct msgbuf32 *up; - struct msgbuf *p; - mm_segment_t old_fs; - int err; - - if (!version) { - struct ipc_kludge *uipck = (struct ipc_kludge *)uptr; - struct ipc_kludge ipck; - - err = -EINVAL; - if (!uptr) - goto out; - err = -EFAULT; - if (copy_from_user(&ipck, uipck, sizeof(struct ipc_kludge))) - goto out; - uptr = (void *)A(ipck.msgp); - msgtyp = ipck.msgtyp; - } - err = -ENOMEM; - p = kmalloc(second + sizeof(struct msgbuf), GFP_USER); - if (!p) - goto out; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_msgrcv(first, p, second, msgtyp, third); - 
set_fs(old_fs); - if (err < 0) - goto free_then_out; - up = (struct msgbuf32 *)uptr; - if (put_user(p->mtype, &up->mtype) || copy_to_user(&up->mtext, p->mtext, err)) - err = -EFAULT; -free_then_out: - kfree(p); -out: - return err; -} - -static int -msgctl32 (int first, int second, void *uptr) -{ - int err = -EINVAL, err2; - struct msqid64_ds m64; - struct msqid_ds32 *up32 = (struct msqid_ds32 *)uptr; - struct msqid64_ds32 *up64 = (struct msqid64_ds32 *)uptr; - mm_segment_t old_fs; - int version = ipc_parse_version32(&second); - - switch (second) { - case IPC_INFO: - case IPC_RMID: - case MSG_INFO: - err = sys_msgctl(first, second, (struct msqid_ds *)uptr); - break; - - case IPC_SET: - if (version == IPC_64) { - err = get_user(m64.msg_perm.uid, &up64->msg_perm.uid); - err |= get_user(m64.msg_perm.gid, &up64->msg_perm.gid); - err |= get_user(m64.msg_perm.mode, &up64->msg_perm.mode); - err |= get_user(m64.msg_qbytes, &up64->msg_qbytes); - } else { - err = get_user(m64.msg_perm.uid, &up32->msg_perm.uid); - err |= get_user(m64.msg_perm.gid, &up32->msg_perm.gid); - err |= get_user(m64.msg_perm.mode, &up32->msg_perm.mode); - err |= get_user(m64.msg_qbytes, &up32->msg_qbytes); - } - if (err) - break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_msgctl(first, second, (struct msqid_ds *)&m64); - set_fs(old_fs); - break; - - case IPC_STAT: - case MSG_STAT: - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_msgctl(first, second, (struct msqid_ds *)&m64); - set_fs(old_fs); - - if (version == IPC_64) { - if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64))) { - err = -EFAULT; - break; - } - err2 = __put_user(m64.msg_perm.key, &up64->msg_perm.key); - err2 |= __put_user(m64.msg_perm.uid, &up64->msg_perm.uid); - err2 |= __put_user(m64.msg_perm.gid, &up64->msg_perm.gid); - err2 |= __put_user(m64.msg_perm.cuid, &up64->msg_perm.cuid); - err2 |= __put_user(m64.msg_perm.cgid, &up64->msg_perm.cgid); - err2 |= __put_user(m64.msg_perm.mode, &up64->msg_perm.mode); - err2 |= 
__put_user(m64.msg_perm.seq, &up64->msg_perm.seq); - err2 |= __put_user(m64.msg_stime, &up64->msg_stime); - err2 |= __put_user(m64.msg_rtime, &up64->msg_rtime); - err2 |= __put_user(m64.msg_ctime, &up64->msg_ctime); - err2 |= __put_user(m64.msg_cbytes, &up64->msg_cbytes); - err2 |= __put_user(m64.msg_qnum, &up64->msg_qnum); - err2 |= __put_user(m64.msg_qbytes, &up64->msg_qbytes); - err2 |= __put_user(m64.msg_lspid, &up64->msg_lspid); - err2 |= __put_user(m64.msg_lrpid, &up64->msg_lrpid); - if (err2) - err = -EFAULT; - } else { - if (!access_ok(VERIFY_WRITE, up32, sizeof(*up32))) { - err = -EFAULT; - break; - } - err2 = __put_user(m64.msg_perm.key, &up32->msg_perm.key); - err2 |= __put_user(m64.msg_perm.uid, &up32->msg_perm.uid); - err2 |= __put_user(m64.msg_perm.gid, &up32->msg_perm.gid); - err2 |= __put_user(m64.msg_perm.cuid, &up32->msg_perm.cuid); - err2 |= __put_user(m64.msg_perm.cgid, &up32->msg_perm.cgid); - err2 |= __put_user(m64.msg_perm.mode, &up32->msg_perm.mode); - err2 |= __put_user(m64.msg_perm.seq, &up32->msg_perm.seq); - err2 |= __put_user(m64.msg_stime, &up32->msg_stime); - err2 |= __put_user(m64.msg_rtime, &up32->msg_rtime); - err2 |= __put_user(m64.msg_ctime, &up32->msg_ctime); - err2 |= __put_user(m64.msg_cbytes, &up32->msg_cbytes); - err2 |= __put_user(m64.msg_qnum, &up32->msg_qnum); - err2 |= __put_user(m64.msg_qbytes, &up32->msg_qbytes); - err2 |= __put_user(m64.msg_lspid, &up32->msg_lspid); - err2 |= __put_user(m64.msg_lrpid, &up32->msg_lrpid); - if (err2) - err = -EFAULT; - } - break; - } - return err; -} - -static int -shmat32 (int first, int second, int third, int version, void *uptr) -{ - unsigned long raddr; - u32 *uaddr = (u32 *)A((u32)third); - int err; - - if (version == 1) - return -EINVAL; /* iBCS2 emulator entry point: unsupported */ - err = do_shmat(first, uptr, second, &raddr); - if (err) - return err; - return put_user(raddr, uaddr); -} - -static int -shmctl32 (int first, int second, void *uptr) -{ - int err = -EFAULT, err2; - - 
struct shmid64_ds s64; - struct shmid_ds32 *up32 = (struct shmid_ds32 *)uptr; - struct shmid64_ds32 *up64 = (struct shmid64_ds32 *)uptr; - mm_segment_t old_fs; - struct shm_info32 *uip = (struct shm_info32 *)uptr; - struct shm_info si; - int version = ipc_parse_version32(&second); - struct shminfo64 smi; - struct shminfo *usi32 = (struct shminfo *) uptr; - struct shminfo64_32 *usi64 = (struct shminfo64_32 *) uptr; - - switch (second) { - case IPC_INFO: - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_shmctl(first, second, (struct shmid_ds *)&smi); - set_fs(old_fs); - - if (version == IPC_64) { - if (!access_ok(VERIFY_WRITE, usi64, sizeof(*usi64))) { - err = -EFAULT; - break; - } - err2 = __put_user(smi.shmmax, &usi64->shmmax); - err2 |= __put_user(smi.shmmin, &usi64->shmmin); - err2 |= __put_user(smi.shmmni, &usi64->shmmni); - err2 |= __put_user(smi.shmseg, &usi64->shmseg); - err2 |= __put_user(smi.shmall, &usi64->shmall); - } else { - if (!access_ok(VERIFY_WRITE, usi32, sizeof(*usi32))) { - err = -EFAULT; - break; - } - err2 = __put_user(smi.shmmax, &usi32->shmmax); - err2 |= __put_user(smi.shmmin, &usi32->shmmin); - err2 |= __put_user(smi.shmmni, &usi32->shmmni); - err2 |= __put_user(smi.shmseg, &usi32->shmseg); - err2 |= __put_user(smi.shmall, &usi32->shmall); - } - if (err2) - err = -EFAULT; - break; - - case IPC_RMID: - case SHM_LOCK: - case SHM_UNLOCK: - err = sys_shmctl(first, second, (struct shmid_ds *)uptr); - break; - - case IPC_SET: - if (version == IPC_64) { - err = get_user(s64.shm_perm.uid, &up64->shm_perm.uid); - err |= get_user(s64.shm_perm.gid, &up64->shm_perm.gid); - err |= get_user(s64.shm_perm.mode, &up64->shm_perm.mode); - } else { - err = get_user(s64.shm_perm.uid, &up32->shm_perm.uid); - err |= get_user(s64.shm_perm.gid, &up32->shm_perm.gid); - err |= get_user(s64.shm_perm.mode, &up32->shm_perm.mode); - } - if (err) - break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_shmctl(first, second, (struct shmid_ds *)&s64); - 
set_fs(old_fs); - break; - - case IPC_STAT: - case SHM_STAT: - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_shmctl(first, second, (struct shmid_ds *)&s64); - set_fs(old_fs); - if (err < 0) - break; - if (version == IPC_64) { - if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64))) { - err = -EFAULT; - break; - } - err2 = __put_user(s64.shm_perm.key, &up64->shm_perm.key); - err2 |= __put_user(s64.shm_perm.uid, &up64->shm_perm.uid); - err2 |= __put_user(s64.shm_perm.gid, &up64->shm_perm.gid); - err2 |= __put_user(s64.shm_perm.cuid, &up64->shm_perm.cuid); - err2 |= __put_user(s64.shm_perm.cgid, &up64->shm_perm.cgid); - err2 |= __put_user(s64.shm_perm.mode, &up64->shm_perm.mode); - err2 |= __put_user(s64.shm_perm.seq, &up64->shm_perm.seq); - err2 |= __put_user(s64.shm_atime, &up64->shm_atime); - err2 |= __put_user(s64.shm_dtime, &up64->shm_dtime); - err2 |= __put_user(s64.shm_ctime, &up64->shm_ctime); - err2 |= __put_user(s64.shm_segsz, &up64->shm_segsz); - err2 |= __put_user(s64.shm_nattch, &up64->shm_nattch); - err2 |= __put_user(s64.shm_cpid, &up64->shm_cpid); - err2 |= __put_user(s64.shm_lpid, &up64->shm_lpid); - } else { - if (!access_ok(VERIFY_WRITE, up32, sizeof(*up32))) { - err = -EFAULT; - break; - } - err2 = __put_user(s64.shm_perm.key, &up32->shm_perm.key); - err2 |= __put_user(s64.shm_perm.uid, &up32->shm_perm.uid); - err2 |= __put_user(s64.shm_perm.gid, &up32->shm_perm.gid); - err2 |= __put_user(s64.shm_perm.cuid, &up32->shm_perm.cuid); - err2 |= __put_user(s64.shm_perm.cgid, &up32->shm_perm.cgid); - err2 |= __put_user(s64.shm_perm.mode, &up32->shm_perm.mode); - err2 |= __put_user(s64.shm_perm.seq, &up32->shm_perm.seq); - err2 |= __put_user(s64.shm_atime, &up32->shm_atime); - err2 |= __put_user(s64.shm_dtime, &up32->shm_dtime); - err2 |= __put_user(s64.shm_ctime, &up32->shm_ctime); - err2 |= __put_user(s64.shm_segsz, &up32->shm_segsz); - err2 |= __put_user(s64.shm_nattch, &up32->shm_nattch); - err2 |= __put_user(s64.shm_cpid, &up32->shm_cpid); - err2 |= 
__put_user(s64.shm_lpid, &up32->shm_lpid); - } - if (err2) - err = -EFAULT; - break; - - case SHM_INFO: - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_shmctl(first, second, (void *)&si); - set_fs(old_fs); - if (err < 0) - break; - - if (!access_ok(VERIFY_WRITE, uip, sizeof(*uip))) { - err = -EFAULT; - break; - } - err2 = __put_user(si.used_ids, &uip->used_ids); - err2 |= __put_user(si.shm_tot, &uip->shm_tot); - err2 |= __put_user(si.shm_rss, &uip->shm_rss); - err2 |= __put_user(si.shm_swp, &uip->shm_swp); - err2 |= __put_user(si.swap_attempts, &uip->swap_attempts); - err2 |= __put_user(si.swap_successes, &uip->swap_successes); - if (err2) - err = -EFAULT; - break; - - } - return err; -} - -extern int sem_ctls[]; -#define sc_semopm (sem_ctls[2]) - -static long -semtimedop32(int semid, struct sembuf *tsops, int nsops, - struct compat_timespec *timeout32) -{ - struct timespec t; - mm_segment_t oldfs; - long ret; - - /* parameter checking precedence should mirror sys_semtimedop() */ - if (nsops < 1 || semid < 0) - return -EINVAL; - if (nsops > sc_semopm) - return -E2BIG; - if (!access_ok(VERIFY_READ, tsops, nsops * sizeof(struct sembuf)) || - get_compat_timespec(&t, timeout32)) - return -EFAULT; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_semtimedop(semid, tsops, nsops, &t); - set_fs(oldfs); - return ret; -} - asmlinkage long sys32_ipc(u32 call, int first, int second, int third, u32 ptr, u32 fifth) { @@ -1632,36 +1047,36 @@ sys32_ipc(u32 call, int first, int secon switch (call) { case SEMTIMEDOP: if (fifth) - return semtimedop32(first, (struct sembuf *)AA(ptr), - second, (struct compat_timespec *)AA(fifth)); + return compat_sys_semtimedop(first, compat_ptr(ptr), + second, compat_ptr(fifth)); /* else fall through for normal semop() */ case SEMOP: /* struct sembuf is the same on 32 and 64bit :)) */ - return sys_semtimedop(first, (struct sembuf *)AA(ptr), second, + return sys_semtimedop(first, compat_ptr(ptr), second, NULL); case SEMGET: return 
sys_semget(first, second, third); case SEMCTL: - return semctl32(first, second, third, (void *)AA(ptr)); + return compat_sys_semctl(first, second, third, compat_ptr(ptr)); case MSGSND: - return do_sys32_msgsnd(first, second, third, (void *)AA(ptr)); + return compat_sys_msgsnd(first, second, third, compat_ptr(ptr)); case MSGRCV: - return do_sys32_msgrcv(first, second, fifth, third, version, (void *)AA(ptr)); + return compat_sys_msgrcv(first, second, fifth, third, version, compat_ptr(ptr)); case MSGGET: return sys_msgget((key_t) first, second); case MSGCTL: - return msgctl32(first, second, (void *)AA(ptr)); + return compat_sys_msgctl(first, second, compat_ptr(ptr)); case SHMAT: - return shmat32(first, second, third, version, (void *)AA(ptr)); + return compat_sys_shmat(first, second, third, version, compat_ptr(ptr)); break; case SHMDT: - return sys_shmdt((char *)AA(ptr)); + return sys_shmdt(compat_ptr(ptr)); case SHMGET: return sys_shmget(first, second, third); case SHMCTL: - return shmctl32(first, second, (void *)AA(ptr)); + return compat_sys_shmctl(first, second, compat_ptr(ptr)); default: return -ENOSYS; --- linux-2.6.4-rc1/arch/ia64/Kconfig 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/ia64/Kconfig 2004-02-29 13:09:27.000000000 -0800 @@ -288,39 +288,6 @@ config FORCE_MAX_ZONEORDER int default "18" -choice - prompt "Huge TLB page size" - depends on HUGETLB_PAGE - default HUGETLB_PAGE_SIZE_16MB - -config HUGETLB_PAGE_SIZE_4GB - depends on MCKINLEY - bool "4GB" - -config HUGETLB_PAGE_SIZE_1GB - depends on MCKINLEY - bool "1GB" - -config HUGETLB_PAGE_SIZE_256MB - bool "256MB" - -config HUGETLB_PAGE_SIZE_64MB - bool "64MB" - -config HUGETLB_PAGE_SIZE_16MB - bool "16MB" - -config HUGETLB_PAGE_SIZE_4MB - bool "4MB" - -config HUGETLB_PAGE_SIZE_1MB - bool "1MB" - -config HUGETLB_PAGE_SIZE_256KB - bool "256KB" - -endchoice - config IA64_PAL_IDLE bool "Use PAL_HALT_LIGHT in idle loop" help @@ -522,6 +489,13 @@ config BLK_DEV_RAM_SIZE depends on BLK_DEV_RAM default "4096" 
+config LOCKMETER + bool "Kernel lock metering" + depends on SMP + help + Say Y to enable kernel lock metering, which adds overhead to SMP locks, + but allows you to see various statistics using the lockstat command. + endmenu source "fs/Kconfig" @@ -649,7 +623,11 @@ config DEBUG_INFO debugging info resulting in a larger kernel image. Say Y here only if you plan to use gdb to debug the kernel. If you don't debug the kernel, you can say N. - + +config SYSVIPC_COMPAT + bool + depends on COMPAT && SYSVIPC + default y endmenu source "security/Kconfig" --- linux-2.6.4-rc1/arch/ia64/kernel/head.S 2004-01-09 00:04:30.000000000 -0800 +++ 25/arch/ia64/kernel/head.S 2004-02-29 13:07:38.000000000 -0800 @@ -816,6 +816,19 @@ GLOBAL_ENTRY(ia64_delay_loop) br.ret.sptk.many rp END(ia64_delay_loop) +GLOBAL_ENTRY(ia64_invoke_kernel_thread_helper) + .prologue + .save rp, r0 // this is the end of the call-chain + .body + alloc r2 = ar.pfs, 0, 0, 2, 0 + mov out0 = r9 + mov out1 = r11;; + br.call.sptk.many rp = kernel_thread_helper;; + mov out0 = r8 + br.call.sptk.many rp = sys_exit;; +1: br.sptk.few 1b // not reached +END(ia64_invoke_kernel_thread_helper) + #ifdef CONFIG_IA64_BRL_EMU /* --- linux-2.6.4-rc1/arch/ia64/kernel/iosapic.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/ia64/kernel/iosapic.c 2004-02-29 13:07:38.000000000 -0800 @@ -103,6 +103,7 @@ static spinlock_t iosapic_lock = SPIN_LO static struct iosapic_intr_info { char *addr; /* base address of IOSAPIC */ + u32 low32; /* current value of low word of Redirection table entry */ unsigned int gsi_base; /* first GSI assigned to this IOSAPIC */ char rte_index; /* IOSAPIC RTE index (-1 => not an IOSAPIC interrupt) */ unsigned char dmode : 3; /* delivery mode (see iosapic.h) */ @@ -213,6 +214,7 @@ set_rte (unsigned int vector, unsigned i writel(high32, addr + IOSAPIC_WINDOW); writel(IOSAPIC_RTE_LOW(rte_index), addr + IOSAPIC_REG_SELECT); writel(low32, addr + IOSAPIC_WINDOW); + iosapic_intr_info[vector].low32 = low32; } static 
void @@ -239,9 +241,10 @@ mask_irq (unsigned int irq) spin_lock_irqsave(&iosapic_lock, flags); { writel(IOSAPIC_RTE_LOW(rte_index), addr + IOSAPIC_REG_SELECT); - low32 = readl(addr + IOSAPIC_WINDOW); - low32 |= (1 << IOSAPIC_MASK_SHIFT); /* set only the mask bit */ + /* set only the mask bit */ + low32 = iosapic_intr_info[vec].low32 |= IOSAPIC_MASK; + writel(low32, addr + IOSAPIC_WINDOW); } spin_unlock_irqrestore(&iosapic_lock, flags); @@ -264,9 +267,7 @@ unmask_irq (unsigned int irq) spin_lock_irqsave(&iosapic_lock, flags); { writel(IOSAPIC_RTE_LOW(rte_index), addr + IOSAPIC_REG_SELECT); - low32 = readl(addr + IOSAPIC_WINDOW); - - low32 &= ~(1 << IOSAPIC_MASK_SHIFT); /* clear only the mask bit */ + low32 = iosapic_intr_info[vec].low32 &= ~IOSAPIC_MASK; writel(low32, addr + IOSAPIC_WINDOW); } spin_unlock_irqrestore(&iosapic_lock, flags); @@ -307,9 +308,7 @@ iosapic_set_affinity (unsigned int irq, { /* get current delivery mode by reading the low32 */ writel(IOSAPIC_RTE_LOW(rte_index), addr + IOSAPIC_REG_SELECT); - low32 = readl(addr + IOSAPIC_WINDOW); - - low32 &= ~(7 << IOSAPIC_DELIVERY_SHIFT); + low32 = iosapic_intr_info[vec].low32 & ~(7 << IOSAPIC_DELIVERY_SHIFT); if (redir) /* change delivery mode to lowest priority */ low32 |= (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT); @@ -317,6 +316,7 @@ iosapic_set_affinity (unsigned int irq, /* change delivery mode to fixed */ low32 |= (IOSAPIC_FIXED << IOSAPIC_DELIVERY_SHIFT); + iosapic_intr_info[vec].low32 = low32; writel(IOSAPIC_RTE_HIGH(rte_index), addr + IOSAPIC_REG_SELECT); writel(high32, addr + IOSAPIC_WINDOW); writel(IOSAPIC_RTE_LOW(rte_index), addr + IOSAPIC_REG_SELECT); --- linux-2.6.4-rc1/arch/ia64/kernel/irq.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/ia64/kernel/irq.c 2004-02-29 13:07:38.000000000 -0800 @@ -455,7 +455,6 @@ unsigned int do_IRQ(unsigned long irq, s unsigned int status; int cpu; - irq_enter(); cpu = smp_processor_id(); /* for CONFIG_PREEMPT, this must come after irq_enter()! 
*/ kstat_cpu(cpu).irqs[irq]++; @@ -525,7 +524,6 @@ unsigned int do_IRQ(unsigned long irq, s desc->handler->end(irq); spin_unlock(&desc->lock); } - irq_exit(); return 1; } --- linux-2.6.4-rc1/arch/ia64/kernel/irq_ia64.c 2004-02-03 20:42:34.000000000 -0800 +++ 25/arch/ia64/kernel/irq_ia64.c 2004-02-29 13:07:38.000000000 -0800 @@ -120,6 +120,7 @@ ia64_handle_irq (ia64_vector vector, str * 16 (without this, it would be ~240, which could easily lead * to kernel stack overflows). */ + irq_enter(); saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); ia64_srlz_d(); while (vector != IA64_SPURIOUS_INT_VECTOR) { @@ -143,8 +144,7 @@ ia64_handle_irq (ia64_vector vector, str * handler needs to be able to wait for further keyboard interrupts, which can't * come through until ia64_eoi() has been done. */ - if (local_softirq_pending()) - do_softirq(); + irq_exit(); } #ifdef CONFIG_SMP --- linux-2.6.4-rc1/arch/ia64/kernel/ivt.S 2004-01-09 00:04:30.000000000 -0800 +++ 25/arch/ia64/kernel/ivt.S 2004-02-29 13:07:38.000000000 -0800 @@ -118,10 +118,11 @@ ENTRY(vhpt_miss) #ifdef CONFIG_HUGETLB_PAGE extr.u r26=r25,2,6 ;; - cmp.eq p8,p0=HPAGE_SHIFT,r26 + cmp.ne p8,p0=r18,r26 + sub r27=r26,r18 ;; (p8) dep r25=r18,r25,2,6 -(p8) shr r22=r22,HPAGE_SHIFT-PAGE_SHIFT +(p8) shr r22=r22,r27 #endif ;; cmp.eq p6,p7=5,r17 // is IFA pointing into to region 5? 
--- linux-2.6.4-rc1/arch/ia64/kernel/perfmon.c 2004-02-03 20:42:34.000000000 -0800 +++ 25/arch/ia64/kernel/perfmon.c 2004-02-29 13:07:38.000000000 -0800 @@ -82,7 +82,7 @@ #define PFM_REG_IMPL 0x1 /* register implemented */ #define PFM_REG_END 0x2 /* end marker */ #define PFM_REG_MONITOR (0x1<<4|PFM_REG_IMPL) /* a PMC with a pmc.pm field only */ -#define PFM_REG_COUNTING (0x2<<4|PFM_REG_MONITOR|PFM_REG_IMPL) /* a monitor + pmc.oi+ PMD used as a counter */ +#define PFM_REG_COUNTING (0x2<<4|PFM_REG_MONITOR) /* a monitor + pmc.oi+ PMD used as a counter */ #define PFM_REG_CONTROL (0x4<<4|PFM_REG_IMPL) /* PMU control register */ #define PFM_REG_CONFIG (0x8<<4|PFM_REG_IMPL) /* configuration register */ #define PFM_REG_BUFFER (0xc<<4|PFM_REG_IMPL) /* PMD used as buffer */ @@ -109,14 +109,15 @@ #define PMD_PMD_DEP(i) pmu_conf.pmd_desc[i].dep_pmd[0] #define PMC_PMD_DEP(i) pmu_conf.pmc_desc[i].dep_pmd[0] -/* k assumed unsigned (up to 64 registers) */ -#define IBR_IS_IMPL(k) (k< IA64_NUM_DBG_REGS) -#define DBR_IS_IMPL(k) (k< IA64_NUM_DBG_REGS) +#define PFM_NUM_IBRS IA64_NUM_DBG_REGS +#define PFM_NUM_DBRS IA64_NUM_DBG_REGS #define CTX_OVFL_NOBLOCK(c) ((c)->ctx_fl_block == 0) #define CTX_HAS_SMPL(c) ((c)->ctx_fl_is_sampling) #define PFM_CTX_TASK(h) (h)->ctx_task +#define PMU_PMC_OI 5 /* position of pmc.oi bit */ + /* XXX: does not support more than 64 PMDs */ #define CTX_USED_PMD(ctx, mask) (ctx)->ctx_used_pmds[0] |= (mask) #define CTX_IS_USED_PMD(ctx, c) (((ctx)->ctx_used_pmds[0] & (1UL << (c))) != 0UL) @@ -218,6 +219,8 @@ /* * debugging */ +#define PFM_DEBUGGING 1 +#ifdef PFM_DEBUGGING #define DPRINT(a) \ do { \ if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \ @@ -227,18 +230,7 @@ do { \ if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \ } while (0) -/* - * Architected PMC 
structure - */ -typedef struct { - unsigned long pmc_plm:4; /* privilege level mask */ - unsigned long pmc_ev:1; /* external visibility */ - unsigned long pmc_oi:1; /* overflow interrupt */ - unsigned long pmc_pm:1; /* privileged monitor */ - unsigned long pmc_ig1:1; /* reserved */ - unsigned long pmc_es:8; /* event select */ - unsigned long pmc_ig2:48; /* reserved */ -} pfm_monitor_t; +#endif /* * 64-bit software counter structure @@ -469,20 +461,13 @@ typedef struct { #define PFM_CMD_STOP 0x08 /* command does not work on zombie context */ -#define PFM_CMD_IDX(cmd) (cmd) -#define PFM_CMD_IS_VALID(cmd) ((PFM_CMD_IDX(cmd) >= 0) && (PFM_CMD_IDX(cmd) < PFM_CMD_COUNT) \ - && pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_func != NULL) - -#define PFM_CMD_NAME(cmd) pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_name -#define PFM_CMD_READ_ARG(cmd) (pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_ARG_READ) -#define PFM_CMD_RW_ARG(cmd) (pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_ARG_RW) -#define PFM_CMD_USE_FD(cmd) (pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_FD) -#define PFM_CMD_STOPPED(cmd) (pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_STOP) +#define PFM_CMD_NAME(cmd) pfm_cmd_tab[(cmd)].cmd_name +#define PFM_CMD_READ_ARG(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_READ) +#define PFM_CMD_RW_ARG(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_RW) +#define PFM_CMD_USE_FD(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_FD) +#define PFM_CMD_STOPPED(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_STOP) #define PFM_CMD_ARG_MANY -1 /* cannot be zero */ -#define PFM_CMD_NARG(cmd) (pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_narg) -#define PFM_CMD_ARG_SIZE(cmd) (pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_argsize) -#define PFM_CMD_GETSIZE(cmd) (pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_getsize) typedef struct { int debug; /* turn on/off debugging via syslog */ @@ -2834,10 +2819,11 @@ static int pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) { struct thread_struct *thread = NULL; + 
struct task_struct *task; pfarg_reg_t *req = (pfarg_reg_t *)arg; - unsigned long value; - unsigned long smpl_pmds, reset_pmds; - unsigned int cnum, reg_flags, flags; + unsigned long value, pmc_pm; + unsigned long smpl_pmds, reset_pmds, impl_pmds; + unsigned int cnum, reg_flags, flags, pmc_type; int i, can_access_pmu = 0, is_loaded, is_system; int is_monitor, is_counting, state; int ret = -EINVAL; @@ -2846,12 +2832,13 @@ pfm_write_pmcs(pfm_context_t *ctx, void state = ctx->ctx_state; is_loaded = state == PFM_CTX_LOADED ? 1 : 0; is_system = ctx->ctx_fl_system; + task = ctx->ctx_task; + impl_pmds = pmu_conf.impl_pmds[0]; if (state == PFM_CTX_TERMINATED || state == PFM_CTX_ZOMBIE) return -EINVAL; - if (is_loaded) { - thread = &ctx->ctx_task->thread; + thread = &task->thread; /* * In system wide and when the context is loaded, access can only happen * when the caller is running on the CPU being monitored by the session. @@ -2861,7 +2848,7 @@ pfm_write_pmcs(pfm_context_t *ctx, void DPRINT(("[%d] should be running on CPU%d\n", current->pid, ctx->ctx_cpu)); return -EBUSY; } - can_access_pmu = GET_PMU_OWNER() == ctx->ctx_task || is_system ? 1 : 0; + can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0; } for (i = 0; i < count; i++, req++) { @@ -2873,16 +2860,24 @@ pfm_write_pmcs(pfm_context_t *ctx, void reset_pmds = req->reg_reset_pmds[0]; flags = 0; - is_counting = PMC_IS_COUNTING(cnum); - is_monitor = PMC_IS_MONITOR(cnum); + + if (cnum >= PMU_MAX_PMCS) { + DPRINT(("pmc%u is invalid\n", cnum)); + goto error; + } + + pmc_type = pmu_conf.pmc_desc[cnum].type; + pmc_pm = (value >> pmu_conf.pmc_desc[cnum].pm_pos) & 0x1; + is_counting = (pmc_type & PFM_REG_COUNTING) == PFM_REG_COUNTING ? 1 : 0; + is_monitor = (pmc_type & PFM_REG_MONITOR) == PFM_REG_MONITOR ? 
1 : 0; /* * we reject all non implemented PMC as well * as attempts to modify PMC[0-3] which are used * as status registers by the PMU */ - if (PMC_IS_IMPL(cnum) == 0 || PMC_IS_CONTROL(cnum)) { - DPRINT(("pmc%u is unimplemented or invalid\n", cnum)); + if ((pmc_type & PFM_REG_IMPL) == 0 || (pmc_type & PFM_REG_CONTROL) == PFM_REG_CONTROL) { + DPRINT(("pmc%u is unimplemented or no-access pmc_type=%x\n", cnum, pmc_type)); goto error; } /* @@ -2890,21 +2885,20 @@ pfm_write_pmcs(pfm_context_t *ctx, void * - system-wide session: PMCx.pm=1 (privileged monitor) * - per-task : PMCx.pm=0 (user monitor) */ - if ((is_monitor || is_counting) && value != PMC_DFL_VAL(cnum) && PFM_CHECK_PMC_PM(ctx, cnum, value)) { - DPRINT(("pmc%u pmc_pm=%ld fl_system=%d\n", + if (is_monitor && value != PMC_DFL_VAL(cnum) && is_system ^ pmc_pm) { + DPRINT(("pmc%u pmc_pm=%lu is_system=%d\n", cnum, - PMC_PM(cnum, value), - ctx->ctx_fl_system)); + pmc_pm, + is_system)); goto error; } if (is_counting) { - pfm_monitor_t *p = (pfm_monitor_t *)&value; /* * enforce generation of overflow interrupt. Necessary on all * CPUs. 
*/ - p->pmc_oi = 1; + value |= 1 << PMU_PMC_OI; if (reg_flags & PFM_REGFL_OVFL_NOTIFY) { flags |= PFM_REGFL_OVFL_NOTIFY; @@ -2913,13 +2907,13 @@ pfm_write_pmcs(pfm_context_t *ctx, void if (reg_flags & PFM_REGFL_RANDOM) flags |= PFM_REGFL_RANDOM; /* verify validity of smpl_pmds */ - if ((smpl_pmds & pmu_conf.impl_pmds[0]) != smpl_pmds) { + if ((smpl_pmds & impl_pmds) != smpl_pmds) { DPRINT(("invalid smpl_pmds 0x%lx for pmc%u\n", smpl_pmds, cnum)); goto error; } /* verify validity of reset_pmds */ - if ((reset_pmds & pmu_conf.impl_pmds[0]) != reset_pmds) { + if ((reset_pmds & impl_pmds) != reset_pmds) { DPRINT(("invalid reset_pmds 0x%lx for pmc%u\n", reset_pmds, cnum)); goto error; } @@ -2935,7 +2929,7 @@ pfm_write_pmcs(pfm_context_t *ctx, void * execute write checker, if any */ if (PMC_WR_FUNC(cnum)) { - ret = PMC_WR_FUNC(cnum)(ctx->ctx_task, ctx, cnum, &value, regs); + ret = PMC_WR_FUNC(cnum)(task, ctx, cnum, &value, regs); if (ret) goto error; ret = -EINVAL; } @@ -2997,7 +2991,7 @@ pfm_write_pmcs(pfm_context_t *ctx, void * * The value in ctx_pmcs[] can only be changed in pfm_write_pmcs(). * - * The value in t->pmc[] may be modified on overflow, i.e., when + * The value in thread->pmcs[] may be modified on overflow, i.e., when * monitoring needs to be stopped. */ if (is_monitor) CTX_USED_MONITOR(ctx, 1UL << cnum); @@ -3056,11 +3050,6 @@ pfm_write_pmcs(pfm_context_t *ctx, void return 0; error: PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); - - req->reg_flags = PFM_REG_RETFL_EINVAL; - - DPRINT(("pmc[%u]=0x%lx error %d\n", cnum, value, ret)); - return ret; } @@ -3068,6 +3057,7 @@ static int pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) { struct thread_struct *thread = NULL; + struct task_struct *task; pfarg_reg_t *req = (pfarg_reg_t *)arg; unsigned long value, hw_value, ovfl_mask; unsigned int cnum; @@ -3080,25 +3070,26 @@ pfm_write_pmds(pfm_context_t *ctx, void is_loaded = state == PFM_CTX_LOADED ? 
1 : 0; is_system = ctx->ctx_fl_system; ovfl_mask = pmu_conf.ovfl_val; + task = ctx->ctx_task; - if (state == PFM_CTX_TERMINATED || state == PFM_CTX_ZOMBIE) return -EINVAL; + if (unlikely(state == PFM_CTX_TERMINATED || state == PFM_CTX_ZOMBIE)) return -EINVAL; /* * on both UP and SMP, we can only write to the PMC when the task is * the owner of the local PMU. */ - if (is_loaded) { - thread = &ctx->ctx_task->thread; + if (likely(is_loaded)) { + thread = &task->thread; /* * In system wide and when the context is loaded, access can only happen * when the caller is running on the CPU being monitored by the session. * It does not have to be the owner (ctx_task) of the context per se. */ - if (is_system && ctx->ctx_cpu != smp_processor_id()) { + if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) { DPRINT(("[%d] should be running on CPU%d\n", current->pid, ctx->ctx_cpu)); return -EBUSY; } - can_access_pmu = GET_PMU_OWNER() == ctx->ctx_task || is_system ? 1 : 0; + can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0; } for (i = 0; i < count; i++, req++) { @@ -3118,7 +3109,7 @@ pfm_write_pmds(pfm_context_t *ctx, void if (PMD_WR_FUNC(cnum)) { unsigned long v = value; - ret = PMD_WR_FUNC(cnum)(ctx->ctx_task, ctx, cnum, &v, regs); + ret = PMD_WR_FUNC(cnum)(task, ctx, cnum, &v, regs); if (ret) goto abort_mission; value = v; @@ -3243,16 +3234,6 @@ abort_mission: * for now, we have only one possibility for error */ PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); - - /* - * we change the return value to EFAULT in case we cannot write register return code. - * The caller first must correct this error, then a resubmission of the request will - * eventually yield the EINVAL. 
- */ - req->reg_flags = PFM_REG_RETFL_EINVAL; - - DPRINT(("pmd[%u]=0x%lx ret %d\n", cnum, value, ret)); - return ret; } @@ -3269,11 +3250,12 @@ static int pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) { struct thread_struct *thread = NULL; - unsigned long val = 0UL, lval, ovfl_mask; + struct task_struct *task; + unsigned long val = 0UL, lval, ovfl_mask, sval; pfarg_reg_t *req = (pfarg_reg_t *)arg; unsigned int cnum, reg_flags = 0; int i, can_access_pmu = 0, state; - int is_loaded, is_system; + int is_loaded, is_system, is_counting; int ret = -EINVAL; /* @@ -3285,32 +3267,33 @@ pfm_read_pmds(pfm_context_t *ctx, void * is_loaded = state == PFM_CTX_LOADED ? 1 : 0; is_system = ctx->ctx_fl_system; ovfl_mask = pmu_conf.ovfl_val; + task = ctx->ctx_task; if (state == PFM_CTX_ZOMBIE) return -EINVAL; - if (is_loaded) { - thread = &ctx->ctx_task->thread; + if (likely(is_loaded)) { + thread = &task->thread; /* * In system wide and when the context is loaded, access can only happen * when the caller is running on the CPU being monitored by the session. * It does not have to be the owner (ctx_task) of the context per se. */ - if (is_system && ctx->ctx_cpu != smp_processor_id()) { + if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) { DPRINT(("[%d] should be running on CPU%d\n", current->pid, ctx->ctx_cpu)); return -EBUSY; } /* * this can be true when not self-monitoring only in UP */ - can_access_pmu = GET_PMU_OWNER() == ctx->ctx_task || is_system ? 1 : 0; + can_access_pmu = GET_PMU_OWNER() == task || is_system ? 
1 : 0; if (can_access_pmu) ia64_srlz_d(); } - DPRINT(("enter loaded=%d access_pmu=%d ctx_state=%d\n", + DPRINT(("loaded=%d access_pmu=%d ctx_state=%d\n", is_loaded, can_access_pmu, - ctx->ctx_state)); + state)); /* * on both UP and SMP, we can only read the PMD from the hardware register when @@ -3319,11 +3302,10 @@ pfm_read_pmds(pfm_context_t *ctx, void * for (i = 0; i < count; i++, req++) { - lval = 0UL; cnum = req->reg_num; reg_flags = req->reg_flags; - if (!PMD_IS_IMPL(cnum)) goto error; + if (unlikely(!PMD_IS_IMPL(cnum))) goto error; /* * we can only read the register that we use. That includes * the one we explicitely initialize AND the one we want included @@ -3332,7 +3314,11 @@ pfm_read_pmds(pfm_context_t *ctx, void * * Having this restriction allows optimization in the ctxsw routine * without compromising security (leaks) */ - if (!CTX_IS_USED_PMD(ctx, cnum)) goto error; + if (unlikely(!CTX_IS_USED_PMD(ctx, cnum))) goto error; + + sval = ctx->ctx_pmds[cnum].val; + lval = ctx->ctx_pmds[cnum].lval; + is_counting = PMD_IS_COUNTING(cnum); /* * If the task is not the current one, then we check if the @@ -3347,23 +3333,21 @@ pfm_read_pmds(pfm_context_t *ctx, void * * if context is zombie, then task does not exist anymore. * In this case, we use the full value saved in the context (pfm_flush_regs()). */ - val = state == PFM_CTX_LOADED ? thread->pmds[cnum] : 0UL; + val = is_loaded ? 
thread->pmds[cnum] : 0UL; } - if (PMD_IS_COUNTING(cnum)) { + if (is_counting) { /* * XXX: need to check for overflow when loaded */ val &= ovfl_mask; - val += ctx->ctx_pmds[cnum].val; - - lval = ctx->ctx_pmds[cnum].lval; + val += sval; } /* * execute read checker, if any */ - if (PMD_RD_FUNC(cnum)) { + if (unlikely(PMD_RD_FUNC(cnum))) { unsigned long v = val; ret = PMD_RD_FUNC(cnum)(ctx->ctx_task, ctx, cnum, &v, regs); if (ret) goto error; @@ -3373,12 +3357,7 @@ pfm_read_pmds(pfm_context_t *ctx, void * PFM_REG_RETFLAG_SET(reg_flags, 0); - DPRINT(("pmd[%u]=0x%lx loaded=%d access_pmu=%d ctx_state=%d\n", - cnum, - val, - is_loaded, - can_access_pmu, - ctx->ctx_state)); + DPRINT(("pmd[%u]=0x%lx\n", cnum, val)); /* * update register return value, abort all if problem during copy. @@ -3393,12 +3372,7 @@ pfm_read_pmds(pfm_context_t *ctx, void * return 0; error: - PFM_REG_RETFLAG_SET(reg_flags, PFM_REG_RETFL_EINVAL); - - req->reg_flags = PFM_REG_RETFL_EINVAL; - - DPRINT(("error pmd[%u]=0x%lx\n", cnum, val)); - + PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); return ret; } @@ -3628,7 +3602,7 @@ pfm_restart(pfm_context_t *ctx, void *ar prefetch(ctx->ctx_smpl_hdr); rst_ctrl.bits.mask_monitoring = 0; - rst_ctrl.bits.reset_ovfl_pmds = 1; + rst_ctrl.bits.reset_ovfl_pmds = 0; if (state == PFM_CTX_LOADED) ret = pfm_buf_fmt_restart_active(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs); @@ -3748,6 +3722,7 @@ static int pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) { struct thread_struct *thread = NULL; + struct task_struct *task; pfarg_dbreg_t *req = (pfarg_dbreg_t *)arg; unsigned long flags; dbreg_t dbreg; @@ -3762,6 +3737,7 @@ pfm_write_ibr_dbr(int mode, pfm_context_ state = ctx->ctx_state; is_loaded = state == PFM_CTX_LOADED ? 
1 : 0; is_system = ctx->ctx_fl_system; + task = ctx->ctx_task; if (state == PFM_CTX_TERMINATED || state == PFM_CTX_ZOMBIE) return -EINVAL; @@ -3770,17 +3746,17 @@ pfm_write_ibr_dbr(int mode, pfm_context_ * the owner of the local PMU. */ if (is_loaded) { - thread = &ctx->ctx_task->thread; + thread = &task->thread; /* * In system wide and when the context is loaded, access can only happen * when the caller is running on the CPU being monitored by the session. * It does not have to be the owner (ctx_task) of the context per se. */ - if (is_system && ctx->ctx_cpu != smp_processor_id()) { + if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) { DPRINT(("[%d] should be running on CPU%d\n", current->pid, ctx->ctx_cpu)); return -EBUSY; } - can_access_pmu = GET_PMU_OWNER() == ctx->ctx_task || is_system ? 1 : 0; + can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0; } /* @@ -3796,7 +3772,7 @@ pfm_write_ibr_dbr(int mode, pfm_context_ * don't bother if we are loaded and task is being debugged */ if (is_loaded && (thread->flags & IA64_THREAD_DBG_VALID) != 0) { - DPRINT(("debug registers already in use for [%d]\n", ctx->ctx_task->pid)); + DPRINT(("debug registers already in use for [%d]\n", task->pid)); return -EBUSY; } @@ -3837,7 +3813,7 @@ pfm_write_ibr_dbr(int mode, pfm_context_ * is shared by all processes running on it */ if (first_time && can_access_pmu) { - DPRINT(("[%d] clearing ibrs, dbrs\n", ctx->ctx_task->pid)); + DPRINT(("[%d] clearing ibrs, dbrs\n", task->pid)); for (i=0; i < pmu_conf.num_ibrs; i++) { ia64_set_ibr(i, 0UL); ia64_srlz_i(); @@ -3860,7 +3836,7 @@ pfm_write_ibr_dbr(int mode, pfm_context_ ret = -EINVAL; - if ((mode == PFM_CODE_RR && !IBR_IS_IMPL(rnum)) || ((mode == PFM_DATA_RR) && !DBR_IS_IMPL(rnum))) { + if ((mode == PFM_CODE_RR && rnum >= PFM_NUM_IBRS) || ((mode == PFM_DATA_RR) && rnum >= PFM_NUM_DBRS)) { DPRINT(("invalid register %u val=0x%lx mode=%d i=%d count=%d\n", rnum, dbreg.val, mode, i, count)); @@ -4434,6 +4410,7 @@ 
pfm_context_unload(pfm_context_t *ctx, v struct task_struct *task = PFM_CTX_TASK(ctx); struct pt_regs *tregs; int state, is_system; + int ret; DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task->pid : -1)); @@ -4451,7 +4428,8 @@ pfm_context_unload(pfm_context_t *ctx, v /* * clear psr and dcr bits */ - pfm_stop(ctx, NULL, 0, regs); + ret = pfm_stop(ctx, NULL, 0, regs); + if (ret) return ret; ctx->ctx_state = state = PFM_CTX_UNLOADED; @@ -4760,37 +4738,45 @@ sys_perfmonctl (int fd, int cmd, void *a void *args_k = NULL; long ret; /* will expand int return types */ size_t base_sz, sz, xtra_sz = 0; - int narg, completed_args = 0, call_made = 0; + int narg, completed_args = 0, call_made = 0, cmd_flags; + int (*func)(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); + int (*getsize)(void *arg, size_t *sz); #define PFM_MAX_ARGSIZE 4096 /* - * reject any call if perfmon was disabled at initialization time - mask*/ - if (PFM_IS_DISABLED()) return -ENOSYS; + * reject any call if perfmon was disabled at initialization + */ + if (unlikely(PFM_IS_DISABLED())) return -ENOSYS; - if (unlikely(PFM_CMD_IS_VALID(cmd) == 0)) { + if (unlikely(cmd < 0 || cmd >= PFM_CMD_COUNT)) { DPRINT(("[%d] invalid cmd=%d\n", current->pid, cmd)); return -EINVAL; } - DPRINT(("cmd=%s idx=%d valid=%d narg=0x%x argsz=%lu count=%d\n", + func = pfm_cmd_tab[cmd].cmd_func; + narg = pfm_cmd_tab[cmd].cmd_narg; + base_sz = pfm_cmd_tab[cmd].cmd_argsize; + getsize = pfm_cmd_tab[cmd].cmd_getsize; + cmd_flags = pfm_cmd_tab[cmd].cmd_flags; + + if (unlikely(func == NULL)) { + DPRINT(("[%d] invalid cmd=%d\n", current->pid, cmd)); + return -EINVAL; + } + + DPRINT(("cmd=%s idx=%d narg=0x%x argsz=%lu count=%d\n", PFM_CMD_NAME(cmd), - PFM_CMD_IDX(cmd), - PFM_CMD_IS_VALID(cmd), - PFM_CMD_NARG(cmd), - PFM_CMD_ARG_SIZE(cmd), + cmd, + narg, + base_sz, count)); /* * check if number of arguments matches what the command expects */ - narg = PFM_CMD_NARG(cmd); - if ((narg == PFM_CMD_ARG_MANY && count <= 
0) || (narg > 0 && narg != count)) + if (unlikely((narg == PFM_CMD_ARG_MANY && count <= 0) || (narg > 0 && narg != count))) return -EINVAL; - /* get single argument size */ - base_sz = PFM_CMD_ARG_SIZE(cmd); - restart_args: sz = xtra_sz + base_sz*count; /* @@ -4804,7 +4790,7 @@ restart_args: /* * allocate default-sized argument buffer */ - if (count && args_k == NULL) { + if (likely(count && args_k == NULL)) { args_k = kmalloc(PFM_MAX_ARGSIZE, GFP_KERNEL); if (args_k == NULL) return -ENOMEM; } @@ -4824,11 +4810,11 @@ restart_args: /* * check if command supports extra parameters */ - if (completed_args == 0 && PFM_CMD_GETSIZE(cmd)) { + if (completed_args == 0 && getsize) { /* * get extra parameters size (based on main argument) */ - ret = PFM_CMD_GETSIZE(cmd)(args_k, &xtra_sz); + ret = (*getsize)(args_k, &xtra_sz); if (ret) goto error_args; completed_args = 1; @@ -4836,45 +4822,45 @@ restart_args: DPRINT(("[%d] restart_args sz=%lu xtra_sz=%lu\n", current->pid, sz, xtra_sz)); /* retry if necessary */ - if (xtra_sz) goto restart_args; + if (likely(xtra_sz)) goto restart_args; } - if (PFM_CMD_USE_FD(cmd)) { - - ret = -EBADF; + if (unlikely((cmd_flags & PFM_CMD_FD) == 0)) goto skip_fd; - file = fget(fd); - if (file == NULL) { - DPRINT(("[%d] invalid fd %d\n", current->pid, fd)); - goto error_args; - } - if (PFM_IS_FILE(file) == 0) { - DPRINT(("[%d] fd %d not related to perfmon\n", current->pid, fd)); - goto error_args; - } + ret = -EBADF; + file = fget(fd); + if (unlikely(file == NULL)) { + DPRINT(("[%d] invalid fd %d\n", current->pid, fd)); + goto error_args; + } + if (unlikely(PFM_IS_FILE(file) == 0)) { + DPRINT(("[%d] fd %d not related to perfmon\n", current->pid, fd)); + goto error_args; + } - ctx = (pfm_context_t *)file->private_data; - if (ctx == NULL) { - DPRINT(("[%d] no context for fd %d\n", current->pid, fd)); - goto error_args; - } + ctx = (pfm_context_t *)file->private_data; + if (unlikely(ctx == NULL)) { + DPRINT(("[%d] no context for fd %d\n", 
current->pid, fd)); + goto error_args; + } + prefetch(&ctx->ctx_state); - PROTECT_CTX(ctx, flags); + PROTECT_CTX(ctx, flags); - /* - * check task is stopped - */ - ret = pfm_check_task_state(ctx, cmd, flags); - if (ret) goto abort_locked; - } + /* + * check task is stopped + */ + ret = pfm_check_task_state(ctx, cmd, flags); + if (unlikely(ret)) goto abort_locked; - ret = (*pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_func)(ctx, args_k, count, regs); +skip_fd: + ret = (*func)(ctx, args_k, count, regs); call_made = 1; abort_locked: - if (ctx) { + if (likely(ctx)) { DPRINT(("[%d] context unlocked\n", current->pid)); UNPROTECT_CTX(ctx, flags); fput(file); @@ -4907,7 +4893,7 @@ pfm_resume_after_ovfl(pfm_context_t *ctx if (CTX_HAS_SMPL(ctx)) { rst_ctrl.bits.mask_monitoring = 0; - rst_ctrl.bits.reset_ovfl_pmds = 1; + rst_ctrl.bits.reset_ovfl_pmds = 0; if (state == PFM_CTX_LOADED) ret = pfm_buf_fmt_restart_active(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs); @@ -5096,7 +5082,7 @@ pfm_ovfl_notify_user(pfm_context_t *ctx, msg->pfm_ovfl_msg.msg_ovfl_pmds[1] = 0UL; msg->pfm_ovfl_msg.msg_ovfl_pmds[2] = 0UL; msg->pfm_ovfl_msg.msg_ovfl_pmds[3] = 0UL; - msg->pfm_ovfl_msg.msg_tstamp = ia64_get_itc(); /* relevant on UP only */ + msg->pfm_ovfl_msg.msg_tstamp = 0UL; } DPRINT(("ovfl msg: msg=%p no_msg=%d fd=%d pid=%d ovfl_pmds=0x%lx\n", @@ -5119,10 +5105,12 @@ pfm_end_notify_user(pfm_context_t *ctx) printk(KERN_ERR "perfmon: pfm_end_notify_user no more notification msgs\n"); return -1; } + /* no leak */ + memset(msg, 0, sizeof(*msg)); msg->pfm_end_msg.msg_type = PFM_MSG_END; msg->pfm_end_msg.msg_ctx_fd = ctx->ctx_fd; - msg->pfm_ovfl_msg.msg_tstamp = ia64_get_itc(); /* relevant on UP only */ + msg->pfm_ovfl_msg.msg_tstamp = 0UL; DPRINT(("end msg: msg=%p no_msg=%d ctx_fd=%d pid=%d\n", msg, @@ -5141,8 +5129,8 @@ pfm_overflow_handler(struct task_struct { pfm_ovfl_arg_t ovfl_arg; unsigned long mask; - unsigned long old_val, ovfl_val; - unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL, smpl_pmds = 
0UL; + unsigned long old_val, ovfl_val, new_val; + unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL, smpl_pmds = 0UL, reset_pmds; unsigned long tstamp; pfm_ovfl_ctrl_t ovfl_ctrl; unsigned int i, has_smpl; @@ -5155,21 +5143,19 @@ pfm_overflow_handler(struct task_struct */ if (unlikely((pmc0 & 0x1) == 0)) goto sanity_check; - tstamp = ia64_get_itc(); - + tstamp = ia64_get_itc(); mask = pmc0 >> PMU_FIRST_COUNTER; ovfl_val = pmu_conf.ovfl_val; + has_smpl = CTX_HAS_SMPL(ctx); DPRINT_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s " - "used_pmds=0x%lx reload_pmcs=0x%lx\n", + "used_pmds=0x%lx\n", pmc0, task ? task->pid: -1, (regs ? regs->cr_iip : 0), CTX_OVFL_NOBLOCK(ctx) ? "nonblocking" : "blocking", - ctx->ctx_used_pmds[0], - ctx->ctx_reload_pmcs[0])); + ctx->ctx_used_pmds[0])); - has_smpl = CTX_HAS_SMPL(ctx); /* * first we update the virtual counters @@ -5180,29 +5166,31 @@ pfm_overflow_handler(struct task_struct /* skip pmd which did not overflow */ if ((mask & 0x1) == 0) continue; - DPRINT_ovfl(("pmd[%d] overflowed hw_pmd=0x%lx ctx_pmd=0x%lx\n", - i, ia64_get_pmd(i), ctx->ctx_pmds[i].val)); - /* * Note that the pmd is not necessarily 0 at this point as qualified events * may have happened before the PMU was frozen. The residual count is not * taken into consideration here but will be with any read of the pmd via * pfm_read_pmds(). 
*/ - old_val = ctx->ctx_pmds[i].val; - ctx->ctx_pmds[i].val += 1 + ovfl_val; + old_val = new_val = ctx->ctx_pmds[i].val; + new_val += 1 + ovfl_val; + ctx->ctx_pmds[i].val = new_val; /* * check for overflow condition */ - if (likely(old_val > ctx->ctx_pmds[i].val)) { + if (likely(old_val > new_val)) { ovfl_pmds |= 1UL << i; if (PMC_OVFL_NOTIFY(ctx, i)) ovfl_notify |= 1UL << i; } - DPRINT_ovfl(("ctx_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx ovfl_pmds=0x%lx ovfl_notify=0x%lx smpl_pmds=0x%lx\n", - i, ctx->ctx_pmds[i].val, old_val, - ia64_get_pmd(i) & ovfl_val, ovfl_pmds, ovfl_notify, smpl_pmds)); + DPRINT_ovfl(("ctx_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx ovfl_pmds=0x%lx ovfl_notify=0x%lx\n", + i, + new_val, + old_val, + ia64_get_pmd(i) & ovfl_val, + ovfl_pmds, + ovfl_notify)); } /* @@ -5214,6 +5202,7 @@ pfm_overflow_handler(struct task_struct * reset all control bits */ ovfl_ctrl.val = 0; + reset_pmds = 0UL; /* * if a sampling format module exists, then we "cache" the overflow by @@ -5225,7 +5214,7 @@ pfm_overflow_handler(struct task_struct int j, k, ret = 0; int this_cpu = smp_processor_id(); - pmd_mask = ovfl_pmds >> PMU_FIRST_COUNTER; + pmd_mask = ovfl_pmds >> PMU_FIRST_COUNTER; prefetch(ctx->ctx_smpl_hdr); @@ -5275,7 +5264,10 @@ pfm_overflow_handler(struct task_struct ovfl_ctrl.bits.notify_user |= ovfl_arg.ovfl_ctrl.bits.notify_user; ovfl_ctrl.bits.block_task |= ovfl_arg.ovfl_ctrl.bits.block_task; ovfl_ctrl.bits.mask_monitoring |= ovfl_arg.ovfl_ctrl.bits.mask_monitoring; - ovfl_ctrl.bits.reset_ovfl_pmds |= ovfl_arg.ovfl_ctrl.bits.reset_ovfl_pmds; /* yes or no */ + /* + * build the bitmask of pmds to reset now + */ + if (ovfl_arg.ovfl_ctrl.bits.reset_ovfl_pmds) reset_pmds |= mask; pfm_stats[this_cpu].pfm_smpl_handler_cycles += end_cycles - start_cycles; } @@ -5287,6 +5279,10 @@ pfm_overflow_handler(struct task_struct current->pid, pmd_mask<pid, + ovfl_pmds, + reset_pmds)); /* - * if we (still) have some overflowed PMD but no notification is requested - * then we 
use the short reset period. + * reset the requested PMD registers using the short reset values */ - if (ovfl_ctrl.bits.reset_ovfl_pmds) { - unsigned long bm = ovfl_pmds; + if (reset_pmds) { + unsigned long bm = reset_pmds; pfm_reset_regs(ctx, &bm, PFM_PMD_SHORT_RESET); } --- linux-2.6.4-rc1/arch/ia64/kernel/perfmon_default_smpl.c 2004-01-09 00:04:31.000000000 -0800 +++ 25/arch/ia64/kernel/perfmon_default_smpl.c 2004-02-29 13:07:38.000000000 -0800 @@ -178,6 +178,7 @@ default_handler(struct task_struct *task ent->tstamp = stamp; ent->cpu = smp_processor_id(); ent->set = arg->active_set; + ent->tgid = current->tgid; /* * selectively store PMDs in increasing index number --- linux-2.6.4-rc1/arch/ia64/kernel/process.c 2004-02-03 20:42:34.000000000 -0800 +++ 25/arch/ia64/kernel/process.c 2004-02-29 13:07:38.000000000 -0800 @@ -259,10 +259,12 @@ ia64_load_extra (struct task_struct *tas * * We get here through the following call chain: * - * - * sys_clone - * do_fork - * copy_thread + * from user-level: from kernel: + * + * + * sys_clone : + * do_fork do_fork + * copy_thread copy_thread * * This means that the stack layout is as follows: * @@ -276,9 +278,6 @@ ia64_load_extra (struct task_struct *tas * | | <-- sp (lowest addr) * +---------------------+ * - * Note: if we get called through kernel_thread() then the memory above "(highest addr)" - * is valid kernel stack memory that needs to be copied as well. - * * Observe that we copy the unat values that are in pt_regs and switch_stack. Spilling an * integer to address X causes bit N in ar.unat to be set to the NaT bit of the register, * with N=(X & 0x1ff)/8. 
Thus, copying the unat value preserves the NaT bits ONLY if the @@ -291,9 +290,9 @@ copy_thread (int nr, unsigned long clone unsigned long user_stack_base, unsigned long user_stack_size, struct task_struct *p, struct pt_regs *regs) { - unsigned long rbs, child_rbs, rbs_size, stack_offset, stack_top, stack_used; - struct switch_stack *child_stack, *stack; extern char ia64_ret_from_clone, ia32_ret_from_clone; + struct switch_stack *child_stack, *stack; + unsigned long rbs, child_rbs, rbs_size; struct pt_regs *child_ptregs; int retval = 0; @@ -306,16 +305,13 @@ copy_thread (int nr, unsigned long clone return 0; #endif - stack_top = (unsigned long) current + IA64_STK_OFFSET; stack = ((struct switch_stack *) regs) - 1; - stack_used = stack_top - (unsigned long) stack; - stack_offset = IA64_STK_OFFSET - stack_used; - child_stack = (struct switch_stack *) ((unsigned long) p + stack_offset); - child_ptregs = (struct pt_regs *) (child_stack + 1); + child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1; + child_stack = (struct switch_stack *) child_ptregs - 1; /* copy parent's switch_stack & pt_regs to child: */ - memcpy(child_stack, stack, stack_used); + memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack)); rbs = (unsigned long) current + IA64_RBS_OFFSET; child_rbs = (unsigned long) p + IA64_RBS_OFFSET; @@ -324,7 +320,7 @@ copy_thread (int nr, unsigned long clone /* copy the parent's register backing store to the child: */ memcpy((void *) child_rbs, (void *) rbs, rbs_size); - if (user_mode(child_ptregs)) { + if (likely(user_mode(child_ptregs))) { if ((clone_flags & CLONE_SETTLS) && !IS_IA32_PROCESS(regs)) child_ptregs->r13 = regs->r16; /* see sys_clone2() in entry.S */ if (user_stack_base) { @@ -341,14 +337,14 @@ copy_thread (int nr, unsigned long clone * been taken care of by the caller of sys_clone() * already. 
*/ - child_ptregs->r12 = (unsigned long) (child_ptregs + 1); /* kernel sp */ + child_ptregs->r12 = (unsigned long) child_ptregs - 16; /* kernel sp */ child_ptregs->r13 = (unsigned long) p; /* set `current' pointer */ } + child_stack->ar_bspstore = child_rbs + rbs_size; if (IS_IA32_PROCESS(regs)) child_stack->b0 = (unsigned long) &ia32_ret_from_clone; else child_stack->b0 = (unsigned long) &ia64_ret_from_clone; - child_stack->ar_bspstore = child_rbs + rbs_size; /* copy parts of thread_struct: */ p->thread.ksp = (unsigned long) child_stack - 16; @@ -358,8 +354,8 @@ copy_thread (int nr, unsigned long clone * therefore we must specify them explicitly here and not include them in * IA64_PSR_BITS_TO_CLEAR. */ - child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET) - & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP)); + child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET) + & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP)); /* * NOTE: The calling convention considers all floating point @@ -578,27 +574,43 @@ ia64_set_personality (struct elf64_hdr * pid_t kernel_thread (int (*fn)(void *), void *arg, unsigned long flags) { - struct task_struct *parent = current; - int result; - pid_t tid; + extern void ia64_invoke_kernel_thread_helper (void); + unsigned long *helper_fptr = (unsigned long *) &ia64_invoke_kernel_thread_helper; + struct { + struct switch_stack sw; + struct pt_regs pt; + } regs; + + memset(®s, 0, sizeof(regs)); + regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ + regs.pt.r1 = helper_fptr[1]; /* set GP */ + regs.pt.r9 = (unsigned long) fn; /* 1st argument */ + regs.pt.r11 = (unsigned long) arg; /* 2nd argument */ + /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. 
*/ + regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN; + regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */ + regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR); + regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET; + + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s.pt, 0, NULL, NULL); +} +EXPORT_SYMBOL(kernel_thread); - tid = clone(flags | CLONE_VM | CLONE_UNTRACED, 0); - if (parent != current) { +/* This gets called from kernel_thread() via ia64_invoke_thread_helper(). */ +int +kernel_thread_helper (int (*fn)(void *), void *arg) +{ #ifdef CONFIG_IA32_SUPPORT - if (IS_IA32_PROCESS(ia64_task_regs(current))) { - /* A kernel thread is always a 64-bit process. */ - current->thread.map_base = DEFAULT_MAP_BASE; - current->thread.task_size = DEFAULT_TASK_SIZE; - ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob); - ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1); - } -#endif - result = (*fn)(arg); - _exit(result); + if (IS_IA32_PROCESS(ia64_task_regs(current))) { + /* A kernel thread is always a 64-bit process. */ + current->thread.map_base = DEFAULT_MAP_BASE; + current->thread.task_size = DEFAULT_TASK_SIZE; + ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob); + ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1); } - return tid; +#endif + return (*fn)(arg); } -EXPORT_SYMBOL(kernel_thread); /* * Flush thread state. This is called when a thread does an execve(). --- linux-2.6.4-rc1/arch/ia64/lib/dec_and_lock.c 2004-01-09 00:04:31.000000000 -0800 +++ 25/arch/ia64/lib/dec_and_lock.c 2004-02-29 13:09:28.000000000 -0800 @@ -13,6 +13,7 @@ #include #include +#ifndef CONFIG_LOCKMETER /* * Decrement REFCOUNT and if the count reaches zero, acquire the spinlock. 
Both of these * operations have to be done atomically, so that the count doesn't drop to zero without @@ -40,3 +41,4 @@ atomic_dec_and_lock (atomic_t *refcount, } EXPORT_SYMBOL(atomic_dec_and_lock); +#endif --- linux-2.6.4-rc1/arch/ia64/lib/swiotlb.c 2003-06-14 12:18:29.000000000 -0700 +++ 25/arch/ia64/lib/swiotlb.c 2004-02-29 13:07:52.000000000 -0800 @@ -47,7 +47,7 @@ #define IO_TLB_SHIFT 11 /* - * Used to do a quick range check in swiotlb_unmap_single and swiotlb_sync_single, to see + * Used to do a quick range check in swiotlb_unmap_single and swiotlb_sync_single_*, to see * if the memory was in fact allocated by this API. */ static char *io_tlb_start, *io_tlb_end; @@ -381,11 +381,24 @@ swiotlb_unmap_single (struct device *hwd * * If you perform a swiotlb_map_single() but wish to interrogate the buffer using the cpu, * yet do not wish to teardown the PCI dma mapping, you must call this function before - * doing so. At the next point you give the PCI dma address back to the card, the device - * again owns the buffer. + * doing so. At the next point you give the PCI dma address back to the card, you must + * first perform a swiotlb_dma_sync_for_device, and then the device again owns the buffer */ void -swiotlb_sync_single (struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir) +swiotlb_sync_single_for_cpu (struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir) +{ + char *dma_addr = phys_to_virt(dev_addr); + + if (dir == DMA_NONE) + BUG(); + if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end) + sync_single(hwdev, dma_addr, size, dir); + else if (dir == DMA_FROM_DEVICE) + mark_clean(dma_addr, size); +} + +void +swiotlb_sync_single_for_device (struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir) { char *dma_addr = phys_to_virt(dev_addr); @@ -456,11 +469,24 @@ swiotlb_unmap_sg (struct device *hwdev, * Make physical memory consistent for a set of streaming mode DMA translations after a * transfer. 
* - * The same as swiotlb_dma_sync_single but for a scatter-gather list, same rules and + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules and * usage. */ void -swiotlb_sync_sg (struct device *hwdev, struct scatterlist *sg, int nelems, int dir) +swiotlb_sync_sg_for_cpu (struct device *hwdev, struct scatterlist *sg, int nelems, int dir) +{ + int i; + + if (dir == DMA_NONE) + BUG(); + + for (i = 0; i < nelems; i++, sg++) + if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg)) + sync_single(hwdev, (void *) sg->dma_address, sg->dma_length, dir); +} + +void +swiotlb_sync_sg_for_device (struct device *hwdev, struct scatterlist *sg, int nelems, int dir) { int i; @@ -488,8 +514,10 @@ EXPORT_SYMBOL(swiotlb_map_single); EXPORT_SYMBOL(swiotlb_unmap_single); EXPORT_SYMBOL(swiotlb_map_sg); EXPORT_SYMBOL(swiotlb_unmap_sg); -EXPORT_SYMBOL(swiotlb_sync_single); -EXPORT_SYMBOL(swiotlb_sync_sg); +EXPORT_SYMBOL(swiotlb_sync_single_for_cpu); +EXPORT_SYMBOL(swiotlb_sync_single_for_device); +EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu); +EXPORT_SYMBOL(swiotlb_sync_sg_for_device); EXPORT_SYMBOL(swiotlb_alloc_coherent); EXPORT_SYMBOL(swiotlb_free_coherent); EXPORT_SYMBOL(swiotlb_dma_supported); --- linux-2.6.4-rc1/arch/ia64/mm/hugetlbpage.c 2004-01-09 00:04:31.000000000 -0800 +++ 25/arch/ia64/mm/hugetlbpage.c 2004-02-29 13:07:38.000000000 -0800 @@ -1,7 +1,11 @@ /* * IA-64 Huge TLB Page Support for Kernel. 
* - * Copyright (C) 2002, Rohit Seth + * Copyright (C) 2002-2004 Rohit Seth + * Copyright (C) 2003-2004 Ken Chen + * + * Sep, 2003: add numa support + * Feb, 2004: dynamic hugetlb page size via boot parameter */ #include @@ -18,11 +22,10 @@ #include #include -#define TASK_HPAGE_BASE (REGION_HPAGE << REGION_SHIFT) - static long htlbpagemem; int htlbpage_max; static long htlbzone_pages; +unsigned int hpage_shift=HPAGE_SHIFT_DEFAULT; static struct list_head hugepage_freelists[MAX_NUMNODES]; static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED; @@ -407,7 +410,7 @@ unsigned long hugetlb_get_unmapped_area( return -EINVAL; /* This code assumes that REGION_HPAGE != 0. */ if ((REGION_NUMBER(addr) != REGION_HPAGE) || (addr & (HPAGE_SIZE - 1))) - addr = TASK_HPAGE_BASE; + addr = HPAGE_REGION_BASE; else addr = ALIGN(addr, HPAGE_SIZE); for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) { @@ -520,6 +523,35 @@ static int __init hugetlb_setup(char *s) } __setup("hugepages=", hugetlb_setup); +static int __init hugetlb_setup_sz(char *str) +{ + u64 tr_pages; + unsigned long long size; + + if (ia64_pal_vm_page_size(&tr_pages, NULL) != 0) + /* + * shouldn't happen, but just in case. + */ + tr_pages = 0x15557000UL; + + size = memparse(str, &str); + if (*str || (size & (size-1)) || !(tr_pages & size) || + size <= PAGE_SIZE || + size >= (1UL << PAGE_SHIFT << MAX_ORDER)) { + printk(KERN_WARNING "Invalid huge page size specified\n"); + return 1; + } + + hpage_shift = __ffs(size); + /* + * boot cpu already executed ia64_mmu_init, and has HPAGE_SHIFT_DEFAULT + * override here with new page shift. 
+ */ + ia64_set_rr(HPAGE_REGION_BASE, hpage_shift << 2); + return 1; +} +__setup("hugepagesz=", hugetlb_setup_sz); + static int __init hugetlb_init(void) { int i; @@ -540,7 +572,7 @@ static int __init hugetlb_init(void) printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem); return 0; } -module_init(hugetlb_init); +__initcall(hugetlb_init); int hugetlb_report_meminfo(char *buf) { --- linux-2.6.4-rc1/arch/ia64/mm/init.c 2004-02-03 20:42:34.000000000 -0800 +++ 25/arch/ia64/mm/init.c 2004-02-29 13:07:38.000000000 -0800 @@ -342,6 +342,10 @@ ia64_mmu_init (void *my_cpu_data) ia64_tlb_init(); +#ifdef CONFIG_HUGETLB_PAGE + ia64_set_rr(HPAGE_REGION_BASE, HPAGE_SHIFT << 2); +#endif + #ifdef CONFIG_IA64_MCA cpu = smp_processor_id(); --- linux-2.6.4-rc1/arch/ia64/pci/pci.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/ia64/pci/pci.c 2004-02-29 13:08:34.000000000 -0800 @@ -57,17 +57,16 @@ struct pci_fixup pcibios_fixups[1]; ((u64)(seg << 24) | (u64)(bus << 16) | \ (u64)(devfn << 8) | (u64)(reg)) - static int pci_sal_read (int seg, int bus, int devfn, int reg, int len, u32 *value) { int result = 0; u64 data = 0; - if (!value || (seg > 255) || (bus > 255) || (devfn > 255) || (reg > 255)) + if ((seg > 255) || (bus > 255) || (devfn > 255) || (reg > 255)) return -EINVAL; - result = ia64_sal_pci_config_read(PCI_SAL_ADDRESS(seg, bus, devfn, reg), len, &data); + result = ia64_sal_pci_config_read(PCI_SAL_ADDRESS(seg, bus, devfn, reg), 0, len, &data); *value = (u32) data; @@ -80,15 +79,61 @@ pci_sal_write (int seg, int bus, int dev if ((seg > 255) || (bus > 255) || (devfn > 255) || (reg > 255)) return -EINVAL; - return ia64_sal_pci_config_write(PCI_SAL_ADDRESS(seg, bus, devfn, reg), len, value); + return ia64_sal_pci_config_write(PCI_SAL_ADDRESS(seg, bus, devfn, reg), 0, len, value); } -struct pci_raw_ops pci_sal_ops = { +static struct pci_raw_ops pci_sal_ops = { .read = pci_sal_read, .write = pci_sal_write }; -struct pci_raw_ops *raw_pci_ops = &pci_sal_ops; /* default to SAL */ 
+/* SAL 3.2 adds support for extended config space. */ + +#define PCI_SAL_EXT_ADDRESS(seg, bus, devfn, reg) \ + ((u64)(seg << 28) | (u64)(bus << 20) | \ + (u64)(devfn << 12) | (u64)(reg)) + +static int +pci_sal_ext_read (int seg, int bus, int devfn, int reg, int len, u32 *value) +{ + int result = 0; + u64 data = 0; + + if ((seg > 65535) || (bus > 255) || (devfn > 255) || (reg > 4095)) + return -EINVAL; + + result = ia64_sal_pci_config_read(PCI_SAL_EXT_ADDRESS(seg, bus, devfn, reg), 1, len, &data); + + *value = (u32) data; + + return result; +} + +static int +pci_sal_ext_write (int seg, int bus, int devfn, int reg, int len, u32 value) +{ + if ((seg > 65535) || (bus > 255) || (devfn > 255) || (reg > 4095)) + return -EINVAL; + + return ia64_sal_pci_config_write(PCI_SAL_EXT_ADDRESS(seg, bus, devfn, reg), 1, len, value); +} + +static struct pci_raw_ops pci_sal_ext_ops = { + .read = pci_sal_ext_read, + .write = pci_sal_ext_write +}; + +struct pci_raw_ops *raw_pci_ops = &pci_sal_ops; /* default to SAL < 3.2 */ + +static int __init pci_set_sal_ops(void) +{ + if (sal_check_revision(3, 2)) { + raw_pci_ops = &pci_sal_ext_ops; + } + return 0; +} + +arch_initcall(pci_set_sal_ops); static int --- linux-2.6.4-rc1/arch/ia64/sn/io/machvec/pci_dma.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/ia64/sn/io/machvec/pci_dma.c 2004-02-29 13:07:52.000000000 -0800 @@ -437,7 +437,8 @@ sn_pci_unmap_single(struct pci_dev *hwde } /** - * sn_pci_dma_sync_single - make sure all DMAs have completed + * sn_pci_dma_sync_single_* - make sure all DMAs or CPU accesses + * have completed * @hwdev: device to sync * @dma_handle: DMA address to sync * @size: size of region @@ -448,14 +449,19 @@ sn_pci_unmap_single(struct pci_dev *hwde * anything on our platform. 
*/ void -sn_pci_dma_sync_single(struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, int direction) +sn_pci_dma_sync_single_for_cpu(struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, int direction) { return; +} +void +sn_pci_dma_sync_single_for_device(struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, int direction) +{ + return; } /** - * sn_pci_dma_sync_sg - make sure all DMAs have completed + * sn_pci_dma_sync_sg_* - make sure all DMAs or CPU accesses have completed * @hwdev: device to sync * @sg: scatterlist to sync * @nents: number of entries in the scatterlist @@ -466,10 +472,15 @@ sn_pci_dma_sync_single(struct pci_dev *h * on our platform. */ void -sn_pci_dma_sync_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction) +sn_pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction) { return; +} +void +sn_pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction) +{ + return; } /** @@ -602,28 +613,51 @@ sn_dma_unmap_sg(struct device *dev, stru EXPORT_SYMBOL(sn_dma_unmap_sg); void -sn_dma_sync_single(struct device *dev, dma_addr_t dma_handle, size_t size, +sn_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, + int direction) +{ + BUG_ON(dev->bus != &pci_bus_type); + + sn_pci_dma_sync_single_for_cpu(to_pci_dev(dev), dma_handle, size, (int)direction); +} +EXPORT_SYMBOL(sn_dma_sync_single_for_cpu); + +void +sn_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) { BUG_ON(dev->bus != &pci_bus_type); - sn_pci_dma_sync_single(to_pci_dev(dev), dma_handle, size, (int)direction); + sn_pci_dma_sync_single_for_device(to_pci_dev(dev), dma_handle, size, (int)direction); +} +EXPORT_SYMBOL(sn_dma_sync_single_for_device); + +void +sn_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, + int direction) +{ + BUG_ON(dev->bus != &pci_bus_type); + + 
sn_pci_dma_sync_sg_for_cpu(to_pci_dev(dev), sg, nelems, (int)direction); } -EXPORT_SYMBOL(sn_dma_sync_single); +EXPORT_SYMBOL(sn_dma_sync_sg_for_cpu); void -sn_dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems, +sn_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, int direction) { BUG_ON(dev->bus != &pci_bus_type); - sn_pci_dma_sync_sg(to_pci_dev(dev), sg, nelems, (int)direction); + sn_pci_dma_sync_sg_for_device(to_pci_dev(dev), sg, nelems, (int)direction); } -EXPORT_SYMBOL(sn_dma_sync_sg); +EXPORT_SYMBOL(sn_dma_sync_sg_for_device); EXPORT_SYMBOL(sn_pci_unmap_single); EXPORT_SYMBOL(sn_pci_map_single); -EXPORT_SYMBOL(sn_pci_dma_sync_single); +EXPORT_SYMBOL(sn_pci_dma_sync_single_for_cpu); +EXPORT_SYMBOL(sn_pci_dma_sync_single_for_device); +EXPORT_SYMBOL(sn_pci_dma_sync_sg_for_cpu); +EXPORT_SYMBOL(sn_pci_dma_sync_sg_for_device); EXPORT_SYMBOL(sn_pci_map_sg); EXPORT_SYMBOL(sn_pci_unmap_sg); EXPORT_SYMBOL(sn_pci_alloc_consistent); --- linux-2.6.4-rc1/arch/m68k/kernel/entry.S 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/m68k/kernel/entry.S 2004-02-29 13:08:05.000000000 -0800 @@ -528,7 +528,7 @@ sys_call_table: .long sys_ni_syscall /* old profil syscall holder */ .long sys_statfs .long sys_fstatfs /* 100 */ - .long sys_ioperm + .long sys_ni_syscall /* ioperm for i386 */ .long sys_socketcall .long sys_syslog .long sys_setitimer --- linux-2.6.4-rc1/arch/m68k/kernel/sys_m68k.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/m68k/kernel/sys_m68k.c 2004-02-29 13:08:05.000000000 -0800 @@ -261,12 +261,6 @@ asmlinkage int sys_ipc (uint call, int f return -EINVAL; } -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on) -{ - return -ENOSYS; -} - - /* Convert virtual (user) address VADDR to physical address PADDR */ #define virt_to_phys_040(vaddr) \ ({ \ --- linux-2.6.4-rc1/arch/m68knommu/kernel/syscalltable.S 2003-06-14 12:17:55.000000000 -0700 +++ 25/arch/m68knommu/kernel/syscalltable.S 2004-02-29 
13:08:05.000000000 -0800 @@ -120,7 +120,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* old profil syscall holder */ .long sys_statfs .long sys_fstatfs /* 100 */ - .long sys_ioperm + .long sys_ni_syscall /* ioperm for i386 */ .long sys_socketcall .long sys_syslog .long sys_setitimer --- linux-2.6.4-rc1/arch/m68knommu/kernel/sys_m68k.c 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/m68knommu/kernel/sys_m68k.c 2004-02-29 13:08:05.000000000 -0800 @@ -193,12 +193,6 @@ asmlinkage int sys_ipc (uint call, int f return -EINVAL; } -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on) -{ - return -ENOSYS; -} - - /* sys_cacheflush -- flush (part of) the processor cache. */ asmlinkage int sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len) --- linux-2.6.4-rc1/arch/mips/au1000/csb250/Makefile 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/mips/au1000/csb250/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -4,10 +4,6 @@ # # Makefile for the Cogent CSB250 Au1500 board. Copied from Pb1500. # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# USE_STANDARD_AS_RULE := true --- linux-2.6.4-rc1/arch/mips/au1000/hydrogen3/Makefile 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/mips/au1000/hydrogen3/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -5,10 +5,6 @@ # # Makefile for the Alchemy Semiconductor PB1000 board. # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# .S.s: $(CPP) $(CFLAGS) $< -o $*.s --- linux-2.6.4-rc1/arch/mips/au1000/mtx-1/Makefile 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/mips/au1000/mtx-1/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -6,9 +6,5 @@ # # Makefile for 4G Systems MTX-1 board. # -# Note! 
Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# lib-y := init.o board_setup.o irqmap.o --- linux-2.6.4-rc1/arch/mips/au1000/pb1550/Makefile 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/mips/au1000/pb1550/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -5,10 +5,6 @@ # # Makefile for the Alchemy Semiconductor PB1000 board. # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# .S.s: $(CPP) $(CFLAGS) $< -o $*.s --- linux-2.6.4-rc1/arch/mips/au1000/xxs1500/Makefile 2004-02-27 16:17:18.000000000 -0800 +++ 25/arch/mips/au1000/xxs1500/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -5,9 +5,5 @@ # # Makefile for MyCable XXS1500 board. # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). 
-# lib-y := init.o board_setup.o irqmap.o --- linux-2.6.4-rc1/arch/mips/kernel/gdb-stub.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/mips/kernel/gdb-stub.c 2004-02-29 13:09:12.000000000 -0800 @@ -95,7 +95,7 @@ * Example: * $ cd ~/linux * $ make menuconfig - * $ make dep; make vmlinux + * $ make * * Step 3: * Download the kernel to the remote target and start --- linux-2.6.4-rc1/arch/mips/kernel/i8259.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/mips/kernel/i8259.c 2004-02-29 13:07:44.000000000 -0800 @@ -242,7 +242,7 @@ static int __init i8259A_init_sysfs(void { int error = sysdev_class_register(&i8259_sysdev_class); if (!error) - error = sys_device_register(&device_i8259A); + error = sysdev_register(&device_i8259A); return error; } --- linux-2.6.4-rc1/arch/mips/mm/dma-coherent.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/mips/mm/dma-coherent.c 2004-02-29 13:07:52.000000000 -0800 @@ -119,30 +119,55 @@ void dma_unmap_sg(struct device *dev, st EXPORT_SYMBOL(dma_unmap_sg); -void dma_sync_single(struct device *dev, dma_addr_t dma_handle, size_t size, +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction) { BUG_ON(direction == DMA_NONE); } -EXPORT_SYMBOL(dma_sync_single); +EXPORT_SYMBOL(dma_sync_single_for_cpu); -void dma_sync_single_range(struct device *dev, dma_addr_t dma_handle, +void dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); +} + +EXPORT_SYMBOL(dma_sync_single_for_device); + +void dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, + unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); +} + +EXPORT_SYMBOL(dma_sync_single_range_for_cpu); + +void dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, unsigned long offset, size_t size, enum dma_data_direction direction) { 
BUG_ON(direction == DMA_NONE); } -EXPORT_SYMBOL(dma_sync_single_range); +EXPORT_SYMBOL(dma_sync_single_range_for_device); -void dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems, +void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction direction) { BUG_ON(direction == DMA_NONE); } -EXPORT_SYMBOL(dma_sync_sg); +EXPORT_SYMBOL(dma_sync_sg_for_cpu); + +void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, + enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); +} + +EXPORT_SYMBOL(dma_sync_sg_for_device); int dma_supported(struct device *dev, u64 mask) { @@ -204,12 +229,20 @@ unsigned long pci_dac_dma_to_offset(stru EXPORT_SYMBOL(pci_dac_dma_to_offset); -void pci_dac_dma_sync_single(struct pci_dev *pdev, +void pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, + dma64_addr_t dma_addr, size_t len, int direction) +{ + BUG_ON(direction == PCI_DMA_NONE); +} + +EXPORT_SYMBOL(pci_dac_dma_sync_single_for_cpu); + +void pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction) { BUG_ON(direction == PCI_DMA_NONE); } -EXPORT_SYMBOL(pci_dac_dma_sync_single); +EXPORT_SYMBOL(pci_dac_dma_sync_single_for_device); #endif /* CONFIG_PCI */ --- linux-2.6.4-rc1/arch/mips/mm/dma-ip27.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/mips/mm/dma-ip27.c 2004-02-29 13:07:52.000000000 -0800 @@ -125,30 +125,55 @@ void dma_unmap_sg(struct device *dev, st EXPORT_SYMBOL(dma_unmap_sg); -void dma_sync_single(struct device *dev, dma_addr_t dma_handle, size_t size, +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction) { BUG_ON(direction == DMA_NONE); } -EXPORT_SYMBOL(dma_sync_single); +EXPORT_SYMBOL(dma_sync_single_for_cpu); -void dma_sync_single_range(struct device *dev, dma_addr_t dma_handle, +void dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t 
size, + enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); +} + +EXPORT_SYMBOL(dma_sync_single_for_device); + +void dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, + unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); +} + +EXPORT_SYMBOL(dma_sync_single_range_for_cpu); + +void dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, unsigned long offset, size_t size, enum dma_data_direction direction) { BUG_ON(direction == DMA_NONE); } -EXPORT_SYMBOL(dma_sync_single_range); +EXPORT_SYMBOL(dma_sync_single_range_for_device); -void dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems, +void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction direction) { BUG_ON(direction == DMA_NONE); } -EXPORT_SYMBOL(dma_sync_sg); +EXPORT_SYMBOL(dma_sync_sg_for_cpu); + +void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, + enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); +} + +EXPORT_SYMBOL(dma_sync_sg_for_device); int dma_supported(struct device *dev, u64 mask) { @@ -208,10 +233,18 @@ unsigned long pci_dac_dma_to_offset(stru EXPORT_SYMBOL(pci_dac_dma_to_offset); -void pci_dac_dma_sync_single(struct pci_dev *pdev, +void pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, + dma64_addr_t dma_addr, size_t len, int direction) +{ + BUG_ON(direction == PCI_DMA_NONE); +} + +EXPORT_SYMBOL(pci_dac_dma_sync_single_for_cpu); + +void pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction) { BUG_ON(direction == PCI_DMA_NONE); } -EXPORT_SYMBOL(pci_dac_dma_sync_single); +EXPORT_SYMBOL(pci_dac_dma_sync_single_for_device); --- linux-2.6.4-rc1/arch/mips/mm/dma-noncoherent.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/mips/mm/dma-noncoherent.c 2004-02-29 13:07:52.000000000 -0800 @@ -226,7 +226,7 @@ void 
dma_unmap_sg(struct device *dev, st EXPORT_SYMBOL(dma_unmap_sg); -void dma_sync_single(struct device *dev, dma_addr_t dma_handle, size_t size, +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction) { unsigned long addr; @@ -237,9 +237,35 @@ void dma_sync_single(struct device *dev, __dma_sync(addr, size, direction); } -EXPORT_SYMBOL(dma_sync_single); +EXPORT_SYMBOL(dma_sync_single_for_cpu); -void dma_sync_single_range(struct device *dev, dma_addr_t dma_handle, +void dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + unsigned long addr; + + BUG_ON(direction == DMA_NONE); + + addr = dma_handle + PAGE_OFFSET; + __dma_sync(addr, size, direction); +} + +EXPORT_SYMBOL(dma_sync_single_for_device); + +void dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, + unsigned long offset, size_t size, enum dma_data_direction direction) +{ + unsigned long addr; + + BUG_ON(direction == DMA_NONE); + + addr = dma_handle + offset + PAGE_OFFSET; + __dma_sync(addr, size, direction); +} + +EXPORT_SYMBOL(dma_sync_single_range_for_cpu); + +void dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, unsigned long offset, size_t size, enum dma_data_direction direction) { unsigned long addr; @@ -250,9 +276,9 @@ void dma_sync_single_range(struct device __dma_sync(addr, size, direction); } -EXPORT_SYMBOL(dma_sync_single_range); +EXPORT_SYMBOL(dma_sync_single_range_for_device); -void dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems, +void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction direction) { int i; @@ -265,7 +291,22 @@ void dma_sync_sg(struct device *dev, str sg->length, direction); } -EXPORT_SYMBOL(dma_sync_sg); +EXPORT_SYMBOL(dma_sync_sg_for_cpu); + +void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, + enum 
dma_data_direction direction) +{ + int i; + + BUG_ON(direction == DMA_NONE); + + /* Make sure that gcc doesn't leave the empty loop body. */ + for (i = 0; i < nelems; i++, sg++) + __dma_sync((unsigned long)page_address(sg->page), + sg->length, direction); +} + +EXPORT_SYMBOL(dma_sync_sg_for_device); int dma_supported(struct device *dev, u64 mask) { @@ -329,7 +370,17 @@ unsigned long pci_dac_dma_to_offset(stru EXPORT_SYMBOL(pci_dac_dma_to_offset); -void pci_dac_dma_sync_single(struct pci_dev *pdev, +void pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, + dma64_addr_t dma_addr, size_t len, int direction) +{ + BUG_ON(direction == PCI_DMA_NONE); + + dma_cache_wback_inv(dma_addr + PAGE_OFFSET, len); +} + +EXPORT_SYMBOL(pci_dac_dma_sync_single_for_cpu); + +void pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction) { BUG_ON(direction == PCI_DMA_NONE); @@ -337,6 +388,6 @@ void pci_dac_dma_sync_single(struct pci_ dma_cache_wback_inv(dma_addr + PAGE_OFFSET, len); } -EXPORT_SYMBOL(pci_dac_dma_sync_single); +EXPORT_SYMBOL(pci_dac_dma_sync_single_for_device); #endif /* CONFIG_PCI */ --- linux-2.6.4-rc1/arch/mips/momentum/jaguar_atx/Makefile 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/mips/momentum/jaguar_atx/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for Momentum Computer's Jaguar-ATX board. # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y += mv-irq.o int-handler.o irq.o pci-irq.o prom.o reset.o setup.o obj-$(CONFIG_PCI) += pci.o --- linux-2.6.4-rc1/arch/mips/tx4927/common/Makefile 2003-07-02 14:53:13.000000000 -0700 +++ 25/arch/mips/tx4927/common/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for common code for Toshiba TX4927 based systems # -# Note! 
Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := tx4927_prom.o obj-y += tx4927_setup.o --- linux-2.6.4-rc1/arch/parisc/kernel/drivers.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/parisc/kernel/drivers.c 2004-02-29 13:07:56.000000000 -0800 @@ -618,6 +618,7 @@ static void parisc_generic_device_regist tmp1); /* make the generic dma mask a pointer to the parisc one */ dev->dev.dma_mask = &dev->dma_mask; + dev->dev.coherent_dma_mask = dev->dma_mask; pr_debug("device_register(%s)\n", dev->dev.bus_id); device_register(&dev->dev); } --- linux-2.6.4-rc1/arch/parisc/kernel/pci-dma.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/parisc/kernel/pci-dma.c 2004-02-29 13:07:56.000000000 -0800 @@ -372,7 +372,7 @@ static void * pa11_dma_alloc_consistent ** ISA cards will certainly only support 24-bit DMA addressing. ** Not clear if we can, want, or need to support ISA. */ - if (!dev || *dev->dma_mask != 0xffffffff) + if (!dev || dev->coherent_dma_mask < 0xffffffff) gfp |= GFP_DMA; #endif return (void *)vaddr; @@ -413,7 +413,7 @@ static void pa11_dma_unmap_single(struct /* * For PCI_DMA_FROMDEVICE this flush is not necessary for the * simple map/unmap case. However, it IS necessary if if - * pci_dma_sync_single has been called and the buffer reused. + * pci_dma_sync_single_* has been called and the buffer reused. 
*/ flush_kernel_dcache_range((unsigned long) phys_to_virt(dma_handle), size); @@ -453,7 +453,7 @@ static void pa11_dma_unmap_sg(struct dev return; } -static void pa11_dma_sync_single(struct device *dev, dma_addr_t dma_handle, unsigned long offset, size_t size, enum dma_data_direction direction) +static void pa11_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, unsigned long offset, size_t size, enum dma_data_direction direction) { if (direction == DMA_NONE) BUG(); @@ -461,7 +461,25 @@ static void pa11_dma_sync_single(struct flush_kernel_dcache_range((unsigned long) phys_to_virt(dma_handle) + offset, size); } -static void pa11_dma_sync_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) +static void pa11_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, unsigned long offset, size_t size, enum dma_data_direction direction) +{ + if (direction == DMA_NONE) + BUG(); + + flush_kernel_dcache_range((unsigned long) phys_to_virt(dma_handle) + offset, size); +} + +static void pa11_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) +{ + int i; + + /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ + + for (i = 0; i < nents; i++, sglist++ ) + flush_kernel_dcache_range(sg_virt_addr(sglist), sglist->length); +} + +static void pa11_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; @@ -480,8 +498,10 @@ struct hppa_dma_ops pcxl_dma_ops = { .unmap_single = pa11_dma_unmap_single, .map_sg = pa11_dma_map_sg, .unmap_sg = pa11_dma_unmap_sg, - .dma_sync_single = pa11_dma_sync_single, - .dma_sync_sg = pa11_dma_sync_sg, + .dma_sync_single_for_cpu = pa11_dma_sync_single_for_cpu, + .dma_sync_single_for_device = pa11_dma_sync_single_for_device, + .dma_sync_sg_for_cpu = pa11_dma_sync_sg_for_cpu, + .dma_sync_sg_for_device = 
pa11_dma_sync_sg_for_device, }; static void *fail_alloc_consistent(struct device *dev, size_t size, @@ -519,8 +539,10 @@ struct hppa_dma_ops pcx_dma_ops = { .unmap_single = pa11_dma_unmap_single, .map_sg = pa11_dma_map_sg, .unmap_sg = pa11_dma_unmap_sg, - .dma_sync_single = pa11_dma_sync_single, - .dma_sync_sg = pa11_dma_sync_sg, + .dma_sync_single_for_cpu = pa11_dma_sync_single_for_cpu, + .dma_sync_single_for_device = pa11_dma_sync_single_for_device, + .dma_sync_sg_for_cpu = pa11_dma_sync_sg_for_cpu, + .dma_sync_sg_for_device = pa11_dma_sync_sg_for_device, }; --- linux-2.6.4-rc1/arch/parisc/kernel/process.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/parisc/kernel/process.c 2004-02-29 13:09:04.000000000 -0800 @@ -1,5 +1,3 @@ -/* - * PARISC Architecture-dependent parts of process handling * based on the work for i386 * * Copyright (C) 1999-2003 Matthew Wilcox @@ -32,7 +30,6 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#define __KERNEL_SYSCALLS__ #include #include --- linux-2.6.4-rc1/arch/parisc/kernel/smp.c 2004-01-09 00:04:31.000000000 -0800 +++ 25/arch/parisc/kernel/smp.c 2004-02-29 13:09:04.000000000 -0800 @@ -16,7 +16,6 @@ ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. */ -#define __KERNEL_SYSCALLS__ #undef ENTRY_SYS_CPUS /* syscall support for iCOD-like functionality */ #include --- linux-2.6.4-rc1/arch/parisc/kernel/sys_parisc.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/parisc/kernel/sys_parisc.c 2004-02-29 13:08:05.000000000 -0800 @@ -242,14 +242,6 @@ asmlinkage ssize_t parisc_readahead(int return sys_readahead(fd, (loff_t)high << 32 | low, count); } -/* - * This changes the io permissions bitmap in the current task. 
- */ -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) -{ - return -ENOSYS; -} - asmlinkage unsigned long sys_alloc_hugepages(int key, unsigned long addr, unsigned long len, int prot, int flag) { return -ENOMEM; --- linux-2.6.4-rc1/arch/ppc64/kernel/misc.S 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/ppc64/kernel/misc.S 2004-02-29 13:09:04.000000000 -0800 @@ -582,17 +582,7 @@ _GLOBAL(name) \ li r3,-1; \ blr -#define __NR__exit __NR_exit - -SYSCALL(setsid) -SYSCALL(open) -SYSCALL(read) -SYSCALL(write) -SYSCALL(lseek) -SYSCALL(close) -SYSCALL(dup) SYSCALL(execve) -SYSCALL(waitpid) #ifdef CONFIG_PPC_ISERIES /* hack hack hack */ #define ppc_rtas sys_ni_syscall --- linux-2.6.4-rc1/arch/ppc64/kernel/pmac_smp.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/ppc64/kernel/pmac_smp.c 2004-02-29 13:09:04.000000000 -0800 @@ -29,8 +29,6 @@ #include #include #include -#define __KERNEL_SYSCALLS__ -#include #include #include #include --- linux-2.6.4-rc1/arch/ppc64/kernel/prom.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/ppc64/kernel/prom.c 2004-02-29 13:07:38.000000000 -0800 @@ -516,6 +516,9 @@ prom_initialize_naca(unsigned long mem) return mem; } +#ifdef CONFIG_PMAC_DART +static int dart_force_on; +#endif static unsigned long __init prom_initialize_lmb(unsigned long mem) @@ -539,10 +542,12 @@ prom_initialize_lmb(unsigned long mem) prom_print(opt); prom_print(RELOC("\n")); opt += 6; - while(*opt && *opt == ' ') + while (*opt && *opt == ' ') opt++; if (!strncmp(opt, RELOC("off"), 3)) nodart = 1; + else if (!strncmp(opt, RELOC("force"), 5)) + RELOC(dart_force_on) = 1; } #else nodart = 1; @@ -763,8 +768,10 @@ void prom_initialize_dart_table(void) extern unsigned long dart_tablebase; extern unsigned long dart_tablesize; - /* Only reserve DART space if machine has more than 2Gb of RAM */ - if (lmb_end_of_DRAM() <= 0x80000000ull) + /* Only reserve DART space if machine has more than 2GB of RAM + * or if requested with iommu=force on cmdline. 
+ */ + if (lmb_end_of_DRAM() <= 0x80000000ull && !RELOC(dart_force_on)) return; /* 512 pages is max DART tablesize. */ --- linux-2.6.4-rc1/arch/ppc64/kernel/pSeries_htab.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/ppc64/kernel/pSeries_htab.c 2004-02-29 13:08:03.000000000 -0800 @@ -103,7 +103,7 @@ long pSeries_hpte_insert(unsigned long h __asm__ __volatile__ ("ptesync" : : : "memory"); - return i; + return i | (secondary << 3); } static long pSeries_hpte_remove(unsigned long hpte_group) --- linux-2.6.4-rc1/arch/ppc64/kernel/pSeries_lpar.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/ppc64/kernel/pSeries_lpar.c 2004-02-29 13:08:03.000000000 -0800 @@ -379,7 +379,10 @@ long pSeries_lpar_hpte_insert(unsigned l if (lpar_rc != H_Success) return -2; - return slot; + /* Because of iSeries, we have to pass down the secondary + * bucket bit here as well + */ + return (slot & 7) | (secondary << 3); } static spinlock_t pSeries_lpar_tlbie_lock = SPIN_LOCK_UNLOCKED; --- linux-2.6.4-rc1/arch/ppc64/mm/hash_low.S 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/ppc64/mm/hash_low.S 2004-02-29 13:08:03.000000000 -0800 @@ -176,7 +176,6 @@ _GLOBAL(htab_call_hpte_insert1) beq- htab_pte_insert_failure /* Now try secondary slot */ - ori r30,r30,_PAGE_SECONDARY /* page number in r5 */ rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT @@ -215,8 +214,8 @@ _GLOBAL(htab_call_hpte_remove) b htab_insert_pte htab_pte_insert_ok: - /* Insert slot number in PTE */ - rldimi r30,r3,12,63-14 + /* Insert slot number & secondary bit in PTE */ + rldimi r30,r3,12,63-15 /* Write out the PTE with a normal write * (maybe add eieio may be good still ?) 
--- linux-2.6.4-rc1/arch/ppc64/mm/numa.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/ppc64/mm/numa.c 2004-02-29 13:08:04.000000000 -0800 @@ -33,7 +33,10 @@ bootmem_data_t plat_node_bdata[MAX_NUMNO static unsigned long node0_io_hole_size; EXPORT_SYMBOL(node_data); +EXPORT_SYMBOL(numa_cpu_lookup_table); EXPORT_SYMBOL(numa_memory_lookup_table); +EXPORT_SYMBOL(numa_cpumask_lookup_table); +EXPORT_SYMBOL(nr_cpus_in_node); static inline void map_cpu_to_node(int cpu, int node) { --- linux-2.6.4-rc1/arch/ppc/boot/ld.script 2003-11-09 16:45:05.000000000 -0800 +++ 25/arch/ppc/boot/ld.script 2004-02-29 13:09:31.000000000 -0800 @@ -82,6 +82,7 @@ SECTIONS *(__ksymtab) *(__ksymtab_strings) *(__bug_table) + *(__kcrctab) } } --- linux-2.6.4-rc1/arch/ppc/kernel/misc.S 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/ppc/kernel/misc.S 2004-02-29 13:09:04.000000000 -0800 @@ -1108,17 +1108,7 @@ _GLOBAL(name) \ li r3,-1; \ blr -#define __NR__exit __NR_exit - -SYSCALL(setsid) -SYSCALL(open) -SYSCALL(read) -SYSCALL(write) -SYSCALL(lseek) -SYSCALL(close) -SYSCALL(dup) SYSCALL(execve) -SYSCALL(waitpid) /* Why isn't this a) automatic, b) written in 'C'? 
*/ .data --- linux-2.6.4-rc1/arch/ppc/kernel/ppc_ksyms.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/ppc/kernel/ppc_ksyms.c 2004-02-29 13:09:04.000000000 -0800 @@ -32,8 +32,6 @@ #include #include #include -#define __KERNEL_SYSCALLS__ -#include #include #include #include @@ -189,10 +187,6 @@ EXPORT_SYMBOL(consistent_sync); EXPORT_SYMBOL(flush_dcache_all); #endif -EXPORT_SYMBOL(open); -EXPORT_SYMBOL(read); -EXPORT_SYMBOL(lseek); -EXPORT_SYMBOL(close); EXPORT_SYMBOL(start_thread); EXPORT_SYMBOL(kernel_thread); --- linux-2.6.4-rc1/arch/ppc/kernel/smp.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/ppc/kernel/smp.c 2004-02-29 13:09:04.000000000 -0800 @@ -17,8 +17,6 @@ #include #include #include -#define __KERNEL_SYSCALLS__ -#include #include #include #include --- linux-2.6.4-rc1/arch/ppc/platforms/chrp_smp.c 2003-06-14 12:18:07.000000000 -0700 +++ 25/arch/ppc/platforms/chrp_smp.c 2004-02-29 13:09:04.000000000 -0800 @@ -16,8 +16,6 @@ #include #include #include -#define __KERNEL_SYSCALLS__ -#include #include #include --- linux-2.6.4-rc1/arch/ppc/platforms/pmac_pic.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/ppc/platforms/pmac_pic.c 2004-02-29 13:07:44.000000000 -0800 @@ -646,7 +646,7 @@ static int __init init_pmacpic_sysfs(voi printk(KERN_DEBUG "Registering pmac pic with sysfs...\n"); sysdev_class_register(&pmacpic_sysclass); - sys_device_register(&device_pmacpic); + sysdev_register(&device_pmacpic); sysdev_driver_register(&pmacpic_sysclass, &driver_pmacpic); return 0; } --- linux-2.6.4-rc1/arch/ppc/platforms/pmac_smp.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/ppc/platforms/pmac_smp.c 2004-02-29 13:09:04.000000000 -0800 @@ -29,8 +29,6 @@ #include #include #include -#define __KERNEL_SYSCALLS__ -#include #include #include #include --- linux-2.6.4-rc1/arch/ppc/syslib/open_pic2.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/ppc/syslib/open_pic2.c 2004-02-29 13:07:44.000000000 -0800 @@ -699,7 +699,7 @@ static int __init init_openpic2_sysfs(vo 
printk(KERN_ERR "Failed registering openpic sys class\n"); return -ENODEV; } - rc = sys_device_register(&device_openpic2); + rc = sysdev_register(&device_openpic2); if (rc) { printk(KERN_ERR "Failed registering openpic sys device\n"); return -ENODEV; --- linux-2.6.4-rc1/arch/ppc/syslib/open_pic.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/ppc/syslib/open_pic.c 2004-02-29 13:07:44.000000000 -0800 @@ -1032,7 +1032,7 @@ static int __init init_openpic_sysfs(voi printk(KERN_ERR "Failed registering openpic sys class\n"); return -ENODEV; } - rc = sys_device_register(&device_openpic); + rc = sysdev_register(&device_openpic); if (rc) { printk(KERN_ERR "Failed registering openpic sys device\n"); return -ENODEV; --- linux-2.6.4-rc1/arch/s390/Kconfig 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/s390/Kconfig 2004-02-29 13:07:56.000000000 -0800 @@ -143,6 +143,11 @@ config COMPAT depends on S390_SUPPORT default y +config SYSVIPC_COMPAT + bool + depends on COMPAT && SYSVIPC + default y + config BINFMT_ELF32 tristate "Kernel support for 31 bit ELF binaries" depends on S390_SUPPORT --- linux-2.6.4-rc1/arch/s390/kernel/compat_linux.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/s390/kernel/compat_linux.c 2004-02-29 13:07:56.000000000 -0800 @@ -293,541 +293,6 @@ static inline long put_tv32(struct compa __put_user(i->tv_usec, &o->tv_usec))); } -struct msgbuf32 { s32 mtype; char mtext[1]; }; - -struct ipc64_perm_ds32 -{ - __kernel_key_t key; - __kernel_uid32_t uid; - __kernel_gid32_t gid; - __kernel_uid32_t cuid; - __kernel_gid32_t cgid; - compat_mode_t mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - unsigned int __unused1; - unsigned int __unused2; -}; - -struct ipc_perm32 -{ - key_t key; - compat_uid_t uid; - compat_gid_t gid; - compat_uid_t cuid; - compat_gid_t cgid; - compat_mode_t mode; - unsigned short seq; -}; - -struct semid_ds32 { - struct ipc_perm32 sem_perm; /* permissions .. 
see ipc.h */ - compat_time_t sem_otime; /* last semop time */ - compat_time_t sem_ctime; /* last change time */ - u32 sem_base; /* ptr to first semaphore in array */ - u32 sem_pending; /* pending operations to be processed */ - u32 sem_pending_last; /* last pending operation */ - u32 undo; /* undo requests on this array */ - unsigned short sem_nsems; /* no. of semaphores in array */ -}; - -struct semid64_ds32 { - struct ipc64_perm_ds32 sem_perm; - unsigned int __pad1; - compat_time_t sem_otime; - unsigned int __pad2; - compat_time_t sem_ctime; - u32 sem_nsems; - u32 __unused1; - u32 __unused2; -}; - -struct msqid_ds32 -{ - struct ipc_perm32 msg_perm; - u32 msg_first; - u32 msg_last; - compat_time_t msg_stime; - compat_time_t msg_rtime; - compat_time_t msg_ctime; - u32 wwait; - u32 rwait; - unsigned short msg_cbytes; - unsigned short msg_qnum; - unsigned short msg_qbytes; - compat_ipc_pid_t msg_lspid; - compat_ipc_pid_t msg_lrpid; -}; - -struct msqid64_ds32 { - struct ipc64_perm_ds32 msg_perm; - unsigned int __pad1; - compat_time_t msg_stime; - unsigned int __pad2; - compat_time_t msg_rtime; - unsigned int __pad3; - compat_time_t msg_ctime; - unsigned int msg_cbytes; - unsigned int msg_qnum; - unsigned int msg_qbytes; - compat_pid_t msg_lspid; - compat_pid_t msg_lrpid; - unsigned int __unused1; - unsigned int __unused2; -}; - - -struct shmid_ds32 { - struct ipc_perm32 shm_perm; - int shm_segsz; - compat_time_t shm_atime; - compat_time_t shm_dtime; - compat_time_t shm_ctime; - compat_ipc_pid_t shm_cpid; - compat_ipc_pid_t shm_lpid; - unsigned short shm_nattch; -}; - -struct shmid64_ds32 { - struct ipc64_perm_ds32 shm_perm; - compat_size_t shm_segsz; - compat_time_t shm_atime; - unsigned int __unused1; - compat_time_t shm_dtime; - unsigned int __unused2; - compat_time_t shm_ctime; - unsigned int __unused3; - compat_pid_t shm_cpid; - compat_pid_t shm_lpid; - unsigned int shm_nattch; - unsigned int __unused4; - unsigned int __unused5; -}; - -extern int sem_ctls[]; 
-#define sc_semopm (sem_ctls[2]) -#define SEMOPM_FAST 64 /* ~ 372 bytes on stack */ - -static long -do_sys32_semtimedop (int semid, struct sembuf *tsops, int nsops, - struct compat_timespec *timeout32) -{ - struct sembuf *sops, fast_sops[SEMOPM_FAST]; - struct timespec t; - mm_segment_t oldfs; - long ret; - - /* parameter checking precedence should mirror sys_semtimedop() */ - if (nsops < 1 || semid < 0) - return -EINVAL; - if (nsops > sc_semopm) - return -E2BIG; - if (nsops <= SEMOPM_FAST) - sops = fast_sops; - else { - sops = kmalloc(nsops * sizeof(*sops), GFP_KERNEL); - if (sops == NULL) - return -ENOMEM; - } - if (copy_from_user(sops, tsops, nsops * sizeof(*tsops)) || - get_compat_timespec(&t, timeout32)) - ret = -EFAULT; - else { - oldfs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_semtimedop(semid, sops, nsops, &t); - set_fs(oldfs); - } - if (sops != fast_sops) - kfree(sops); - return ret; -} - -#define IPCOP_MASK(__x) (1UL << (__x)) -static int do_sys32_semctl(int first, int second, int third, void *uptr) -{ - union semun fourth; - u32 pad; - int err = -EINVAL; - - if (!uptr) - goto out; - err = -EFAULT; - if (get_user (pad, (u32 *)uptr)) - goto out; - if(third == SETVAL) - fourth.val = (int)pad; - else - fourth.__pad = (void *)A(pad); - if (IPCOP_MASK (third) & - (IPCOP_MASK (IPC_INFO) | IPCOP_MASK (SEM_INFO) | IPCOP_MASK (GETVAL) | - IPCOP_MASK (GETPID) | IPCOP_MASK (GETNCNT) | IPCOP_MASK (GETZCNT) | - IPCOP_MASK (GETALL) | IPCOP_MASK (SETALL) | IPCOP_MASK (IPC_RMID))) { - err = sys_semctl (first, second, third, fourth); - } else if (third & IPC_64) { - struct semid64_ds s; - struct semid64_ds32 *usp = (struct semid64_ds32 *)A(pad); - mm_segment_t old_fs; - int need_back_translation; - - if (third == (IPC_SET|IPC_64)) { - err = get_user (s.sem_perm.uid, &usp->sem_perm.uid); - err |= __get_user (s.sem_perm.gid, &usp->sem_perm.gid); - err |= __get_user (s.sem_perm.mode, &usp->sem_perm.mode); - if (err) - goto out; - fourth.__pad = &s; - } - 
need_back_translation = - (IPCOP_MASK (third) & - (IPCOP_MASK (SEM_STAT) | IPCOP_MASK (IPC_STAT))) != 0; - if (need_back_translation) - fourth.__pad = &s; - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_semctl (first, second, third, fourth); - set_fs (old_fs); - if (need_back_translation) { - int err2 = put_user (s.sem_perm.key, &usp->sem_perm.key); - err2 |= __put_user (high2lowuid(s.sem_perm.uid), &usp->sem_perm.uid); - err2 |= __put_user (high2lowgid(s.sem_perm.gid), &usp->sem_perm.gid); - err2 |= __put_user (high2lowuid(s.sem_perm.cuid), &usp->sem_perm.cuid); - err2 |= __put_user (high2lowgid(s.sem_perm.cgid), &usp->sem_perm.cgid); - err2 |= __put_user (s.sem_perm.mode, &usp->sem_perm.mode); - err2 |= __put_user (s.sem_perm.seq, &usp->sem_perm.seq); - err2 |= __put_user (s.sem_otime, &usp->sem_otime); - err2 |= __put_user (s.sem_ctime, &usp->sem_ctime); - err2 |= __put_user (s.sem_nsems, &usp->sem_nsems); - if (err2) err = -EFAULT; - } - } else { - struct semid_ds s; - struct semid_ds32 *usp = (struct semid_ds32 *)A(pad); - mm_segment_t old_fs; - int need_back_translation; - - if (third == IPC_SET) { - err = get_user (s.sem_perm.uid, &usp->sem_perm.uid); - err |= __get_user (s.sem_perm.gid, &usp->sem_perm.gid); - err |= __get_user (s.sem_perm.mode, &usp->sem_perm.mode); - if (err) - goto out; - fourth.__pad = &s; - } - need_back_translation = - (IPCOP_MASK (third) & - (IPCOP_MASK (SEM_STAT) | IPCOP_MASK (IPC_STAT))) != 0; - if (need_back_translation) - fourth.__pad = &s; - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_semctl (first, second, third, fourth); - set_fs (old_fs); - if (need_back_translation) { - int err2 = put_user (s.sem_perm.key, &usp->sem_perm.key); - err2 |= __put_user (high2lowuid(s.sem_perm.uid), &usp->sem_perm.uid); - err2 |= __put_user (high2lowgid(s.sem_perm.gid), &usp->sem_perm.gid); - err2 |= __put_user (high2lowuid(s.sem_perm.cuid), &usp->sem_perm.cuid); - err2 |= __put_user (high2lowgid(s.sem_perm.cgid), 
&usp->sem_perm.cgid); - err2 |= __put_user (s.sem_perm.mode, &usp->sem_perm.mode); - err2 |= __put_user (s.sem_perm.seq, &usp->sem_perm.seq); - err2 |= __put_user (s.sem_otime, &usp->sem_otime); - err2 |= __put_user (s.sem_ctime, &usp->sem_ctime); - err2 |= __put_user (s.sem_nsems, &usp->sem_nsems); - if (err2) err = -EFAULT; - } - } -out: - return err; -} - -static int do_sys32_msgsnd (int first, int second, int third, void *uptr) -{ - struct msgbuf *p = kmalloc (second + sizeof (struct msgbuf), GFP_USER); - struct msgbuf32 *up = (struct msgbuf32 *)uptr; - mm_segment_t old_fs; - int err; - - if (!p) - return -ENOMEM; - - err = -EINVAL; - if (second > MSGMAX || first < 0 || second < 0) - goto out; - - err = -EFAULT; - if (!uptr) - goto out; - if (get_user (p->mtype, &up->mtype) || - __copy_from_user (p->mtext, &up->mtext, second)) - goto out; - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_msgsnd (first, p, second, third); - set_fs (old_fs); -out: - kfree (p); - return err; -} - -static int do_sys32_msgrcv (int first, int second, int msgtyp, int third, - int version, void *uptr) -{ - struct msgbuf32 *up; - struct msgbuf *p; - mm_segment_t old_fs; - int err; - - if (first < 0 || second < 0) - return -EINVAL; - - if (!version) { - struct ipc_kludge_32 *uipck = (struct ipc_kludge_32 *)uptr; - struct ipc_kludge_32 ipck; - - err = -EINVAL; - if (!uptr) - goto out; - err = -EFAULT; - if (copy_from_user (&ipck, uipck, sizeof (struct ipc_kludge_32))) - goto out; - uptr = (void *)A(ipck.msgp); - msgtyp = ipck.msgtyp; - } - err = -ENOMEM; - p = kmalloc (second + sizeof (struct msgbuf), GFP_USER); - if (!p) - goto out; - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_msgrcv (first, p, second, msgtyp, third); - set_fs (old_fs); - if (err < 0) - goto free_then_out; - up = (struct msgbuf32 *)uptr; - if (put_user (p->mtype, &up->mtype) || - __copy_to_user (&up->mtext, p->mtext, err)) - err = -EFAULT; -free_then_out: - kfree (p); -out: - return err; -} - -static int 
do_sys32_msgctl (int first, int second, void *uptr) -{ - int err; - - if (IPCOP_MASK (second) & - (IPCOP_MASK (IPC_INFO) | IPCOP_MASK (MSG_INFO) | - IPCOP_MASK (IPC_RMID))) { - err = sys_msgctl (first, second, (struct msqid_ds *)uptr); - } else if (second & IPC_64) { - struct msqid64_ds m; - struct msqid64_ds32 *up = (struct msqid64_ds32 *)uptr; - mm_segment_t old_fs; - - if (second == (IPC_SET|IPC_64)) { - err = get_user (m.msg_perm.uid, &up->msg_perm.uid); - err |= __get_user (m.msg_perm.gid, &up->msg_perm.gid); - err |= __get_user (m.msg_perm.mode, &up->msg_perm.mode); - err |= __get_user (m.msg_qbytes, &up->msg_qbytes); - if (err) - goto out; - } - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_msgctl (first, second, (struct msqid_ds *)&m); - set_fs (old_fs); - if (IPCOP_MASK (second) & - (IPCOP_MASK (MSG_STAT) | IPCOP_MASK (IPC_STAT))) { - int err2 = put_user (m.msg_perm.key, &up->msg_perm.key); - err2 |= __put_user (high2lowuid(m.msg_perm.uid), &up->msg_perm.uid); - err2 |= __put_user (high2lowgid(m.msg_perm.gid), &up->msg_perm.gid); - err2 |= __put_user (high2lowuid(m.msg_perm.cuid), &up->msg_perm.cuid); - err2 |= __put_user (high2lowgid(m.msg_perm.cgid), &up->msg_perm.cgid); - err2 |= __put_user (m.msg_perm.mode, &up->msg_perm.mode); - err2 |= __put_user (m.msg_perm.seq, &up->msg_perm.seq); - err2 |= __put_user (m.msg_stime, &up->msg_stime); - err2 |= __put_user (m.msg_rtime, &up->msg_rtime); - err2 |= __put_user (m.msg_ctime, &up->msg_ctime); - err2 |= __put_user (m.msg_cbytes, &up->msg_cbytes); - err2 |= __put_user (m.msg_qnum, &up->msg_qnum); - err2 |= __put_user (m.msg_qbytes, &up->msg_qbytes); - err2 |= __put_user (m.msg_lspid, &up->msg_lspid); - err2 |= __put_user (m.msg_lrpid, &up->msg_lrpid); - if (err2) - err = -EFAULT; - } - } else { - struct msqid_ds m; - struct msqid_ds32 *up = (struct msqid_ds32 *)uptr; - mm_segment_t old_fs; - - if (second == IPC_SET) { - err = get_user (m.msg_perm.uid, &up->msg_perm.uid); - err |= __get_user 
(m.msg_perm.gid, &up->msg_perm.gid); - err |= __get_user (m.msg_perm.mode, &up->msg_perm.mode); - err |= __get_user (m.msg_qbytes, &up->msg_qbytes); - if (err) - goto out; - } - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_msgctl (first, second, &m); - set_fs (old_fs); - if (IPCOP_MASK (second) & - (IPCOP_MASK (MSG_STAT) | IPCOP_MASK (IPC_STAT))) { - int err2 = put_user (m.msg_perm.key, &up->msg_perm.key); - err2 |= __put_user (high2lowuid(m.msg_perm.uid), &up->msg_perm.uid); - err2 |= __put_user (high2lowgid(m.msg_perm.gid), &up->msg_perm.gid); - err2 |= __put_user (high2lowuid(m.msg_perm.cuid), &up->msg_perm.cuid); - err2 |= __put_user (high2lowgid(m.msg_perm.cgid), &up->msg_perm.cgid); - err2 |= __put_user (m.msg_perm.mode, &up->msg_perm.mode); - err2 |= __put_user (m.msg_perm.seq, &up->msg_perm.seq); - err2 |= __put_user (m.msg_stime, &up->msg_stime); - err2 |= __put_user (m.msg_rtime, &up->msg_rtime); - err2 |= __put_user (m.msg_ctime, &up->msg_ctime); - err2 |= __put_user (m.msg_cbytes, &up->msg_cbytes); - err2 |= __put_user (m.msg_qnum, &up->msg_qnum); - err2 |= __put_user (m.msg_qbytes, &up->msg_qbytes); - err2 |= __put_user (m.msg_lspid, &up->msg_lspid); - err2 |= __put_user (m.msg_lrpid, &up->msg_lrpid); - if (err2) - err = -EFAULT; - } - } - -out: - return err; -} - -static int do_sys32_shmat (int first, int second, int third, int version, void *uptr) -{ - unsigned long raddr; - u32 *uaddr = (u32 *)A((u32)third); - int err = -EINVAL; - - if (version == 1) - goto out; - err = do_shmat (first, uptr, second, &raddr); - if (err) - goto out; - err = put_user (raddr, uaddr); -out: - return err; -} - -static int do_sys32_shmctl (int first, int second, void *uptr) -{ - int err; - - if (IPCOP_MASK (second) & - (IPCOP_MASK (IPC_INFO) | IPCOP_MASK (SHM_LOCK) | IPCOP_MASK (SHM_UNLOCK) | - IPCOP_MASK (IPC_RMID))) { - if (second == (IPC_INFO|IPC_64)) - second = IPC_INFO; /* So that we don't have to translate it */ - err = sys_shmctl (first, second, (struct 
shmid_ds *)uptr); - } else if ((second & IPC_64) && second != (SHM_INFO|IPC_64)) { - struct shmid64_ds s; - struct shmid64_ds32 *up = (struct shmid64_ds32 *)uptr; - mm_segment_t old_fs; - - if (second == (IPC_SET|IPC_64)) { - err = get_user (s.shm_perm.uid, &up->shm_perm.uid); - err |= __get_user (s.shm_perm.gid, &up->shm_perm.gid); - err |= __get_user (s.shm_perm.mode, &up->shm_perm.mode); - if (err) - goto out; - } - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_shmctl (first, second, (struct shmid_ds *)&s); - set_fs (old_fs); - if (err < 0) - goto out; - - /* Mask it even in this case so it becomes a CSE. */ - if (IPCOP_MASK (second) & - (IPCOP_MASK (SHM_STAT) | IPCOP_MASK (IPC_STAT))) { - int err2 = put_user (s.shm_perm.key, &up->shm_perm.key); - err2 |= __put_user (high2lowuid(s.shm_perm.uid), &up->shm_perm.uid); - err2 |= __put_user (high2lowgid(s.shm_perm.gid), &up->shm_perm.gid); - err2 |= __put_user (high2lowuid(s.shm_perm.cuid), &up->shm_perm.cuid); - err2 |= __put_user (high2lowgid(s.shm_perm.cgid), &up->shm_perm.cgid); - err2 |= __put_user (s.shm_perm.mode, &up->shm_perm.mode); - err2 |= __put_user (s.shm_perm.seq, &up->shm_perm.seq); - err2 |= __put_user (s.shm_atime, &up->shm_atime); - err2 |= __put_user (s.shm_dtime, &up->shm_dtime); - err2 |= __put_user (s.shm_ctime, &up->shm_ctime); - err2 |= __put_user (s.shm_segsz, &up->shm_segsz); - err2 |= __put_user (s.shm_nattch, &up->shm_nattch); - err2 |= __put_user (s.shm_cpid, &up->shm_cpid); - err2 |= __put_user (s.shm_lpid, &up->shm_lpid); - if (err2) - err = -EFAULT; - } - } else { - struct shmid_ds s; - struct shmid_ds32 *up = (struct shmid_ds32 *)uptr; - mm_segment_t old_fs; - - second &= ~IPC_64; - if (second == IPC_SET) { - err = get_user (s.shm_perm.uid, &up->shm_perm.uid); - err |= __get_user (s.shm_perm.gid, &up->shm_perm.gid); - err |= __get_user (s.shm_perm.mode, &up->shm_perm.mode); - if (err) - goto out; - } - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_shmctl (first, 
second, &s); - set_fs (old_fs); - if (err < 0) - goto out; - - /* Mask it even in this case so it becomes a CSE. */ - if (second == SHM_INFO) { - struct shm_info32 { - int used_ids; - u32 shm_tot, shm_rss, shm_swp; - u32 swap_attempts, swap_successes; - } *uip = (struct shm_info32 *)uptr; - struct shm_info *kp = (struct shm_info *)&s; - int err2 = put_user (kp->used_ids, &uip->used_ids); - err2 |= __put_user (kp->shm_tot, &uip->shm_tot); - err2 |= __put_user (kp->shm_rss, &uip->shm_rss); - err2 |= __put_user (kp->shm_swp, &uip->shm_swp); - err2 |= __put_user (kp->swap_attempts, &uip->swap_attempts); - err2 |= __put_user (kp->swap_successes, &uip->swap_successes); - if (err2) - err = -EFAULT; - } else if (IPCOP_MASK (second) & - (IPCOP_MASK (SHM_STAT) | IPCOP_MASK (IPC_STAT))) { - int err2 = put_user (s.shm_perm.key, &up->shm_perm.key); - err2 |= __put_user (high2lowuid(s.shm_perm.uid), &up->shm_perm.uid); - err2 |= __put_user (high2lowgid(s.shm_perm.gid), &up->shm_perm.gid); - err2 |= __put_user (high2lowuid(s.shm_perm.cuid), &up->shm_perm.cuid); - err2 |= __put_user (high2lowgid(s.shm_perm.cgid), &up->shm_perm.cgid); - err2 |= __put_user (s.shm_perm.mode, &up->shm_perm.mode); - err2 |= __put_user (s.shm_perm.seq, &up->shm_perm.seq); - err2 |= __put_user (s.shm_atime, &up->shm_atime); - err2 |= __put_user (s.shm_dtime, &up->shm_dtime); - err2 |= __put_user (s.shm_ctime, &up->shm_ctime); - err2 |= __put_user (s.shm_segsz, &up->shm_segsz); - err2 |= __put_user (s.shm_nattch, &up->shm_nattch); - err2 |= __put_user (s.shm_cpid, &up->shm_cpid); - err2 |= __put_user (s.shm_lpid, &up->shm_lpid); - if (err2) - err = -EFAULT; - } - } -out: - return err; -} - /* * sys32_ipc() is the de-multiplexer for the SysV IPC calls in 32bit emulation. 
* @@ -835,84 +300,64 @@ out: */ asmlinkage int sys32_ipc (u32 call, int first, int second, int third, u32 ptr) { - int version, err; + if(call >> 16) /* hack for backward compatibility */ + return -EINVAL; - version = call >> 16; /* hack for backward compatibility */ call &= 0xffff; - if(version) - return -EINVAL; - if (call <= SEMTIMEDOP) switch (call) { case SEMTIMEDOP: - if (third) { - err = do_sys32_semtimedop(first, - (struct sembuf *)AA(ptr), - second, - (struct compat_timespec *) - AA((u32)third)); - goto out; - } + if (third) + return compat_sys_semtimedop(first, + compat_ptr(ptr), second, + compat_ptr(third)); /* else fall through for normal semop() */ case SEMOP: /* struct sembuf is the same on 32 and 64bit :)) */ - err = sys_semtimedop (first, (struct sembuf *)AA(ptr), + return sys_semtimedop (first, compat_ptr(ptr), second, NULL); - goto out; case SEMGET: - err = sys_semget (first, second, third); - goto out; + return sys_semget (first, second, third); case SEMCTL: - err = do_sys32_semctl (first, second, third, (void *)AA(ptr)); - goto out; + return compat_sys_semctl (first, second, third, + compat_ptr(ptr)); default: - err = -EINVAL; - goto out; + return -EINVAL; }; if (call <= MSGCTL) switch (call) { case MSGSND: - err = do_sys32_msgsnd (first, second, third, (void *)AA(ptr)); - goto out; + return compat_sys_msgsnd (first, second, third, + compat_ptr(ptr)); case MSGRCV: - err = do_sys32_msgrcv (first, second, 0, third, - version, (void *)AA(ptr)); - goto out; + return compat_sys_msgrcv (first, second, 0, third, + 0, compat_ptr(ptr)); case MSGGET: - err = sys_msgget ((key_t) first, second); - goto out; + return sys_msgget ((key_t) first, second); case MSGCTL: - err = do_sys32_msgctl (first, second, (void *)AA(ptr)); - goto out; + return compat_sys_msgctl (first, second, + compat_ptr(ptr)); default: - err = -EINVAL; - goto out; + return -EINVAL; } if (call <= SHMCTL) switch (call) { case SHMAT: - err = do_sys32_shmat (first, second, third, - version, 
(void *)AA(ptr)); - goto out; + return compat_sys_shmat (first, second, third, + 0, compat_ptr(ptr)); case SHMDT: - err = sys_shmdt ((char *)AA(ptr)); - goto out; + return sys_shmdt(compat_ptr(ptr)); case SHMGET: - err = sys_shmget (first, second, third); - goto out; + return sys_shmget(first, second, third); case SHMCTL: - err = do_sys32_shmctl (first, second, (void *)AA(ptr)); - goto out; + return compat_sys_shmctl(first, second, + compat_ptr(ptr)); default: - err = -EINVAL; - goto out; + return -EINVAL; } - err = -EINVAL; - -out: - return err; + return -EINVAL; } asmlinkage int sys32_truncate64(const char * path, unsigned long high, unsigned long low) --- linux-2.6.4-rc1/arch/s390/kernel/syscalls.S 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/s390/kernel/syscalls.S 2004-02-29 13:08:05.000000000 -0800 @@ -109,7 +109,7 @@ SYSCALL(sys_setpriority,sys_setpriority, NI_SYSCALL /* old profil syscall */ SYSCALL(sys_statfs,sys_statfs,compat_sys_statfs_wrapper) SYSCALL(sys_fstatfs,sys_fstatfs,compat_sys_fstatfs_wrapper) /* 100 */ -SYSCALL(sys_ioperm,sys_ni_syscall,sys_ni_syscall) +NI_SYSCALL /* ioperm for i386 */ SYSCALL(sys_socketcall,sys_socketcall,compat_sys_socketcall_wrapper) SYSCALL(sys_syslog,sys_syslog,sys32_syslog_wrapper) SYSCALL(sys_setitimer,sys_setitimer,compat_sys_setitimer_wrapper) --- linux-2.6.4-rc1/arch/s390/kernel/sys_s390.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/s390/kernel/sys_s390.c 2004-02-29 13:08:05.000000000 -0800 @@ -289,11 +289,6 @@ asmlinkage int sys_olduname(struct oldol return error; } -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on) -{ - return -ENOSYS; -} - #else /* CONFIG_ARCH_S390X */ asmlinkage int s390x_newuname(struct new_utsname * name) --- linux-2.6.4-rc1/arch/sh/boards/adx/Makefile 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/sh/boards/adx/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for ADX boards # -# Note! 
Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := setup.o irq.o irq_maskreq.o --- linux-2.6.4-rc1/arch/sh/boards/bigsur/Makefile 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/sh/boards/bigsur/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the BigSur specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := setup.o io.o irq.o led.o --- linux-2.6.4-rc1/arch/sh/boards/cat68701/Makefile 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/sh/boards/cat68701/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the CAT-68701 specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := setup.o irq.o --- linux-2.6.4-rc1/arch/sh/boards/cqreek/Makefile 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/sh/boards/cqreek/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the CqREEK specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := setup.o irq.o --- linux-2.6.4-rc1/arch/sh/boards/dmida/Makefile 2003-06-14 12:17:57.000000000 -0700 +++ 25/arch/sh/boards/dmida/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -2,10 +2,6 @@ # Makefile for the DataMyte Industrial Digital Assistant(tm) specific parts # of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. 
DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := mach.o --- linux-2.6.4-rc1/arch/sh/boards/dreamcast/Makefile 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/sh/boards/dreamcast/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the Sega Dreamcast specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := setup.o irq.o rtc.o --- linux-2.6.4-rc1/arch/sh/boards/ec3104/Makefile 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/sh/boards/ec3104/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the EC3104 specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := setup.o io.o irq.o --- linux-2.6.4-rc1/arch/sh/boards/harp/Makefile 2003-06-14 12:17:58.000000000 -0700 +++ 25/arch/sh/boards/harp/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for STMicroelectronics board specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := irq.o setup.o mach.o led.o --- linux-2.6.4-rc1/arch/sh/boards/hp6xx/hp620/Makefile 2003-06-14 12:18:35.000000000 -0700 +++ 25/arch/sh/boards/hp6xx/hp620/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the HP620 specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). 
-# obj-y := mach.o --- linux-2.6.4-rc1/arch/sh/boards/hp6xx/hp680/Makefile 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/sh/boards/hp6xx/hp680/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the HP680 specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := mach.o setup.o --- linux-2.6.4-rc1/arch/sh/boards/hp6xx/hp690/Makefile 2003-06-14 12:17:55.000000000 -0700 +++ 25/arch/sh/boards/hp6xx/hp690/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the HP690 specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := mach.o --- linux-2.6.4-rc1/arch/sh/boards/mpc1211/Makefile 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/sh/boards/mpc1211/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the Interface (CTP/PCI/MPC-SH02) specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := setup.o rtc.o led.o --- linux-2.6.4-rc1/arch/sh/boards/overdrive/Makefile 2003-06-14 12:18:33.000000000 -0700 +++ 25/arch/sh/boards/overdrive/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the STMicroelectronics Overdrive specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). 
-# obj-y := mach.o setup.o io.o irq.o led.o time.o --- linux-2.6.4-rc1/arch/sh/boards/saturn/Makefile 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/sh/boards/saturn/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the Sega Saturn specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := setup.o io.o irq.o --- linux-2.6.4-rc1/arch/sh/boards/se/770x/Makefile 2003-06-14 12:18:29.000000000 -0700 +++ 25/arch/sh/boards/se/770x/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the 770x SolutionEngine specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := mach.o setup.o io.o irq.o led.o --- linux-2.6.4-rc1/arch/sh/boards/se/7751/Makefile 2003-06-14 12:18:34.000000000 -0700 +++ 25/arch/sh/boards/se/7751/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the 7751 SolutionEngine specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := mach.o setup.o io.o irq.o led.o --- linux-2.6.4-rc1/arch/sh/boards/sh2000/Makefile 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/sh/boards/sh2000/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the SH2000 specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). 
-# obj-y := setup.o --- linux-2.6.4-rc1/arch/sh/boards/snapgear/Makefile 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/sh/boards/snapgear/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the SnapGear specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := setup.o io.o rtc.o --- linux-2.6.4-rc1/arch/sh/boards/systemh/Makefile 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/sh/boards/systemh/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the SystemH specific parts of the kernel # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := setup.o irq.o io.o --- linux-2.6.4-rc1/arch/sh/boards/unknown/Makefile 2003-06-14 12:18:29.000000000 -0700 +++ 25/arch/sh/boards/unknown/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for unknown SH boards # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := mach.o io.o setup.o --- linux-2.6.4-rc1/arch/sh/cchips/hd6446x/hd64461/Makefile 2003-06-14 12:18:25.000000000 -0700 +++ 25/arch/sh/cchips/hd6446x/hd64461/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the HD64461 # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). 
-# obj-y := setup.o io.o --- linux-2.6.4-rc1/arch/sh/cchips/hd6446x/hd64465/Makefile 2003-06-14 12:18:33.000000000 -0700 +++ 25/arch/sh/cchips/hd6446x/hd64465/Makefile 2004-02-29 13:09:12.000000000 -0800 @@ -1,10 +1,6 @@ # # Makefile for the HD64465 # -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# obj-y := setup.o io.o gpio.o --- linux-2.6.4-rc1/arch/sparc64/Kconfig 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/sparc64/Kconfig 2004-02-29 13:09:27.000000000 -0800 @@ -700,12 +700,19 @@ config DEBUG_BOOTMEM depends on DEBUG_KERNEL bool "Debug BOOTMEM initialization" +config LOCKMETER + bool "Kernel lock metering" + depends on SMP && !PREEMPT + help + Say Y to enable kernel lock metering, which adds overhead to SMP locks, + but allows you to see various statistics using the lockstat command. + # We have a custom atomic_dec_and_lock() implementation but it's not # compatible with spinlock debugging so we need to fall back on # the generic version in that case. config HAVE_DEC_LOCK bool - depends on SMP && !DEBUG_SPINLOCK + depends on SMP && !DEBUG_SPINLOCK && !LOCKMETER default y config MCOUNT --- linux-2.6.4-rc1/arch/sparc64/kernel/pci_iommu.c 2003-08-08 22:55:11.000000000 -0700 +++ 25/arch/sparc64/kernel/pci_iommu.c 2004-02-29 13:07:52.000000000 -0800 @@ -661,7 +661,7 @@ void pci_unmap_sg(struct pci_dev *pdev, /* Make physical memory consistent for a single * streaming mode DMA translation after a transfer. */ -void pci_dma_sync_single(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int direction) +void pci_dma_sync_single_for_cpu(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int direction) { struct pcidev_cookie *pcp; struct pci_iommu *iommu; @@ -722,7 +722,7 @@ void pci_dma_sync_single(struct pci_dev /* Make physical memory consistent for a set of streaming * mode DMA translations after a transfer. 
*/ -void pci_dma_sync_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction) +void pci_dma_sync_sg_for_cpu(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction) { struct pcidev_cookie *pcp; struct pci_iommu *iommu; --- linux-2.6.4-rc1/arch/sparc64/kernel/process.c 2003-10-08 15:07:08.000000000 -0700 +++ 25/arch/sparc64/kernel/process.c 2004-02-29 13:09:04.000000000 -0800 @@ -10,7 +10,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#define __KERNEL_SYSCALLS__ #include #include @@ -22,7 +21,6 @@ #include #include #include -#include #include #include #include --- linux-2.6.4-rc1/arch/sparc64/kernel/sbus.c 2003-06-14 12:18:51.000000000 -0700 +++ 25/arch/sparc64/kernel/sbus.c 2004-02-29 13:07:52.000000000 -0800 @@ -540,7 +540,7 @@ void sbus_unmap_sg(struct sbus_dev *sdev spin_unlock_irqrestore(&iommu->lock, flags); } -void sbus_dma_sync_single(struct sbus_dev *sdev, dma_addr_t base, size_t size, int direction) +void sbus_dma_sync_single_for_cpu(struct sbus_dev *sdev, dma_addr_t base, size_t size, int direction) { struct sbus_iommu *iommu = sdev->bus->iommu; unsigned long flags; @@ -552,7 +552,11 @@ void sbus_dma_sync_single(struct sbus_de spin_unlock_irqrestore(&iommu->lock, flags); } -void sbus_dma_sync_sg(struct sbus_dev *sdev, struct scatterlist *sg, int nents, int direction) +void sbus_dma_sync_single_for_device(struct sbus_dev *sdev, dma_addr_t base, size_t size, int direction) +{ +} + +void sbus_dma_sync_sg_for_cpu(struct sbus_dev *sdev, struct scatterlist *sg, int nents, int direction) { struct sbus_iommu *iommu = sdev->bus->iommu; unsigned long flags, size; @@ -572,6 +576,10 @@ void sbus_dma_sync_sg(struct sbus_dev *s spin_unlock_irqrestore(&iommu->lock, flags); } +void sbus_dma_sync_sg_for_device(struct sbus_dev *sdev, struct scatterlist *sg, int nents, int direction) +{ +} + /* Enable 64-bit DVMA mode for the given device. 
*/ void sbus_set_sbus64(struct sbus_dev *sdev, int bursts) { --- linux-2.6.4-rc1/arch/sparc64/kernel/setup.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/sparc64/kernel/setup.c 2004-02-29 13:08:05.000000000 -0800 @@ -603,11 +603,6 @@ static int __init set_preferred_console( } console_initcall(set_preferred_console); -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on) -{ - return -EIO; -} - /* BUFFER is PAGE_SIZE bytes long. */ extern char *sparc_cpu_type; --- linux-2.6.4-rc1/arch/sparc64/kernel/smp.c 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/sparc64/kernel/smp.c 2004-02-29 13:09:04.000000000 -0800 @@ -36,9 +36,6 @@ #include #include -#define __KERNEL_SYSCALLS__ -#include - extern int linux_num_cpus; extern void calibrate_delay(void); @@ -46,7 +43,6 @@ extern void calibrate_delay(void); static unsigned char boot_cpu_id; cpumask_t cpu_online_map = CPU_MASK_NONE; -atomic_t sparc64_num_cpus_possible = ATOMIC_INIT(0); cpumask_t phys_cpu_present_map = CPU_MASK_NONE; static cpumask_t smp_commenced_mask; static cpumask_t cpu_callout_map; @@ -1236,20 +1232,17 @@ void __init smp_prepare_cpus(unsigned in instance = 0; while (!cpu_find_by_instance(instance, NULL, &mid)) { - if (mid < max_cpus) { + if (mid < max_cpus) cpu_set(mid, phys_cpu_present_map); - atomic_inc(&sparc64_num_cpus_possible); - } instance++; } - if (atomic_read(&sparc64_num_cpus_possible) > max_cpus) { + if (num_possible_cpus() > max_cpus) { instance = 0; while (!cpu_find_by_instance(instance, NULL, &mid)) { if (mid != boot_cpu_id) { cpu_clear(mid, phys_cpu_present_map); - atomic_dec(&sparc64_num_cpus_possible); - if (atomic_read(&sparc64_num_cpus_possible) <= max_cpus) + if (num_possible_cpus() <= max_cpus) break; } instance++; --- linux-2.6.4-rc1/arch/sparc64/kernel/sparc64_ksyms.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/sparc64/kernel/sparc64_ksyms.c 2004-02-29 13:08:24.000000000 -0800 @@ -145,7 +145,6 @@ EXPORT_SYMBOL_NOVERS(mcount); /* CPU online map and active 
count. */ EXPORT_SYMBOL(cpu_online_map); EXPORT_SYMBOL(phys_cpu_present_map); -EXPORT_SYMBOL(sparc64_num_cpus_possible); /* Spinlock debugging library, optional. */ #ifdef CONFIG_DEBUG_SPINLOCK @@ -214,8 +213,8 @@ EXPORT_SYMBOL(sbus_map_single); EXPORT_SYMBOL(sbus_unmap_single); EXPORT_SYMBOL(sbus_map_sg); EXPORT_SYMBOL(sbus_unmap_sg); -EXPORT_SYMBOL(sbus_dma_sync_single); -EXPORT_SYMBOL(sbus_dma_sync_sg); +EXPORT_SYMBOL(sbus_dma_sync_single_for_cpu); +EXPORT_SYMBOL(sbus_dma_sync_sg_for_cpu); #endif EXPORT_SYMBOL(outsb); EXPORT_SYMBOL(outsw); @@ -233,8 +232,8 @@ EXPORT_SYMBOL(pci_map_single); EXPORT_SYMBOL(pci_unmap_single); EXPORT_SYMBOL(pci_map_sg); EXPORT_SYMBOL(pci_unmap_sg); -EXPORT_SYMBOL(pci_dma_sync_single); -EXPORT_SYMBOL(pci_dma_sync_sg); +EXPORT_SYMBOL(pci_dma_sync_single_for_cpu); +EXPORT_SYMBOL(pci_dma_sync_sg_for_cpu); EXPORT_SYMBOL(pci_dma_supported); #endif --- linux-2.6.4-rc1/arch/sparc64/kernel/sys_sparc32.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/sparc64/kernel/sys_sparc32.c 2004-02-29 13:08:05.000000000 -0800 @@ -282,11 +282,6 @@ static inline long put_tv32(struct compa __put_user(i->tv_usec, &o->tv_usec))); } -asmlinkage long sys32_ioperm(u32 from, u32 num, int on) -{ - return sys_ioperm((unsigned long)from, (unsigned long)num, on); -} - struct msgbuf32 { s32 mtype; char mtext[1]; }; struct ipc_perm32 --- linux-2.6.4-rc1/arch/sparc64/lib/rwlock.S 2003-11-23 19:03:00.000000000 -0800 +++ 25/arch/sparc64/lib/rwlock.S 2004-02-29 13:09:27.000000000 -0800 @@ -85,5 +85,20 @@ __write_trylock_succeed: __write_trylock_fail: retl mov 0, %o0 + + .globl __read_trylock +__read_trylock: /* %o0 = lock_ptr */ + ldsw [%o0], %g5 + brlz,pn %g5, 100f + add %g5, 1, %g7 + cas [%o0], %g5, %g7 + cmp %g5, %g7 + bne,pn %icc, __read_trylock + membar #StoreLoad | #StoreStore + retl + mov 1, %o0 +100: retl + mov 0, %o0 + rwlock_impl_end: --- linux-2.6.4-rc1/arch/sparc/kernel/ioport.c 2003-09-08 13:58:56.000000000 -0700 +++ 25/arch/sparc/kernel/ioport.c 2004-02-29 
13:07:52.000000000 -0800 @@ -360,7 +360,7 @@ void sbus_unmap_sg(struct sbus_dev *sdev /* */ -void sbus_dma_sync_single(struct sbus_dev *sdev, dma_addr_t ba, size_t size, int direction) +void sbus_dma_sync_single_for_cpu(struct sbus_dev *sdev, dma_addr_t ba, size_t size, int direction) { #if 0 unsigned long va; @@ -380,9 +380,34 @@ void sbus_dma_sync_single(struct sbus_de #endif } -void sbus_dma_sync_sg(struct sbus_dev *sdev, struct scatterlist *sg, int n, int direction) +void sbus_dma_sync_single_for_device(struct sbus_dev *sdev, dma_addr_t ba, size_t size, int direction) { - printk("sbus_dma_sync_sg: not implemented yet\n"); +#if 0 + unsigned long va; + struct resource *res; + + /* We do not need the resource, just print a message if invalid. */ + res = _sparc_find_resource(&_sparc_dvma, ba); + if (res == NULL) + panic("sbus_dma_sync_single: 0x%x\n", ba); + + va = page_address(mmu_translate_dvma(ba)); /* XXX higmem */ + /* + * XXX This bogosity will be fixed with the iommu rewrite coming soon + * to a kernel near you. - Anton + */ + /* mmu_inval_dma_area(va, (size + PAGE_SIZE-1) & PAGE_MASK); */ +#endif +} + +void sbus_dma_sync_sg_for_cpu(struct sbus_dev *sdev, struct scatterlist *sg, int n, int direction) +{ + printk("sbus_dma_sync_sg_for_cpu: not implemented yet\n"); +} + +void sbus_dma_sync_sg_for_device(struct sbus_dev *sdev, struct scatterlist *sg, int n, int direction) +{ + printk("sbus_dma_sync_sg_for_device: not implemented yet\n"); } #endif /* CONFIG_SBUS */ @@ -482,7 +507,7 @@ void pci_free_consistent(struct pci_dev * The 32-bit bus address to use is returned. * * Once the device is given the dma address, the device owns this memory - * until either pci_unmap_single or pci_dma_sync_single is performed. + * until either pci_unmap_single or pci_dma_sync_single_* is performed. 
*/ dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, size_t size, int direction) @@ -591,10 +616,21 @@ void pci_unmap_sg(struct pci_dev *hwdev, * If you perform a pci_map_single() but wish to interrogate the * buffer using the cpu, yet do not wish to teardown the PCI dma * mapping, you must call this function before doing so. At the - * next point you give the PCI dma address back to the card, the + * next point you give the PCI dma address back to the card, you + * must first perform a pci_dma_sync_single_for_device, and then the * device again owns the buffer. */ -void pci_dma_sync_single(struct pci_dev *hwdev, dma_addr_t ba, size_t size, int direction) +void pci_dma_sync_single_for_cpu(struct pci_dev *hwdev, dma_addr_t ba, size_t size, int direction) +{ + if (direction == PCI_DMA_NONE) + BUG(); + if (direction != PCI_DMA_TODEVICE) { + mmu_inval_dma_area((unsigned long)phys_to_virt(ba), + (size + PAGE_SIZE-1) & PAGE_MASK); + } +} + +void pci_dma_sync_single_for_device(struct pci_dev *hwdev, dma_addr_t ba, size_t size, int direction) { if (direction == PCI_DMA_NONE) BUG(); @@ -607,10 +643,27 @@ void pci_dma_sync_single(struct pci_dev /* Make physical memory consistent for a set of streaming * mode DMA translations after a transfer. * - * The same as pci_dma_sync_single but for a scatter-gather list, + * The same as pci_dma_sync_single_* but for a scatter-gather list, * same rules and usage. 
*/ -void pci_dma_sync_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction) +void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction) +{ + int n; + + if (direction == PCI_DMA_NONE) + BUG(); + if (direction != PCI_DMA_TODEVICE) { + for (n = 0; n < nents; n++) { + if (page_address(sg->page) == NULL) BUG(); + mmu_inval_dma_area( + (unsigned long) page_address(sg->page), + (sg->length + PAGE_SIZE-1) & PAGE_MASK); + sg++; + } + } +} + +void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction) { int n; --- linux-2.6.4-rc1/arch/sparc/kernel/process.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/sparc/kernel/process.c 2004-02-29 13:09:04.000000000 -0800 @@ -9,7 +9,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#define __KERNEL_SYSCALLS__ #include #include @@ -19,7 +18,6 @@ #include #include #include -#include #include #include #include --- linux-2.6.4-rc1/arch/sparc/kernel/setup.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/sparc/kernel/setup.c 2004-02-29 13:08:05.000000000 -0800 @@ -390,11 +390,6 @@ static int __init set_preferred_console( } console_initcall(set_preferred_console); -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on) -{ - return -EIO; -} - extern char *sparc_cpu_type[]; extern char *sparc_fpu_type[]; --- linux-2.6.4-rc1/arch/sparc/kernel/smp.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/sparc/kernel/smp.c 2004-02-29 13:09:04.000000000 -0800 @@ -33,9 +33,6 @@ #include #include -#define __KERNEL_SYSCALLS__ -#include - #define IRQ_RESCHEDULE 13 #define IRQ_STOP_CPU 14 #define IRQ_CROSS_CALL 15 --- linux-2.6.4-rc1/arch/sparc/kernel/sparc_ksyms.c 2004-02-27 16:17:19.000000000 -0800 +++ 25/arch/sparc/kernel/sparc_ksyms.c 2004-02-29 13:07:52.000000000 -0800 @@ -206,8 +206,10 @@ EXPORT_SYMBOL(sbus_map_single); EXPORT_SYMBOL(sbus_unmap_single); EXPORT_SYMBOL(sbus_map_sg); 
EXPORT_SYMBOL(sbus_unmap_sg); -EXPORT_SYMBOL(sbus_dma_sync_single); -EXPORT_SYMBOL(sbus_dma_sync_sg); +EXPORT_SYMBOL(sbus_dma_sync_single_for_cpu); +EXPORT_SYMBOL(sbus_dma_sync_single_for_device); +EXPORT_SYMBOL(sbus_dma_sync_sg_for_cpu); +EXPORT_SYMBOL(sbus_dma_sync_sg_for_device); EXPORT_SYMBOL(sbus_iounmap); EXPORT_SYMBOL(sbus_ioremap); #endif @@ -219,7 +221,10 @@ EXPORT_SYMBOL(pci_alloc_consistent); EXPORT_SYMBOL(pci_free_consistent); EXPORT_SYMBOL(pci_map_single); EXPORT_SYMBOL(pci_unmap_single); -EXPORT_SYMBOL(pci_dma_sync_single); +EXPORT_SYMBOL(pci_dma_sync_single_for_cpu); +EXPORT_SYMBOL(pci_dma_sync_single_for_device); +EXPORT_SYMBOL(pci_dma_sync_sg_for_cpu); +EXPORT_SYMBOL(pci_dma_sync_sg_for_device); /* Actually, ioremap/iounmap are not PCI specific. But it is ok for drivers. */ EXPORT_SYMBOL(ioremap); EXPORT_SYMBOL(iounmap); --- linux-2.6.4-rc1/arch/sparc/kernel/sun4d_smp.c 2003-09-27 18:57:44.000000000 -0700 +++ 25/arch/sparc/kernel/sun4d_smp.c 2004-02-29 13:09:04.000000000 -0800 @@ -32,9 +32,6 @@ #include #include -#define __KERNEL_SYSCALLS__ -#include - #define IRQ_CROSS_CALL 15 extern ctxd_t *srmmu_ctx_table_phys; --- linux-2.6.4-rc1/arch/sparc/kernel/sun4m_smp.c 2003-06-14 12:18:32.000000000 -0700 +++ 25/arch/sparc/kernel/sun4m_smp.c 2004-02-29 13:09:04.000000000 -0800 @@ -27,9 +27,6 @@ #include #include -#define __KERNEL_SYSCALLS__ -#include - #define IRQ_RESCHEDULE 13 #define IRQ_STOP_CPU 14 #define IRQ_CROSS_CALL 15 --- linux-2.6.4-rc1/arch/v850/kernel/rte_mb_a_pci.c 2003-08-08 22:55:11.000000000 -0700 +++ 25/arch/v850/kernel/rte_mb_a_pci.c 2004-02-29 13:07:52.000000000 -0800 @@ -687,10 +687,11 @@ void pci_unmap_single (struct pci_dev *p If you perform a pci_map_single() but wish to interrogate the buffer using the cpu, yet do not wish to teardown the PCI dma mapping, you must call this function before doing so. At the next - point you give the PCI dma address back to the card, the device - again owns the buffer. 
*/ + point you give the PCI dma address back to the card, you must first + perform a pci_dma_sync_single_for_device, and then the device again owns + the buffer. */ void -pci_dma_sync_single (struct pci_dev *pdev, dma_addr_t dma_addr, size_t size, +pci_dma_sync_single_for_cpu (struct pci_dev *pdev, dma_addr_t dma_addr, size_t size, int dir) { void *mb_sram_addr = PCI_TO_MB_SRAM (dma_addr); @@ -700,6 +701,22 @@ pci_dma_sync_single (struct pci_dev *pde if (dir == PCI_DMA_FROMDEVICE) memcpy (mapping->cpu_addr, mb_sram_addr, size); else if (dir == PCI_DMA_TODEVICE) + ; /* nothing to do */ + else + panic("pci_dma_sync_single: unsupported sync dir: %d", dir); +} + +void +pci_dma_sync_single_for_device (struct pci_dev *pdev, dma_addr_t dma_addr, size_t size, + int dir) +{ + void *mb_sram_addr = PCI_TO_MB_SRAM (dma_addr); + struct dma_mapping *mapping = find_dma_mapping (mb_sram_addr); + + /* Synchronize the DMA buffer with the CPU buffer if necessary. */ + if (dir == PCI_DMA_FROMDEVICE) + ; /* nothing to do */ + else if (dir == PCI_DMA_TODEVICE) memcpy (mb_sram_addr, mapping->cpu_addr, size); else panic("pci_dma_sync_single: unsupported sync dir: %d", dir); @@ -724,11 +741,18 @@ pci_unmap_sg (struct pci_dev *pdev, stru } /* Make physical memory consistent for a set of streaming mode DMA - translations after a transfer. The same as pci_dma_sync_single but + translations after a transfer. The same as pci_dma_sync_single_* but for a scatter-gather list, same rules and usage. 
*/ void -pci_dma_sync_sg (struct pci_dev *dev, struct scatterlist *sg, int sg_len, +pci_dma_sync_sg_for_cpu (struct pci_dev *dev, struct scatterlist *sg, int sg_len, + int dir) +{ + BUG (); +} + +void +pci_dma_sync_sg_for_device (struct pci_dev *dev, struct scatterlist *sg, int sg_len, int dir) { BUG (); @@ -770,4 +794,5 @@ EXPORT_SYMBOL (pci_map_single); EXPORT_SYMBOL (pci_unmap_single); EXPORT_SYMBOL (pci_alloc_consistent); EXPORT_SYMBOL (pci_free_consistent); -EXPORT_SYMBOL (pci_dma_sync_single); +EXPORT_SYMBOL (pci_dma_sync_single_for_cpu); +EXPORT_SYMBOL (pci_dma_sync_single_for_device); --- linux-2.6.4-rc1/arch/x86_64/defconfig 2004-02-27 16:17:20.000000000 -0800 +++ 25/arch/x86_64/defconfig 2004-02-29 13:09:19.000000000 -0800 @@ -27,6 +27,7 @@ CONFIG_SYSVIPC=y # CONFIG_BSD_PROCESS_ACCT is not set CONFIG_SYSCTL=y CONFIG_LOG_BUF_SHIFT=18 +# CONFIG_HOTPLUG is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y # CONFIG_EMBEDDED is not set @@ -106,6 +107,7 @@ CONFIG_ACPI_POWER=y CONFIG_ACPI_PCI=y CONFIG_ACPI_SYSTEM=y # CONFIG_ACPI_RELAXED_AML is not set +# CONFIG_X86_PM_TIMER is not set # # CPU Frequency scaling @@ -119,7 +121,6 @@ CONFIG_PCI=y CONFIG_PCI_DIRECT=y # CONFIG_PCI_LEGACY_PROC is not set # CONFIG_PCI_NAMES is not set -# CONFIG_HOTPLUG is not set # # Executable file formats / Emulations @@ -168,6 +169,7 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=4096 CONFIG_BLK_DEV_INITRD=y CONFIG_LBD=y +# CONFIG_DCSSBLK is not set # # ATA/ATAPI/MFM/RLL support @@ -306,7 +308,7 @@ CONFIG_FUSION_MAX_SGE=40 # CONFIG_FUSION_CTL is not set # -# IEEE 1394 (FireWire) support (EXPERIMENTAL) +# IEEE 1394 (FireWire) support # # CONFIG_IEEE1394 is not set @@ -489,7 +491,7 @@ CONFIG_TIGON3=y # # ISDN subsystem # -# CONFIG_ISDN_BOOL is not set +# CONFIG_ISDN is not set # # Telephony Support @@ -562,7 +564,8 @@ CONFIG_SERIAL_8250_NR_UARTS=4 CONFIG_SERIAL_CORE=y CONFIG_SERIAL_CORE_CONSOLE=y CONFIG_UNIX98_PTYS=y -CONFIG_UNIX98_PTY_COUNT=256 +CONFIG_LEGACY_PTYS=y 
+CONFIG_LEGACY_PTY_COUNT=256 # # Mice @@ -718,7 +721,6 @@ CONFIG_ISO9660_FS=y CONFIG_PROC_FS=y CONFIG_PROC_KCORE=y # CONFIG_DEVFS_FS is not set -CONFIG_DEVPTS_FS=y # CONFIG_DEVPTS_FS_XATTR is not set CONFIG_TMPFS=y CONFIG_HUGETLBFS=y @@ -731,6 +733,7 @@ CONFIG_RAMFS=y # CONFIG_ADFS_FS is not set # CONFIG_AFFS_FS is not set # CONFIG_HFS_FS is not set +# CONFIG_HFSPLUS_FS is not set # CONFIG_BEFS_FS is not set # CONFIG_BFS_FS is not set # CONFIG_EFS_FS is not set @@ -761,7 +764,6 @@ CONFIG_SUNRPC=y # CONFIG_CIFS is not set # CONFIG_NCP_FS is not set # CONFIG_CODA_FS is not set -# CONFIG_INTERMEZZO_FS is not set # CONFIG_AFS_FS is not set # --- linux-2.6.4-rc1/arch/x86_64/ia32/ipc32.c 2004-02-27 16:17:20.000000000 -0800 +++ 25/arch/x86_64/ia32/ipc32.c 2004-02-29 13:07:56.000000000 -0800 @@ -1,656 +1,19 @@ #include -#include -#include -#include +#include +#include #include +#include #include #include -#include #include -#include #include #include -#include -#include -#include -#include -#include - -#include - -/* - * sys32_ipc() is the de-multiplexer for the SysV IPC calls in 32bit emulation.. - * - * This is really horribly ugly. - */ - -struct msgbuf32 { - s32 mtype; - char mtext[1]; -}; - -struct ipc_perm32 { - int key; - compat_uid_t uid; - compat_gid_t gid; - compat_uid_t cuid; - compat_gid_t cgid; - unsigned short mode; - unsigned short seq; -}; - -struct ipc64_perm32 { - unsigned key; - compat_uid32_t uid; - compat_gid32_t gid; - compat_uid32_t cuid; - compat_gid32_t cgid; - unsigned short mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - unsigned int unused1; - unsigned int unused2; -}; - -struct semid_ds32 { - struct ipc_perm32 sem_perm; /* permissions .. 
see ipc.h */ - compat_time_t sem_otime; /* last semop time */ - compat_time_t sem_ctime; /* last change time */ - u32 sem_base; /* ptr to first semaphore in array */ - u32 sem_pending; /* pending operations to be processed */ - u32 sem_pending_last; /* last pending operation */ - u32 undo; /* undo requests on this array */ - unsigned short sem_nsems; /* no. of semaphores in array */ -}; - -struct semid64_ds32 { - struct ipc64_perm32 sem_perm; - compat_time_t sem_otime; - unsigned int __unused1; - compat_time_t sem_ctime; - unsigned int __unused2; - unsigned int sem_nsems; - unsigned int __unused3; - unsigned int __unused4; -}; - -struct msqid_ds32 { - struct ipc_perm32 msg_perm; - u32 msg_first; - u32 msg_last; - compat_time_t msg_stime; - compat_time_t msg_rtime; - compat_time_t msg_ctime; - u32 wwait; - u32 rwait; - unsigned short msg_cbytes; - unsigned short msg_qnum; - unsigned short msg_qbytes; - compat_ipc_pid_t msg_lspid; - compat_ipc_pid_t msg_lrpid; -}; - -struct msqid64_ds32 { - struct ipc64_perm32 msg_perm; - compat_time_t msg_stime; - unsigned int __unused1; - compat_time_t msg_rtime; - unsigned int __unused2; - compat_time_t msg_ctime; - unsigned int __unused3; - unsigned int msg_cbytes; - unsigned int msg_qnum; - unsigned int msg_qbytes; - compat_pid_t msg_lspid; - compat_pid_t msg_lrpid; - unsigned int __unused4; - unsigned int __unused5; -}; - -struct shmid_ds32 { - struct ipc_perm32 shm_perm; - int shm_segsz; - compat_time_t shm_atime; - compat_time_t shm_dtime; - compat_time_t shm_ctime; - compat_ipc_pid_t shm_cpid; - compat_ipc_pid_t shm_lpid; - unsigned short shm_nattch; -}; - -struct shmid64_ds32 { - struct ipc64_perm32 shm_perm; - compat_size_t shm_segsz; - compat_time_t shm_atime; - unsigned int __unused1; - compat_time_t shm_dtime; - unsigned int __unused2; - compat_time_t shm_ctime; - unsigned int __unused3; - compat_pid_t shm_cpid; - compat_pid_t shm_lpid; - unsigned int shm_nattch; - unsigned int __unused4; - unsigned int __unused5; -}; - 
-struct shminfo64_32 { - unsigned int shmmax; - unsigned int shmmin; - unsigned int shmmni; - unsigned int shmseg; - unsigned int shmall; - unsigned int __unused1; - unsigned int __unused2; - unsigned int __unused3; - unsigned int __unused4; -}; - -struct shm_info32 { - int used_ids; - u32 shm_tot, shm_rss, shm_swp; - u32 swap_attempts, swap_successes; -}; - -struct ipc_kludge { - u32 msgp; - s32 msgtyp; -}; - - -#define A(__x) ((unsigned long)(__x)) -#define AA(__x) ((unsigned long)(__x)) - -#define SEMOP 1 -#define SEMGET 2 -#define SEMCTL 3 -#define TIMEDSEMOP 4 -#define MSGSND 11 -#define MSGRCV 12 -#define MSGGET 13 -#define MSGCTL 14 -#define SHMAT 21 -#define SHMDT 22 -#define SHMGET 23 -#define SHMCTL 24 - -#define IPCOP_MASK(__x) (1UL << (__x)) - -static int -ipc_parse_version32 (int *cmd) -{ - if (*cmd & IPC_64) { - *cmd ^= IPC_64; - return IPC_64; - } else { - return IPC_OLD; - } -} - -static int put_semid(void *user_semid, struct semid64_ds *s, int version) -{ - int err2; - switch (version) { - case IPC_64: { - struct semid64_ds32 *usp64 = (struct semid64_ds32 *) user_semid; - - if (!access_ok(VERIFY_WRITE, usp64, sizeof(*usp64))) { - err2 = -EFAULT; - break; - } - err2 = __put_user(s->sem_perm.key, &usp64->sem_perm.key); - err2 |= __put_user(s->sem_perm.uid, &usp64->sem_perm.uid); - err2 |= __put_user(s->sem_perm.gid, &usp64->sem_perm.gid); - err2 |= __put_user(s->sem_perm.cuid, &usp64->sem_perm.cuid); - err2 |= __put_user(s->sem_perm.cgid, &usp64->sem_perm.cgid); - err2 |= __put_user(s->sem_perm.mode, &usp64->sem_perm.mode); - err2 |= __put_user(s->sem_perm.seq, &usp64->sem_perm.seq); - err2 |= __put_user(s->sem_otime, &usp64->sem_otime); - err2 |= __put_user(s->sem_ctime, &usp64->sem_ctime); - err2 |= __put_user(s->sem_nsems, &usp64->sem_nsems); - break; - } - default: { - struct semid_ds32 *usp32 = (struct semid_ds32 *) user_semid; - - if (!access_ok(VERIFY_WRITE, usp32, sizeof(*usp32))) { - err2 = -EFAULT; - break; - } - err2 = 
__put_user(s->sem_perm.key, &usp32->sem_perm.key); - err2 |= __put_user(s->sem_perm.uid, &usp32->sem_perm.uid); - err2 |= __put_user(s->sem_perm.gid, &usp32->sem_perm.gid); - err2 |= __put_user(s->sem_perm.cuid, &usp32->sem_perm.cuid); - err2 |= __put_user(s->sem_perm.cgid, &usp32->sem_perm.cgid); - err2 |= __put_user(s->sem_perm.mode, &usp32->sem_perm.mode); - err2 |= __put_user(s->sem_perm.seq, &usp32->sem_perm.seq); - err2 |= __put_user(s->sem_otime, &usp32->sem_otime); - err2 |= __put_user(s->sem_ctime, &usp32->sem_ctime); - err2 |= __put_user(s->sem_nsems, &usp32->sem_nsems); - break; - } - } - return err2; -} - -static int -semctl32 (int first, int second, int third, void *uptr) -{ - union semun fourth; - u32 pad; - int err; - struct semid64_ds s; - mm_segment_t old_fs; - int version = ipc_parse_version32(&third); - - if (!uptr) - return -EINVAL; - if (get_user(pad, (u32 *)uptr)) - return -EFAULT; - if (third == SETVAL) - fourth.val = (int)pad; - else - fourth.__pad = (void *)A(pad); - switch (third) { - case IPC_INFO: - case IPC_RMID: - case IPC_SET: - case SEM_INFO: - case GETVAL: - case GETPID: - case GETNCNT: - case GETZCNT: - case GETALL: - case SETVAL: - case SETALL: - err = sys_semctl(first, second, third, fourth); - break; - - case IPC_STAT: - case SEM_STAT: - fourth.__pad = &s; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_semctl(first, second, third, fourth); - set_fs(old_fs); - if (!err) - err = put_semid((void *)A(pad), &s, version); - break; - default: - err = -EINVAL; - break; - } - return err; -} - -#define MAXBUF (64*1024) - -static int -do_sys32_msgsnd (int first, int second, int third, void *uptr) -{ - struct msgbuf *p; - struct msgbuf32 *up = (struct msgbuf32 *)uptr; - mm_segment_t old_fs; - int err; - - if (second >= MAXBUF-sizeof(struct msgbuf)) - return -EINVAL; - p = kmalloc(second + sizeof(struct msgbuf), GFP_USER); - if (!p) - return -ENOMEM; - err = get_user(p->mtype, &up->mtype); - err |= (copy_from_user(p->mtext, 
&up->mtext, second) ? -EFAULT : 0); - if (err) - goto out; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_msgsnd(first, p, second, third); - set_fs(old_fs); - out: - kfree(p); - return err; -} - -static int -do_sys32_msgrcv (int first, int second, int msgtyp, int third, int version, void *uptr) -{ - struct msgbuf32 *up; - struct msgbuf *p; - mm_segment_t old_fs; - int err; - - if (!version) { - struct ipc_kludge *uipck = (struct ipc_kludge *)uptr; - struct ipc_kludge ipck; - - err = -EINVAL; - if (!uptr) - goto out; - err = -EFAULT; - if (copy_from_user(&ipck, uipck, sizeof(struct ipc_kludge))) - goto out; - uptr = (void *)A(ipck.msgp); - msgtyp = ipck.msgtyp; - } - if (second >= MAXBUF-sizeof(struct msgbuf)) - return -EINVAL; - err = -ENOMEM; - p = kmalloc(second + sizeof(struct msgbuf), GFP_USER); - if (!p) - goto out; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_msgrcv(first, p, second, msgtyp, third); - set_fs(old_fs); - if (err < 0) - goto free_then_out; - up = (struct msgbuf32 *)uptr; - if (put_user(p->mtype, &up->mtype) || copy_to_user(&up->mtext, p->mtext, err)) - err = -EFAULT; -free_then_out: - kfree(p); -out: - return err; -} - - -static int -msgctl32 (int first, int second, void *uptr) -{ - int err = -EINVAL, err2; - struct msqid_ds m; - struct msqid64_ds m64; - struct msqid_ds32 *up32 = (struct msqid_ds32 *)uptr; - struct msqid64_ds32 *up64 = (struct msqid64_ds32 *)uptr; - mm_segment_t old_fs; - int version = ipc_parse_version32(&second); - - switch (second) { - case IPC_INFO: - case IPC_RMID: - case MSG_INFO: - err = sys_msgctl(first, second, (struct msqid_ds *)uptr); - break; - - case IPC_SET: - if (version == IPC_64) { - err = get_user(m.msg_perm.uid, &up64->msg_perm.uid); - err |= get_user(m.msg_perm.gid, &up64->msg_perm.gid); - err |= get_user(m.msg_perm.mode, &up64->msg_perm.mode); - err |= get_user(m.msg_qbytes, &up64->msg_qbytes); - } else { - err = get_user(m.msg_perm.uid, &up32->msg_perm.uid); - err |= get_user(m.msg_perm.gid, 
&up32->msg_perm.gid); - err |= get_user(m.msg_perm.mode, &up32->msg_perm.mode); - err |= get_user(m.msg_qbytes, &up32->msg_qbytes); - } - if (err) - break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_msgctl(first, second, &m); - set_fs(old_fs); - break; - - case IPC_STAT: - case MSG_STAT: - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_msgctl(first, second, (void *) &m64); - set_fs(old_fs); - if (version == IPC_64) { - if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64))) { - err = -EFAULT; - break; - } - err2 = __put_user(m64.msg_perm.key, &up64->msg_perm.key); - err2 |= __put_user(m64.msg_perm.uid, &up64->msg_perm.uid); - err2 |= __put_user(m64.msg_perm.gid, &up64->msg_perm.gid); - err2 |= __put_user(m64.msg_perm.cuid, &up64->msg_perm.cuid); - err2 |= __put_user(m64.msg_perm.cgid, &up64->msg_perm.cgid); - err2 |= __put_user(m64.msg_perm.mode, &up64->msg_perm.mode); - err2 |= __put_user(m64.msg_perm.seq, &up64->msg_perm.seq); - err2 |= __put_user(m64.msg_stime, &up64->msg_stime); - err2 |= __put_user(m64.msg_rtime, &up64->msg_rtime); - err2 |= __put_user(m64.msg_ctime, &up64->msg_ctime); - err2 |= __put_user(m64.msg_cbytes, &up64->msg_cbytes); - err2 |= __put_user(m64.msg_qnum, &up64->msg_qnum); - err2 |= __put_user(m64.msg_qbytes, &up64->msg_qbytes); - err2 |= __put_user(m64.msg_lspid, &up64->msg_lspid); - err2 |= __put_user(m64.msg_lrpid, &up64->msg_lrpid); - if (err2) - err = -EFAULT; - } else { - if (!access_ok(VERIFY_WRITE, up32, sizeof(*up32))) { - err = -EFAULT; - break; - } - err2 = __put_user(m64.msg_perm.key, &up32->msg_perm.key); - err2 |= __put_user(m64.msg_perm.uid, &up32->msg_perm.uid); - err2 |= __put_user(m64.msg_perm.gid, &up32->msg_perm.gid); - err2 |= __put_user(m64.msg_perm.cuid, &up32->msg_perm.cuid); - err2 |= __put_user(m64.msg_perm.cgid, &up32->msg_perm.cgid); - err2 |= __put_user(m64.msg_perm.mode, &up32->msg_perm.mode); - err2 |= __put_user(m64.msg_perm.seq, &up32->msg_perm.seq); - err2 |= __put_user(m64.msg_stime, 
&up32->msg_stime); - err2 |= __put_user(m64.msg_rtime, &up32->msg_rtime); - err2 |= __put_user(m64.msg_ctime, &up32->msg_ctime); - err2 |= __put_user(m64.msg_cbytes, &up32->msg_cbytes); - err2 |= __put_user(m64.msg_qnum, &up32->msg_qnum); - err2 |= __put_user(m64.msg_qbytes, &up32->msg_qbytes); - err2 |= __put_user(m64.msg_lspid, &up32->msg_lspid); - err2 |= __put_user(m64.msg_lrpid, &up32->msg_lrpid); - if (err2) - err = -EFAULT; - } - break; - } - return err; -} - -static int -shmat32 (int first, int second, int third, int version, void *uptr) -{ - unsigned long raddr; - u32 *uaddr = (u32 *)A((u32)third); - int err; - - if (version == 1) - return -EINVAL; /* iBCS2 emulator entry point: unsupported */ - err = do_shmat(first, uptr, second, &raddr); - if (err) - return err; - return put_user(raddr, uaddr); -} - -static int put_shmid64(struct shmid64_ds *s64p, void *uptr, int version) -{ - int err2; -#define s64 (*s64p) - if (version == IPC_64) { - struct shmid64_ds32 *up64 = (struct shmid64_ds32 *)uptr; - - if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64))) - return -EFAULT; - - err2 = __put_user(s64.shm_perm.key, &up64->shm_perm.key); - err2 |= __put_user(s64.shm_perm.uid, &up64->shm_perm.uid); - err2 |= __put_user(s64.shm_perm.gid, &up64->shm_perm.gid); - err2 |= __put_user(s64.shm_perm.cuid, &up64->shm_perm.cuid); - err2 |= __put_user(s64.shm_perm.cgid, &up64->shm_perm.cgid); - err2 |= __put_user(s64.shm_perm.mode, &up64->shm_perm.mode); - err2 |= __put_user(s64.shm_perm.seq, &up64->shm_perm.seq); - err2 |= __put_user(s64.shm_atime, &up64->shm_atime); - err2 |= __put_user(s64.shm_dtime, &up64->shm_dtime); - err2 |= __put_user(s64.shm_ctime, &up64->shm_ctime); - err2 |= __put_user(s64.shm_segsz, &up64->shm_segsz); - err2 |= __put_user(s64.shm_nattch, &up64->shm_nattch); - err2 |= __put_user(s64.shm_cpid, &up64->shm_cpid); - err2 |= __put_user(s64.shm_lpid, &up64->shm_lpid); - } else { - struct shmid_ds32 *up32 = (struct shmid_ds32 *)uptr; - - if 
(!access_ok(VERIFY_WRITE, up32, sizeof(*up32))) - return -EFAULT; - - err2 = __put_user(s64.shm_perm.key, &up32->shm_perm.key); - err2 |= __put_user(s64.shm_perm.uid, &up32->shm_perm.uid); - err2 |= __put_user(s64.shm_perm.gid, &up32->shm_perm.gid); - err2 |= __put_user(s64.shm_perm.cuid, &up32->shm_perm.cuid); - err2 |= __put_user(s64.shm_perm.cgid, &up32->shm_perm.cgid); - err2 |= __put_user(s64.shm_perm.mode, &up32->shm_perm.mode); - err2 |= __put_user(s64.shm_perm.seq, &up32->shm_perm.seq); - err2 |= __put_user(s64.shm_atime, &up32->shm_atime); - err2 |= __put_user(s64.shm_dtime, &up32->shm_dtime); - err2 |= __put_user(s64.shm_ctime, &up32->shm_ctime); - err2 |= __put_user(s64.shm_segsz, &up32->shm_segsz); - err2 |= __put_user(s64.shm_nattch, &up32->shm_nattch); - err2 |= __put_user(s64.shm_cpid, &up32->shm_cpid); - err2 |= __put_user(s64.shm_lpid, &up32->shm_lpid); - } -#undef s64 - return err2 ? -EFAULT : 0; -} -static int -shmctl32 (int first, int second, void *uptr) -{ - int err = -EFAULT, err2; - struct shmid_ds s; - struct shmid64_ds s64; - mm_segment_t old_fs; - struct shm_info32 *uip = (struct shm_info32 *)uptr; - struct shm_info si; - int version = ipc_parse_version32(&second); - struct shminfo64 smi; - struct shminfo *usi32 = (struct shminfo *) uptr; - struct shminfo64_32 *usi64 = (struct shminfo64_32 *) uptr; - - switch (second) { - case IPC_INFO: - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_shmctl(first, second, (struct shmid_ds *)&smi); - set_fs(old_fs); - - if (version == IPC_64) { - if (!access_ok(VERIFY_WRITE, usi64, sizeof(*usi64))) { - err = -EFAULT; - break; - } - err2 = __put_user(smi.shmmax, &usi64->shmmax); - err2 |= __put_user(smi.shmmin, &usi64->shmmin); - err2 |= __put_user(smi.shmmni, &usi64->shmmni); - err2 |= __put_user(smi.shmseg, &usi64->shmseg); - err2 |= __put_user(smi.shmall, &usi64->shmall); - } else { - if (!access_ok(VERIFY_WRITE, usi32, sizeof(*usi32))) { - err = -EFAULT; - break; - } - err2 = __put_user(smi.shmmax, 
&usi32->shmmax); - err2 |= __put_user(smi.shmmin, &usi32->shmmin); - err2 |= __put_user(smi.shmmni, &usi32->shmmni); - err2 |= __put_user(smi.shmseg, &usi32->shmseg); - err2 |= __put_user(smi.shmall, &usi32->shmall); - } - if (err2) - err = -EFAULT; - break; - - case IPC_RMID: - case SHM_LOCK: - case SHM_UNLOCK: - err = sys_shmctl(first, second, (struct shmid_ds *)uptr); - break; - - case IPC_SET: - if (version == IPC_64) { - struct shmid64_ds32 *up64 = (struct shmid64_ds32 *)uptr; - err = get_user(s.shm_perm.uid, &up64->shm_perm.uid); - err |= get_user(s.shm_perm.gid, &up64->shm_perm.gid); - err |= get_user(s.shm_perm.mode, &up64->shm_perm.mode); - } else { - struct shmid_ds32 *up32 = (struct shmid_ds32 *)uptr; - err = get_user(s.shm_perm.uid, &up32->shm_perm.uid); - err |= get_user(s.shm_perm.gid, &up32->shm_perm.gid); - err |= get_user(s.shm_perm.mode, &up32->shm_perm.mode); - } - if (err) - break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_shmctl(first, second, &s); - set_fs(old_fs); - break; - - case IPC_STAT: - case SHM_STAT: - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_shmctl(first, second, (void *) &s64); - set_fs(old_fs); - - if (err < 0) - break; - err2 = put_shmid64(&s64, uptr, version); - if (err2) - err = err2; - break; - - case SHM_INFO: - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_shmctl(first, second, (void *)&si); - set_fs(old_fs); - if (err < 0) - break; - - if (!access_ok(VERIFY_WRITE, uip, sizeof(*uip))) { - err = -EFAULT; - break; - } - err2 = __put_user(si.used_ids, &uip->used_ids); - err2 |= __put_user(si.shm_tot, &uip->shm_tot); - err2 |= __put_user(si.shm_rss, &uip->shm_rss); - err2 |= __put_user(si.shm_swp, &uip->shm_swp); - err2 |= __put_user(si.swap_attempts, &uip->swap_attempts); - err2 |= __put_user(si.swap_successes, &uip->swap_successes); - if (err2) - err = -EFAULT; - break; - default: - err = -EINVAL; - break; - } - return err; -} - -extern int sem_ctls[]; - -static long semtimedop32(int semid, struct 
sembuf *sb, - unsigned nsops, struct compat_timespec *ts32) -{ - struct timespec ts; - mm_segment_t oldfs = get_fs(); - long ret; - - if (nsops > sem_ctls[2]) - return -E2BIG; - if (!access_ok(VERIFY_READ, sb, nsops * sizeof(struct sembuf))) - return -EFAULT; - if (ts32 && get_compat_timespec(&ts, ts32)) - return -EFAULT; - - set_fs(KERNEL_DS); - ret = sys_semtimedop(semid, sb, nsops, ts32 ? &ts : NULL); - set_fs(oldfs); - return ret; -} +#include asmlinkage long -sys32_ipc (u32 call, int first, int second, int third, u32 ptr, u32 fifth) +sys32_ipc(u32 call, int first, int second, int third, + compat_uptr_t ptr, u32 fifth) { int version; @@ -660,35 +23,35 @@ sys32_ipc (u32 call, int first, int seco switch (call) { case SEMOP: /* struct sembuf is the same on 32 and 64bit :)) */ - return sys_semtimedop(first, (struct sembuf *)AA(ptr), second, - NULL); - case TIMEDSEMOP: - return semtimedop32(first, (struct sembuf *)AA(ptr), second, - (struct compat_timespec *)AA(fifth)); + return sys_semtimedop(first, compat_ptr(ptr), second, NULL); + case SEMTIMEDOP: + return compat_sys_semtimedop(first, compat_ptr(ptr), second, + compat_ptr(fifth)); case SEMGET: return sys_semget(first, second, third); case SEMCTL: - return semctl32(first, second, third, (void *)AA(ptr)); + return compat_sys_semctl(first, second, third, compat_ptr(ptr)); case MSGSND: - return do_sys32_msgsnd(first, second, third, (void *)AA(ptr)); + return compat_sys_msgsnd(first, second, third, compat_ptr(ptr)); case MSGRCV: - return do_sys32_msgrcv(first, second, fifth, third, version, (void *)AA(ptr)); + return compat_sys_msgrcv(first, second, fifth, third, + version, compat_ptr(ptr)); case MSGGET: return sys_msgget((key_t) first, second); case MSGCTL: - return msgctl32(first, second, (void *)AA(ptr)); + return compat_sys_msgctl(first, second, compat_ptr(ptr)); case SHMAT: - return shmat32(first, second, third, version, (void *)AA(ptr)); + return compat_sys_shmat(first, second, third, version, + 
compat_ptr(ptr)); break; case SHMDT: - return sys_shmdt((char *)AA(ptr)); + return sys_shmdt(compat_ptr(ptr)); case SHMGET: return sys_shmget(first, second, third); case SHMCTL: - return shmctl32(first, second, (void *)AA(ptr)); + return compat_sys_shmctl(first, second, compat_ptr(ptr)); } return -ENOSYS; } - --- linux-2.6.4-rc1/arch/x86_64/Kconfig 2004-02-27 16:17:20.000000000 -0800 +++ 25/arch/x86_64/Kconfig 2004-02-29 13:08:01.000000000 -0800 @@ -381,6 +381,10 @@ config COMPAT depends on IA32_EMULATION default y +config SYSVIPC_COMPAT + bool + depends on COMPAT && SYSVIPC + default y config UID16 bool @@ -452,6 +456,7 @@ config INIT_DEBUG config DEBUG_INFO bool "Compile the kernel with debug info" depends on DEBUG_KERNEL + default n help If you say Y here the resulting kernel image will include debugging info resulting in a larger kernel image. @@ -483,9 +488,8 @@ config IOMMU_LEAK help Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. - -#config X86_REMOTE_DEBUG -# bool "kgdb debugging stub" + +source "arch/x86_64/Kconfig.kgdb" endmenu --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/arch/x86_64/Kconfig.kgdb 2004-02-29 13:08:01.000000000 -0800 @@ -0,0 +1,176 @@ +config KGDB + bool "Include kgdb kernel debugger" + depends on DEBUG_KERNEL + select DEBUG_INFO + help + If you say Y here, the system will be compiled with the debug + option (-g) and a debugging stub will be included in the + kernel. This stub communicates with gdb on another (host) + computer via a serial port. The host computer should have + access to the kernel binary file (vmlinux) and a serial port + that is connected to the target machine. Gdb can be made to + configure the serial port or you can use stty and setserial to + do this. See the 'target' command in gdb. This option also + configures in the ability to request a breakpoint early in the + boot process. 
To request the breakpoint just include 'kgdb' + as a boot option when booting the target machine. The system + will then break as soon as it looks at the boot options. This + option also installs a breakpoint in panic and sends any + kernel faults to the debugger. For more information see the + Documentation/i386/kgdb.txt file. + +choice + depends on KGDB + prompt "Debug serial port BAUD" + default KGDB_115200BAUD + help + Gdb and the kernel stub need to agree on the baud rate to be + used. Some systems (x86 family at this writing) allow this to + be configured. + +config KGDB_9600BAUD + bool "9600" + +config KGDB_19200BAUD + bool "19200" + +config KGDB_38400BAUD + bool "38400" + +config KGDB_57600BAUD + bool "57600" + +config KGDB_115200BAUD + bool "115200" +endchoice + +config KGDB_PORT + hex "hex I/O port address of the debug serial port" + depends on KGDB + default 3f8 + help + Some systems (x86 family at this writing) allow the port + address to be configured. The number entered is assumed to be + hex, don't put 0x in front of it. The standard addresses are: + COM1 3f8, irq 4 and COM2 2f8, irq 3. Setserial /dev/ttySx + will tell you what you have. It is good to test the serial + connection with a live system before trying to debug. + +config KGDB_IRQ + int "IRQ of the debug serial port" + depends on KGDB + default 4 + help + This is the irq for the debug port. If everything is working + correctly and the kernel has interrupts on, a control C to the + port should cause a break into the kernel debug stub. + +config DEBUG_INFO + bool + depends on KGDB + default y + +config KGDB_MORE + bool "Add any additional compile options" + depends on KGDB + default n + help + Saying yes here turns on the ability to enter additional + compile options. + + +config KGDB_OPTIONS + depends on KGDB_MORE + string "Additional compile arguments" + default "-O1" + help + This option allows you to enter additional compile options for + the whole kernel compile. 
Each platform will have a default + that seems right for it. For example on PPC "-ggdb -O1", and + for i386 "-O1". Note that by configuring KGDB "-g" is already + turned on. In addition, on i386 platforms + "-fomit-frame-pointer" is deleted from the standard compile + options. + +config NO_KGDB_CPUS + int "Number of CPUs" + depends on KGDB && SMP + default NR_CPUS + help + + This option sets the number of cpus for kgdb ONLY. It is used + to prune some internal structures so they look "nice" when + displayed with gdb. This is to overcome possibly larger + numbers that may have been entered above. Enter the real + number to get nice clean kgdb_info displays. + +config KGDB_TS + bool "Enable kgdb time stamp macros?" + depends on KGDB + default n + help + Kgdb event macros allow you to instrument your code with calls + to the kgdb event recording function. The event log may be + examined with gdb at a break point. Turning on this + capability also allows you to choose how many events to + keep. Kgdb always keeps the lastest events. + +choice + depends on KGDB_TS + prompt "Max number of time stamps to save?" + default KGDB_TS_128 + +config KGDB_TS_64 + bool "64" + +config KGDB_TS_128 + bool "128" + +config KGDB_TS_256 + bool "256" + +config KGDB_TS_512 + bool "512" + +config KGDB_TS_1024 + bool "1024" + +endchoice + +config STACK_OVERFLOW_TEST + bool "Turn on kernel stack overflow testing?" + depends on KGDB + default n + help + This option enables code in the front line interrupt handlers + to check for kernel stack overflow on interrupts and system + calls. This is part of the kgdb code on x86 systems. + +config KGDB_CONSOLE + bool "Enable serial console thru kgdb port" + depends on KGDB + default n + help + This option enables the command line "console=kgdb" option. + When the system is booted with this option in the command line + all kernel printk output is sent to gdb (as well as to other + consoles). For this to work gdb must be connected. 
For this + reason, this command line option will generate a breakpoint if + gdb has not yet connected. After the gdb continue command is + given all pent up console output will be printed by gdb on the + host machine. Neither this option, nor KGDB require the + serial driver to be configured. + +config KGDB_SYSRQ + bool "Turn on SysRq 'G' command to do a break?" + depends on KGDB + default y + help + This option includes an option in the SysRq code that allows + you to enter SysRq G which generates a breakpoint to the KGDB + stub. This will work if the keyboard is alive and can + interrupt the system. Because of constraints on when the + serial port interrupt can be enabled, this code may allow you + to interrupt the system before the serial port control C is + available. Just say yes here. + --- linux-2.6.4-rc1/arch/x86_64/kernel/acpi/boot.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/x86_64/kernel/acpi/boot.c 2004-02-29 13:07:39.000000000 -0800 @@ -48,11 +48,12 @@ #define PREFIX "ACPI: " -int acpi_noirq __initdata = 0; /* skip ACPI IRQ initialization */ +int acpi_noirq __initdata; /* skip ACPI IRQ initialization */ int acpi_ht __initdata = 1; /* enable HT */ int acpi_lapic; int acpi_ioapic; +int acpi_strict; /* -------------------------------------------------------------------------- Boot-time Configuration @@ -264,7 +265,7 @@ acpi_parse_hpet ( * programs the PIC-mode SCI to Level Trigger. * (NO-OP if the BIOS set Level Trigger already) * - * If a PIC-mode SCI is not recogznied or gives spurious IRQ7's + * If a PIC-mode SCI is not recognized or gives spurious IRQ7's * it may require Edge Trigger -- use "acpi_pic_sci=edge" * (NO-OP if the BIOS set Edge Trigger already) * --- linux-2.6.4-rc1/arch/x86_64/kernel/apic.c 2004-02-27 16:17:20.000000000 -0800 +++ 25/arch/x86_64/kernel/apic.c 2004-02-29 13:07:44.000000000 -0800 @@ -553,7 +553,7 @@ static int __init init_lapic_sysfs(void) /* XXX: remove suspend/resume procs if !apic_pm_state.active? 
*/ error = sysdev_class_register(&lapic_sysclass); if (!error) - error = sys_device_register(&device_lapic); + error = sysdev_register(&device_lapic); return error; } device_initcall(init_lapic_sysfs); --- linux-2.6.4-rc1/arch/x86_64/kernel/i8259.c 2004-01-09 00:04:31.000000000 -0800 +++ 25/arch/x86_64/kernel/i8259.c 2004-02-29 13:09:19.000000000 -0800 @@ -423,14 +423,14 @@ static struct sysdev_class timer_sysclas static struct sys_device device_timer = { .id = 0, - .cls &timer_sysclass, + .cls = &timer_sysclass, }; static int __init init_timer_sysfs(void) { int error = sysdev_class_register(&timer_sysclass); if (!error) - error = sys_device_register(&device_timer); + error = sysdev_register(&device_timer); return error; } --- linux-2.6.4-rc1/arch/x86_64/kernel/irq.c 2004-02-27 16:17:20.000000000 -0800 +++ 25/arch/x86_64/kernel/irq.c 2004-02-29 13:08:01.000000000 -0800 @@ -405,6 +405,9 @@ out: spin_unlock(&desc->lock); irq_exit(); + + kgdb_process_breakpoint(); + return 1; } --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/arch/x86_64/kernel/kgdb_stub.c 2004-02-29 13:08:01.000000000 -0800 @@ -0,0 +1,2595 @@ +/* + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +/* + * Copyright (c) 2000 VERITAS Software Corporation. 
+ * + */ +/**************************************************************************** + * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ + * + * Module name: remcom.c $ + * Revision: 1.34 $ + * Date: 91/03/09 12:29:49 $ + * Contributor: Lake Stevens Instrument Division$ + * + * Description: low level support for gdb debugger. $ + * + * Considerations: only works on target hardware $ + * + * Written by: Glenn Engel $ + * Updated by: David Grothe + * Updated by: Robert Walsh + * Updated by: wangdi + * ModuleState: Experimental $ + * + * NOTES: See Below $ + * + * Modified for 386 by Jim Kingdon, Cygnus Support. + * Compatibility with 2.1.xx kernel by David Grothe + * + * Changes to allow auto initilization. All that is needed is that it + * be linked with the kernel and a break point (int 3) be executed. + * The header file defines BREAKPOINT to allow one to do + * this. It should also be possible, once the interrupt system is up, to + * call putDebugChar("+"). Once this is done, the remote debugger should + * get our attention by sending a ^C in a packet. George Anzinger + * + * Integrated into 2.2.5 kernel by Tigran Aivazian + * Added thread support, support for multiple processors, + * support for ia-32(x86) hardware debugging. + * Amit S. Kale ( akale@veritas.com ) + * + * Modified to support debugging over ethernet by Robert Walsh + * and wangdi , based on + * code by San Mehat. + * + * X86_64 changes from Andi Kleen's patch merged by Jim Houston + * (jim.houston@ccur.com). If it works thank Andi if its broken + * blame me. + * + * To enable debugger support, two things need to happen. One, a + * call to set_debug_traps() is necessary in order to allow any breakpoints + * or error conditions to be properly intercepted and reported to gdb. + * Two, a breakpoint needs to be generated to begin communication. This + * is most easily accomplished by a call to breakpoint(). Breakpoint() + * simulates a breakpoint by executing an int 3. 
+ * + ************* + * + * The following gdb commands are supported: + * + * command function Return value + * + * g return the value of the CPU registers hex data or ENN + * G set the value of the CPU registers OK or ENN + * + * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN + * MAA..AA,LLLL: Write LLLL bytes at address AA..AA OK or ENN + * + * c Resume at current address SNN ( signal NN) + * cAA..AA Continue at address AA..AA SNN + * + * s Step one instruction SNN + * sAA..AA Step one instruction from AA..AA SNN + * + * k kill + * + * ? What was the last sigval ? SNN (signal NN) + * + * All commands and responses are sent with a packet which includes a + * checksum. A packet consists of + * + * $<packet info>#<checksum>. + * + * where + * <packet info> :: <characters representing the command or response> + * <checksum> :: < two hex digits computed as modulo 256 sum of <packet info>> + * + * When a packet is received, it is first acknowledged with either '+' or '-'. + * '+' indicates a successful transfer. '-' indicates a failed transfer. + * + * Example: + * + * Host: Reply: + * $m0,10#2a +$00010203040506070809101112131415#42 + * + ****************************************************************************/ +#define KGDB_VERSION "<20030915.1651.33>" +#include +#include +#include /* for strcpy */ +#include +#include +#include +#include /* for linux pt_regs struct */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define Dearly_printk(x...) 
+int kgdb_enabled = 0; + +/************************************************************************ + * + * external low-level support routines + */ +typedef void (*Function) (void); /* pointer to a function */ + +/* Thread reference */ +typedef unsigned char threadref[8]; + +extern int tty_putDebugChar(int); /* write a single character */ +extern int tty_getDebugChar(void); /* read and return a single char */ +extern void tty_flushDebugChar(void); /* flush pending characters */ +extern int eth_putDebugChar(int); /* write a single character */ +extern int eth_getDebugChar(void); /* read and return a single char */ +extern void eth_flushDebugChar(void); /* flush pending characters */ + +/************************************************************************/ +/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ +/* at least NUMREGBYTES*2 are needed for register packets */ +/* Longer buffer is needed to list all threads */ +#define BUFMAX 400 + +char *kgdb_version = KGDB_VERSION; + +/* debug > 0 prints ill-formed commands in valid packets & checksum errors */ +int debug_regs = 0; /* set to non-zero to print registers */ + +/* filled in by an external module */ +char *gdb_module_offsets; + +static const char hexchars[] = "0123456789abcdef"; + +/* Number of bytes of registers. */ +#define NUMREGBYTES (NUMREGS * sizeof(unsigned long)) +/* + * Note that this register image is in a different order than + * the register image that Linux produces at interrupt time. + * + * Linux's register image is defined by struct pt_regs in ptrace.h. + * Just why GDB uses a different order is a historical mystery. + * + * Could add XMM and segment registers here. 
+ */ +enum regnames {_RAX, + _RBX, + _RCX, + _RDX, + _RSI, + _RDI, + _RBP, + _RSP, + _R8, + _R9, + _R10, + _R11, + _R12, + _R13, + _R14, + _R15, + _PC, + _PS, + NUMREGS }; + + +/*************************** ASSEMBLY CODE MACROS *************************/ +/* + * Put the error code here just in case the user cares. + * Likewise, the vector number here (since GDB only gets the signal + * number through the usual means, and that's not very specific). + * The called_from is the return address so he can tell how we entered kgdb. + * This will allow him to seperate out the various possible entries. + */ +#define REMOTE_DEBUG 0 /* set != to turn on printing (also available in info) */ + +#define PID_MAX PID_MAX_DEFAULT + +#ifdef CONFIG_SMP +void smp_send_nmi_allbutself(void); +#define IF_SMP(x) x +#undef MAX_NO_CPUS +#ifndef CONFIG_NO_KGDB_CPUS +#define CONFIG_NO_KGDB_CPUS 2 +#endif +#if CONFIG_NO_KGDB_CPUS > NR_CPUS +#define MAX_NO_CPUS NR_CPUS +#else +#define MAX_NO_CPUS CONFIG_NO_KGDB_CPUS +#endif +#define hold_init hold_on_sstep: 1, +#define MAX_CPU_MASK (unsigned long)((1LL << MAX_NO_CPUS) - 1LL) +#define NUM_CPUS num_online_cpus() +#else +#define IF_SMP(x) +#define hold_init +#undef MAX_NO_CPUS +#define MAX_NO_CPUS 1 +#define NUM_CPUS 1 +#endif +#define NOCPU (struct task_struct *)0xbad1fbad +/* *INDENT-OFF* */ +struct kgdb_info { + int used_malloc; + void *called_from; + long long entry_tsc; + int errcode; + int vector; + int print_debug_info; +#ifdef CONFIG_SMP + int hold_on_sstep; + struct { + volatile struct task_struct *task; + int pid; + int hold; + struct pt_regs *regs; + } cpus_waiting[MAX_NO_CPUS]; +#endif +} kgdb_info = {hold_init print_debug_info:REMOTE_DEBUG, vector:-1}; + +/* *INDENT-ON* */ + +#define used_m kgdb_info.used_malloc +/* + * This is little area we set aside to contain the stack we + * need to build to allow gdb to call functions. We use one + * per cpu to avoid locking issues. 
We will do all this work + * with interrupts off so that should take care of the protection + * issues. + */ +#define LOOKASIDE_SIZE 200 /* should be more than enough */ +#define MALLOC_MAX 200 /* Max malloc size */ +struct { + unsigned long rsp; + unsigned long array[LOOKASIDE_SIZE]; +} fn_call_lookaside[MAX_NO_CPUS]; + +static int trap_cpu; +static unsigned long OLD_esp; + +#define END_OF_LOOKASIDE &fn_call_lookaside[trap_cpu].array[LOOKASIDE_SIZE] +#define IF_BIT 0x200 +#define TF_BIT 0x100 + +#define MALLOC_ROUND 8-1 + +static char malloc_array[MALLOC_MAX]; +IF_SMP(static void to_gdb(const char *mess)); +void * +malloc(int size) +{ + + if (size <= (MALLOC_MAX - used_m)) { + int old_used = used_m; + used_m += ((size + MALLOC_ROUND) & (~MALLOC_ROUND)); + return &malloc_array[old_used]; + } else { + return NULL; + } +} + +/* + * I/O dispatch functions... + * Based upon kgdboe, either call the ethernet + * handler or the serial one.. + */ +void +putDebugChar(int c) +{ + if (!kgdboe) { + tty_putDebugChar(c); + } else { + eth_putDebugChar(c); + } +} + +int +getDebugChar(void) +{ + if (!kgdboe) { + return tty_getDebugChar(); + } else { + return eth_getDebugChar(); + } +} + +void +flushDebugChar(void) +{ + if (!kgdboe) { + tty_flushDebugChar(); + } else { + eth_flushDebugChar(); + } +} + +/* + * Gdb calls functions by pushing agruments, including a return address + * on the stack and the adjusting EIP to point to the function. The + * whole assumption in GDB is that we are on a different stack than the + * one the "user" i.e. code that hit the break point, is on. This, of + * course is not true in the kernel. Thus various dodges are needed to + * do the call without directly messing with EIP (which we can not change + * as it is just a location and not a register. To adjust it would then + * require that we move every thing below EIP up or down as needed. 
This + * will not work as we may well have stack relative pointer on the stack + * (such as the pointer to regs, for example). + + * So here is what we do: + * We detect gdb attempting to store into the stack area and instead, store + * into the fn_call_lookaside.array at the same relative location as if it + * were the area ESP pointed at. We also trap ESP modifications + * and uses these to adjust fn_call_lookaside.esp. On entry + * fn_call_lookaside.esp will be set to point at the last entry in + * fn_call_lookaside.array. This allows us to check if it has changed, and + * if so, on exit, we add the registers we will use to do the move and a + * trap/ interrupt return exit sequence. We then adjust the eflags in the + * regs array (remember we now have a copy in the fn_call_lookaside.array) to + * kill the interrupt bit, AND we change EIP to point at our set up stub. + * As part of the register set up we preset the registers to point at the + * begining and end of the fn_call_lookaside.array, so all the stub needs to + * do is move words from the array to the stack until ESP= the desired value + * then do the rti. This will then transfer to the desired function with + * all the correct registers. Nifty huh? + */ +extern asmlinkage void fn_call_stub(void); +extern asmlinkage void fn_rtn_stub(void); +/* *INDENT-OFF* */ +__asm__("fn_rtn_stub:\n\t" + "movq %rax,%rsp\n\t" + "fn_call_stub:\n\t" + "1:\n\t" + "addq $-8,%rbx\n\t" + "movq (%rbx), %rax\n\t" + "pushq %rax\n\t" + "cmpq %rsp,%rcx\n\t" + "jne 1b\n\t" + "popq %rax\n\t" + "popq %rbx\n\t" + "popq %rcx\n\t" + "iret \n\t"); +/* *INDENT-ON* */ +#define gdb_i386vector kgdb_info.vector +#define gdb_i386errcode kgdb_info.errcode +#define waiting_cpus kgdb_info.cpus_waiting +#define remote_debug kgdb_info.print_debug_info +#define hold_cpu(cpu) kgdb_info.cpus_waiting[cpu].hold +/* gdb locks */ + +#ifdef CONFIG_SMP +static int in_kgdb_called; +static spinlock_t waitlocks[MAX_NO_CPUS] = + {[0 ... 
MAX_NO_CPUS - 1] = SPIN_LOCK_UNLOCKED }; +/* + * The following array has the thread pointer of each of the "other" + * cpus. We make it global so it can be seen by gdb. + */ +volatile int in_kgdb_entry_log[MAX_NO_CPUS]; +volatile struct pt_regs *in_kgdb_here_log[MAX_NO_CPUS]; +/* +static spinlock_t continuelocks[MAX_NO_CPUS]; +*/ +spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +/* waiters on our spinlock plus us */ +static atomic_t spinlock_waiters = ATOMIC_INIT(1); +static int spinlock_count = 0; +static int spinlock_cpu = 0; +/* + * Note we use nested spin locks to account for the case where a break + * point is encountered when calling a function by user direction from + * kgdb. Also there is the memory exception recursion to account for. + * Well, yes, but this lets other cpus thru too. Let's add a + * cpu id to the lock. + */ +#define KGDB_SPIN_LOCK(x) if( spinlock_count == 0 || \ + spinlock_cpu != smp_processor_id()){\ + atomic_inc(&spinlock_waiters); \ + while (! spin_trylock(x)) {\ + in_kgdb(&regs);\ + }\ + atomic_dec(&spinlock_waiters); \ + spinlock_count = 1; \ + spinlock_cpu = smp_processor_id(); \ + }else{ \ + spinlock_count++; \ + } +#define KGDB_SPIN_UNLOCK(x) if( --spinlock_count == 0) spin_unlock(x) +#else +unsigned kgdb_spinlock = 0; +#define KGDB_SPIN_LOCK(x) --*x +#define KGDB_SPIN_UNLOCK(x) ++*x +#endif + +int +hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return (ch - 'a' + 10); + if ((ch >= '0') && (ch <= '9')) + return (ch - '0'); + if ((ch >= 'A') && (ch <= 'F')) + return (ch - 'A' + 10); + return (-1); +} + +/* scan for the sequence $<data>#<checksum> */ +void +getpacket(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int i; + int count; + char ch; + + do { + /* wait around for the start character, ignore all other characters */ + while ((ch = (getDebugChar() & 0x7f)) != '$') ; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* now, read until a # or end of buffer is found */ + while (count < BUFMAX) { + ch = getDebugChar() & 
0x7f; + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(getDebugChar() & 0x7f) << 4; + xmitcsum += hex(getDebugChar() & 0x7f); + if ((remote_debug) && (checksum != xmitcsum)) { + printk + ("bad checksum. My count = 0x%x, sent=0x%x. buf=%s\n", + checksum, xmitcsum, buffer); + } + + if (checksum != xmitcsum) + putDebugChar('-'); /* failed checksum */ + else { + putDebugChar('+'); /* successful transfer */ + /* if a sequence char is present, reply the sequence ID */ + if (buffer[2] == ':') { + putDebugChar(buffer[0]); + putDebugChar(buffer[1]); + /* remove sequence chars from buffer */ + count = strlen(buffer); + for (i = 3; i <= count; i++) + buffer[i - 3] = buffer[i]; + } + } + } + } while (checksum != xmitcsum); + + if (remote_debug) + printk("R:%s\n", buffer); + flushDebugChar(); +} + +/* send the packet in buffer. */ + +void +putpacket(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* $#. */ + + if (!kgdboe) { + do { + if (remote_debug) + printk("T:%s\n", buffer); + putDebugChar('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + putDebugChar(ch); + checksum += ch; + count += 1; + } + + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + flushDebugChar(); + + } while ((getDebugChar() & 0x7f) != '+'); + } else { + /* + * For udp, we can not transfer too much bytes once. 
+ * We only transfer MAX_SEND_COUNT size bytes each time + */ + +#define MAX_SEND_COUNT 30 + + int send_count = 0, i = 0; + char send_buf[MAX_SEND_COUNT]; + + do { + if (remote_debug) + printk("T:%s\n", buffer); + putDebugChar('$'); + checksum = 0; + count = 0; + send_count = 0; + while ((ch = buffer[count])) { + if (send_count >= MAX_SEND_COUNT) { + for(i = 0; i < MAX_SEND_COUNT; i++) { + putDebugChar(send_buf[i]); + } + flushDebugChar(); + send_count = 0; + } else { + send_buf[send_count] = ch; + checksum += ch; + count ++; + send_count++; + } + } + for(i = 0; i < send_count; i++) + putDebugChar(send_buf[i]); + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + flushDebugChar(); + } while ((getDebugChar() & 0x7f) != '+'); + } +} + +static char remcomInBuffer[BUFMAX]; +static char remcomOutBuffer[BUFMAX]; +static char lbuf[BUFMAX]; +static short error; + +void +debug_error(char *format, char *parm) +{ + if (remote_debug) + printk(format, parm); +} + +static void +print_regs(struct pt_regs *regs) +{ + printk("RAX=%016lx RBX=%016lx RCX=%016lx\n", + regs->rax, regs->rbx, regs->rcx); + printk("RDX=%016lx RSI=%016lx RDI=%016lx\n", + regs->rdx, regs->rsi, regs->rdi); + printk("RBP=%016lx PS=%016lx PC=%016lx\n", + regs->rbp, regs->eflags, regs->rip); + printk("R8=%016lx R9=%016lx R10=%016lx\n", + regs->r8, regs->r9, regs->r10); + printk("R11=%016lx R12=%016lx R13=%016lx\n", + regs->r11, regs->r12, regs->r13); + printk("R14=%016lx R15=%016lx RSP=%016lx\n", + regs->r14, regs->r15, regs->rsp); +} + +#define NEW_esp fn_call_lookaside[trap_cpu].rsp + +static void +regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + gdb_regs[_RAX] = regs->rax; + gdb_regs[_RBX] = regs->rbx; + gdb_regs[_RCX] = regs->rcx; + gdb_regs[_RDX] = regs->rdx; + gdb_regs[_RSI] = regs->rsi; + gdb_regs[_RDI] = regs->rdi; + gdb_regs[_RBP] = regs->rbp; + gdb_regs[ _PS] = regs->eflags; + gdb_regs[ _PC] = regs->rip; + gdb_regs[ _R8] = 
regs->r8; + gdb_regs[ _R9] = regs->r9; + gdb_regs[_R10] = regs->r10; + gdb_regs[_R11] = regs->r11; + gdb_regs[_R12] = regs->r12; + gdb_regs[_R13] = regs->r13; + gdb_regs[_R14] = regs->r14; + gdb_regs[_R15] = regs->r15; + gdb_regs[_RSP] = regs->rsp; + + /* Note, as we are a debugging the kernel, we will always + * trap in kernel code, this means no priviledge change, + * and so the pt_regs structure is not completely valid. In a non + * privilege change trap, only EFLAGS, CS and EIP are put on the stack, + * SS and ESP are not stacked, this means that the last 2 elements of + * pt_regs is not valid (they would normally refer to the user stack) + * also, using regs+1 is no good because you end up will a value that is + * 2 longs (8) too high. This used to cause stepping over functions + * to fail, so my fix is to use the address of regs->esp, which + * should point at the end of the stack frame. Note I have ignored + * completely exceptions that cause an error code to be stacked, such + * as double fault. Stuart Hughes, Zentropix. + * original code: gdb_regs[_ESP] = (int) (regs + 1) ; + + * this is now done on entry and moved to OLD_esp (as well as NEW_esp). 
+ */ +} + +static void +gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + regs->rax = gdb_regs[_RAX] ; + regs->rbx = gdb_regs[_RBX] ; + regs->rcx = gdb_regs[_RCX] ; + regs->rdx = gdb_regs[_RDX] ; + regs->rsi = gdb_regs[_RSI] ; + regs->rdi = gdb_regs[_RDI] ; + regs->rbp = gdb_regs[_RBP] ; + regs->eflags = gdb_regs[ _PS] ; + regs->rip = gdb_regs[ _PC] ; + regs->r8 = gdb_regs[ _R8] ; + regs->r9 = gdb_regs[ _R9] ; + regs->r10 = gdb_regs[ _R10] ; + regs->r11 = gdb_regs[ _R11] ; + regs->r12 = gdb_regs[ _R12] ; + regs->r13 = gdb_regs[ _R13] ; + regs->r14 = gdb_regs[ _R14] ; + regs->r15 = gdb_regs[ _R15] ; + #if 0 /* can't change these */ + regs->rsp = gdb_regs[_RSP] ; + regs->ss = gdb_regs[ _SS] ; + regs->fs = gdb_regs[_FS]; + regs->gs = gdb_regs[_GS]; +#endif +} /* gdb_regs_to_regs */ + +extern void scheduling_functions_start_here(void); +extern void scheduling_functions_end_here(void); +#define first_sched ((unsigned long) scheduling_functions_start_here) +#define last_sched ((unsigned long) scheduling_functions_end_here) + +int thread_list = 0; +extern void thread_return(void); + +void +get_gdb_regs(struct task_struct *p, struct pt_regs *regs, unsigned long *gdb_regs) +{ + unsigned long **rbp, *rsp, *rsp0, pc; + int count = 0; + IF_SMP(int i); + if (!p || p == current) { + regs_to_gdb_regs(gdb_regs, regs); + return; + } +#ifdef CONFIG_SMP + for (i = 0; i < MAX_NO_CPUS; i++) { + if (p == kgdb_info.cpus_waiting[i].task) { + regs_to_gdb_regs(gdb_regs, + kgdb_info.cpus_waiting[i].regs); + gdb_regs[_RSP] = + (unsigned long)&kgdb_info.cpus_waiting[i].regs->rsp; + + return; + } + } +#endif + memset(gdb_regs, 0, NUMREGBYTES); + rsp = (unsigned long *)p->thread.rsp; + rbp = (unsigned long **)rsp[0]; + rsp += 2; + gdb_regs[_PC] = (unsigned long)thread_return; + gdb_regs[_RBP] = (unsigned long)rbp; + gdb_regs[_RSP] = (unsigned long)rsp; + +/* + * This code is to give a more informative notion of where a process + * is waiting. 
It is used only when the user asks for a thread info + * list. If he then switches to the thread, s/he will find the task + * is in schedule, but a back trace should show the same info we come + * up with. This code was shamelessly purloined from process.c. It was + * then enhanced to provide more registers than simply the program + * counter. + */ + + if (!thread_list) { + return; + } + + if (p->state == TASK_RUNNING) + return; + rsp0 = (unsigned long *)p->thread.rsp0; + if (rsp < (unsigned long *) p->thread_info || rsp > rsp0) + return; + /* include/asm-i386/system.h:switch_to() pushes ebp last. */ + do { + if (*rbp < rsp || *rbp > rsp0) + break; + rbp = (unsigned long **)*rbp; + rsp = (unsigned long *)rbp; + pc = rsp[1]; + + if (pc < first_sched || pc >= last_sched) + break; + gdb_regs[_PC] = (unsigned long)pc; + gdb_regs[_RSP] = (unsigned long)rsp; + gdb_regs[_RBP] = (unsigned long)rbp; + } while (count++ < 16); + return; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* returns nonzero if any memory access fails. */ +int mem2hex( char* mem, char* buf, int count) +{ + int i; + unsigned char ch; + int ret = 0; + + for (i=0;i> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (ret) { + Dearly_printk("mem2hex: fault at accessing %p\n", mem); + } + return(ret); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return nonzero if any memory access fails. */ +int hex2mem( char* buf, char* mem, int count) +{ + int i; + unsigned char ch; + int ret = 0; + + for (i=0;i (OLD_esp - (unsigned int) LOOKASIDE_SIZE))) { + addr = (char *) END_OF_LOOKASIDE - ((char *) OLD_esp - addr); + } + *addr = val; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* return a pointer to the last char put in buf (null) */ +/* If MAY_FAULT is non-zero, then we should set mem_err in response to + a fault; if zero treat a fault like any other fault in the stub. 
*/ +char * +mem2hex(char *mem, char *buf, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + /* printk("%lx = ", mem) ; */ + + ch = get_char(mem++); + + /* printk("%02x\n", ch & 0xFF) ; */ + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault fetching from addr %lx\n", + (long) (mem - 1)); + *buf = 0; /* truncate buffer */ + return (buf); + } + *buf++ = hexchars[ch >> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (may_fault) + mem_err_expected = 0; + return (buf); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return a pointer to the character AFTER the last byte written */ +/* NOTE: We use the may fault flag to also indicate if the write is to + * the registers (0) or "other" memory (!=0) + */ +char * +hex2mem(char *buf, char *mem, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + ch = hex(*buf++) << 4; + ch = ch + hex(*buf++); + set_char(mem++, ch, may_fault); + + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault storing to addr %lx\n", + (long) (mem - 1)); + return (mem); + } + } + if (may_fault) + mem_err_expected = 0; + return (mem); +} +#endif + +/**********************************************/ +/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ +/* RETURN NUMBER OF CHARS PROCESSED */ +/**********************************************/ +int +hexToLong(char **ptr, unsigned long *value) +{ + int numChars = 0; + int hexValue; + + *value = 0; + + while (**ptr) { + hexValue = hex(**ptr); + if (hexValue >= 0) { + *value = (*value << 4) | hexValue; + numChars++; + } else + break; + + (*ptr)++; + } + + return (numChars); +} + +#define stubhex(h) hex(h) +#ifdef old_thread_list + +static int +stub_unpack_int(char *buff, int fieldlength) +{ + int nibble; + int retval = 0; + + while 
(fieldlength) { + nibble = stubhex(*buff++); + retval |= nibble; + fieldlength--; + if (fieldlength) + retval = retval << 4; + } + return retval; +} +#endif +static char * +pack_hex_byte(char *pkt, int byte) +{ + *pkt++ = hexchars[(byte >> 4) & 0xf]; + *pkt++ = hexchars[(byte & 0xf)]; + return pkt; +} + +#define BUF_THREAD_ID_SIZE 16 + +static char * +pack_threadid(char *pkt, threadref * id) +{ + char *limit; + unsigned char *altid; + + altid = (unsigned char *) id; + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *altid++); + return pkt; +} + +#ifdef old_thread_list +static char * +unpack_byte(char *buf, int *value) +{ + *value = stub_unpack_int(buf, 2); + return buf + 2; +} + +static char * +unpack_threadid(char *inbuf, threadref * id) +{ + char *altref; + char *limit = inbuf + BUF_THREAD_ID_SIZE; + int x, y; + + altref = (char *) id; + + while (inbuf < limit) { + x = stubhex(*inbuf++); + y = stubhex(*inbuf++); + *altref++ = (x << 4) | y; + } + return inbuf; +} +#endif +void +int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; + } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} +int +int_to_hex_v(unsigned char * id, int value) +{ + unsigned char *start = id; + int shift; + int ch; + + for (shift = 28; shift >= 0; shift -= 4) { + if ((ch = (value >> shift) & 0xf) || (id != start)) { + *id = hexchars[ch]; + id++; + } + } + if (id == start) + *id++ = '0'; + return id - start; +} +#ifdef old_thread_list + +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + unsigned char *scan; + + scan = (char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} +#endif +static int +cmp_str(char *s1, char *s2, int count) +{ + while (count--) { + if (*s1++ != *s2++) + return 0; + } + return 1; 
+} + +#if 1 /* this is a hold over from 2.4 where O(1) was "sometimes" */ +extern struct task_struct *kgdb_get_idle(int cpu); +#define idle_task(cpu) kgdb_get_idle(cpu) +#else +#define idle_task(cpu) init_tasks[cpu] +#endif + +extern int kgdb_pid_init_done; + +/* + * getthread - translate a gdb thread id into a task pointer. + * + * Ids in [PID_MAX, PID_MAX + MAX_NO_CPUS) stand for the per-cpu idle + * tasks and resolve only while that cpu is online; every other id is + * looked up with find_task_by_pid() once kgdb_pid_init_done is set. + * Returns NULL when the id does not name a task. + */ +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + /* exclusive upper bound: pid - PID_MAX must be a valid cpu number */ + if (pid >= PID_MAX && pid < (PID_MAX + MAX_NO_CPUS)) { + if (!cpu_online(pid - PID_MAX)) + return NULL; + + return idle_task(pid - PID_MAX); + } else { + /* + * find_task_by_pid is relatively safe all the time + * Other pid functions require lock downs which imply + * that we may be interrupting them (as we get here + * in the middle of most any lock down). + * Still we don't want to call until the table exists! + */ + if (kgdb_pid_init_done){ + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } + } + } + return NULL; +} +/* + * Software copy of the four hardware breakpoint slots; correct_hw_break() + * loads it into the debug registers. + */ +/* *INDENT-OFF* */ +struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned long addr; +} breakinfo[4] = { {enabled:0}, + {enabled:0}, + {enabled:0}, + {enabled:0}}; +/* *INDENT-ON* */ +unsigned long hw_breakpoint_status; +/* + * correct_hw_break - bring DR7 and DR0-DR3 in line with breakinfo[]. + * + * A slot that is enabled in breakinfo[] but not in DR7 gets its address + * register loaded and its enable, len and type bits set; a slot that is + * enabled in DR7 but not in breakinfo[] gets those bits cleared. DR7 is + * written back only when something changed. All four slots are scanned. + */ +void +correct_hw_break(void) +{ + int breakno; + int correctit; + int breakbit; + unsigned long dr7; + + asm volatile ("movq %%db7, %0\n":"=r" (dr7) + :); + /* *INDENT-OFF* */ + do { + unsigned long addr0, addr1, addr2, addr3; + asm volatile ("movq %%db0, %0\n" + "movq %%db1, %1\n" + "movq %%db2, %2\n" + "movq %%db3, %3\n" + :"=r" (addr0), "=r"(addr1), + "=r"(addr2), "=r"(addr3) + :); + } while (0); + /* *INDENT-ON* */ + correctit = 0; + /* breakinfo[] has four entries and the switch below has a case 3 */ + for (breakno = 0; breakno < 4; breakno++) { + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + /* UL keeps the shift for slot 3 out of the int sign bit */ + dr7 &= ~(0xf0000UL << (breakno << 2)); + dr7 |= (((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << 16) << + (breakno << 2); + switch (breakno) { + case 0: + asm volatile ("movq %0, %%dr0\n"::"r" + (breakinfo[breakno].addr)); + break; +
+ case 1: + asm volatile ("movq %0, %%dr1\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 2: + asm volatile ("movq %0, %%dr2\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 3: + asm volatile ("movq %0, %%dr3\n"::"r" + (breakinfo[breakno].addr)); + break; + } + } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000UL << (breakno << 2)); + } + } + if (correctit) { + asm volatile ("movq %0, %%db7\n"::"r" (dr7)); + } +} + +/* + * remove_hw_break - mark breakpoint slot breakno as unused. + * Returns 0 on success, -1 if the slot was not in use. Only breakinfo[] + * is changed here. + */ +int +remove_hw_break(unsigned breakno) +{ + if (!breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 0; + return 0; +} + +/* + * set_hw_break - claim breakpoint slot breakno. + * type and len are stored as given and later merged into DR7 by + * correct_hw_break(); addr is unsigned long so that a 64-bit kernel + * address is stored without truncation. + * Returns 0 on success, -1 if the slot is already in use. + */ +int +set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned long addr) +{ + if (breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 1; + breakinfo[breakno].type = type; + breakinfo[breakno].len = len; + breakinfo[breakno].addr = addr; + return 0; +} + +#ifdef CONFIG_SMP +static int in_kgdb_console = 0; + +/* + * in_kgdb - rendezvous point for cpus that do not own the kgdb lock. + * + * Returns 0 when kgdb is disabled, or when kgdb_spinlock is free and this + * cpu is neither logged as being here nor doing console i/o; returns 1 + * otherwise. While another cpu holds kgdb_spinlock this cpu records + * itself in waiting_cpus[] and waits for its waitlock to be released. + */ +int +in_kgdb(struct pt_regs *regs) +{ + unsigned long flags; + int cpu; + if (!kgdb_enabled) + return 0; + cpu = smp_processor_id(); + in_kgdb_called = 1; + if (!spin_is_locked(&kgdb_spinlock)) { + if (in_kgdb_here_log[cpu] || /* we are holding this cpu */ + in_kgdb_console) { /* or we are doing slow i/o */ + return 1; + } + return 0; + } + + /* As I see it the only reason not to let all cpus spin on + * the same spin_lock is to allow selected ones to proceed. + * This would be a good thing, so we leave it this way. + * Maybe someday.... Done ! + + * in_kgdb() is called from an NMI so we don't pretend + * to have any resources, like printk() for example. + */ + + local_irq_save(flags); /* only local here, to avoid hanging */ + /* + * log arrival of this cpu + * The NMI keeps on ticking.
Protect against recurring more + * than once, and ignore the cpu that has the kgdb lock + */ + in_kgdb_entry_log[cpu]++; + in_kgdb_here_log[cpu] = regs; + if (cpu == spinlock_cpu || waiting_cpus[cpu].task) + goto exit_in_kgdb; + + /* + * For protection of the initialization of the spin locks by kgdb + * it locks the kgdb spinlock before it gets the wait locks set + * up. We wait here for the wait lock to be taken. If the + * kgdb lock goes away first?? Well, it could be a slow exit + * sequence where the wait lock is removed prior to the kgdb lock + * so if kgdb gets unlocked, we just exit. + */ + + while (spin_is_locked(&kgdb_spinlock) && + !spin_is_locked(waitlocks + cpu)) ; + if (!spin_is_locked(&kgdb_spinlock)) + goto exit_in_kgdb; + + waiting_cpus[cpu].task = current; + waiting_cpus[cpu].pid = (current->pid) ? : (PID_MAX + cpu); + waiting_cpus[cpu].regs = regs; + + spin_unlock_wait(waitlocks + cpu); + + /* + * log departure of this cpu + */ + waiting_cpus[cpu].task = 0; + waiting_cpus[cpu].pid = 0; + waiting_cpus[cpu].regs = 0; + correct_hw_break(); + exit_in_kgdb: + in_kgdb_here_log[cpu] = 0; + local_irq_restore(flags); + return 1; + /* + spin_unlock(continuelocks + smp_processor_id()); + */ +} + +/* Interrupt entry: acknowledge the APIC interrupt, then join the rendezvous. */ +void +smp__in_kgdb(struct pt_regs regs) +{ + ack_APIC_irq(); + in_kgdb(&regs); +} +#else +int +in_kgdb(struct pt_regs *regs) +{ + return (kgdb_spinlock); +} +#endif + +/* + * printexceptioninfo - write a short description of the trap into buffer. + * Vector 3 is reported as a software breakpoint; vector 1 is decoded + * from DR6 into single step or the number of the hardware breakpoint + * that fired; any other vector yields "Details not available". + * errorcode is not used. + */ +void +printexceptioninfo(int exceptionNo, int errorcode, char *buffer) +{ + unsigned long dr6; + int i; + switch (exceptionNo) { + case 1: /* debug exception */ + break; + case 3: /* breakpoint */ + sprintf(buffer, "Software breakpoint"); + return; + default: + sprintf(buffer, "Details not available"); + return; + } + asm volatile ("movq %%db6, %0\n":"=r" (dr6) + :); + if (dr6 & 0x4000) { + sprintf(buffer, "Single step"); + return; + } + for (i = 0; i < 4; ++i) { + if (dr6 & (1 << i)) { + sprintf(buffer, "Hardware breakpoint %d", i); + return; + } + } + sprintf(buffer, "Unknown trap"); + return; +} + +/* +
* The ThreadExtraInfo query allows us to pass an arbitrary string + * for display with the "info threads" command. + * + * print_extra_info formats, for task p, its address, parent pid, cpu, + * a one-letter state code and its command name into buf; a NULL p + * yields "Invalid thread". + */ + +void +print_extra_info(task_t *p, char *buf) +{ + if (!p) { + sprintf(buf, "Invalid thread"); + return; + } + sprintf(buf, "0x%p %8d %4d %c %s", + (void *)p, p->parent->pid, + task_cpu(p), + (p->state == 0) ? (task_curr(p)?'R':'r') : + (p->state < 0) ? 'U' : + (p->state & TASK_UNINTERRUPTIBLE) ? 'D' : + (p->state & TASK_STOPPED || p->ptrace & PT_PTRACED) ? 'T' : + (p->state & (TASK_ZOMBIE | TASK_DEAD)) ? 'Z' : + (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?', + p->comm); +} + +/* + * This function does all command processing for interfacing to gdb. + * + * NOTE: The INT nn instruction leaves the state of the interrupt + * enable flag UNCHANGED. That means that when this routine + * is entered via a breakpoint (INT 3) instruction from code + * that has interrupts enabled, then interrupts will STILL BE + * enabled when this routine is entered. The first thing that + * we do here is disable interrupts so as to prevent recursive + * entries and bothersome serial interrupts while we are + * trying to run the serial port in polled mode. + * + * For kernel version 2.1.xx the kgdb_cli() actually gets a spin lock so + * it is always necessary to do a restore_flags before returning + * so as to let go of that lock.
+ */ +int +kgdb_handle_exception(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs) +{ + struct task_struct *usethread = NULL; + struct task_struct *thread_list_start = 0, *thread = NULL; + struct task_struct *p; + unsigned long addr, length; + unsigned long breakno, breaktype; + char *ptr; + unsigned long newPC; + threadref thref; + unsigned long threadid, tmpid; + int thread_min = PID_MAX + MAX_NO_CPUS; +#ifdef old_thread_list + int maxthreads; +#endif + int nothreads; + unsigned long flags; + unsigned long gdb_regs[NUMREGS]; + unsigned long dr6; + IF_SMP(int entry_state = 0); /* 0, ok, 1, no nmi, 2 sync failed */ +#define NO_NMI 1 +#define NO_SYNC 2 +#define regs (*linux_regs) + /* + * If the entry is not from the kernel then return to the Linux + * trap handler and let it process the interrupt normally. + */ + if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->cs)) { + printk("ignoring non-kernel exception\n"); + print_regs(®s); + return (0); + } + /* + * If we're using eth mode, set the 'mode' in the netdevice. + */ + + if (kgdboe) + netpoll_set_trap(1); + + local_irq_save(flags); + + /* Get kgdb spinlock */ + + KGDB_SPIN_LOCK(&kgdb_spinlock); + rdtscll(kgdb_info.entry_tsc); + /* + * We depend on this spinlock and the NMI watch dog to control the + * other cpus. They will arrive at "in_kgdb()" as a result of the + * NMI and will wait there for the following spin locks to be + * released. + */ +#ifdef CONFIG_SMP + +#if 0 + if (cpu_callout_map & ~MAX_CPU_MASK) { + printk("kgdb : too many cpus, possibly not mapped" + " in contiguous space, change MAX_NO_CPUS" + " in kgdb_stub and make new kernel.\n" + " cpu_callout_map is %lx\n", cpu_callout_map); + goto exit_just_unlock; + } +#endif + if (spinlock_count == 1) { + int time, end_time, dum; + int i; + int cpu_logged_in[MAX_NO_CPUS] = {[0 ... 
MAX_NO_CPUS - 1] = (0) + }; + if (remote_debug) { + printk("kgdb : cpu %d entry, syncing others\n", + smp_processor_id()); + } + for (i = 0; i < MAX_NO_CPUS; i++) { + /* + * Use trylock as we may already hold the lock if + * we are holding the cpu. Net result is all + * locked. + */ + spin_trylock(&waitlocks[i]); + } + for (i = 0; i < MAX_NO_CPUS; i++) + cpu_logged_in[i] = 0; + /* + * Wait for their arrival. We know the watch dog is active if + * in_kgdb() has ever been called, as it is always called on a + * watchdog tick. + */ + rdtsc(dum, time); + end_time = time + 2; /* Note: we use the High order bits! */ + i = 1; + if (num_online_cpus() > 1) { + int me_in_kgdb = in_kgdb_entry_log[smp_processor_id()]; + smp_send_nmi_allbutself(); + + while (i < num_online_cpus() && time != end_time) { + int j; + for (j = 0; j < MAX_NO_CPUS; j++) { + if (waiting_cpus[j].task && + waiting_cpus[j].task != NOCPU && + !cpu_logged_in[j]) { + i++; + cpu_logged_in[j] = 1; + if (remote_debug) { + printk + ("kgdb : cpu %d arrived at kgdb\n", + j); + } + break; + } else if (!waiting_cpus[j].task && + !cpu_online(j)) { + waiting_cpus[j].task = NOCPU; + cpu_logged_in[j] = 1; + waiting_cpus[j].hold = 1; + break; + } + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + + int wait = 100000; + while (wait--) ; + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + printk + ("kgdb : cpu %d stall" + " in in_kgdb\n", + j); + i++; + cpu_logged_in[j] = 1; + waiting_cpus[j].task = + (struct task_struct + *) 1; + } + } + } + + if (in_kgdb_entry_log[smp_processor_id()] > + (me_in_kgdb + 10)) { + break; + } + + rdtsc(dum, time); + } + if (i < num_online_cpus()) { + printk + ("kgdb : time out, proceeding without sync\n"); +#if 0 + printk("kgdb : Waiting_cpus: 0 = %d, 1 = %d\n", + waiting_cpus[0].task != 0, + waiting_cpus[1].task != 0); + printk("kgdb : Cpu_logged in: 0 = %d, 1 = %d\n", + cpu_logged_in[0], cpu_logged_in[1]); + printk + ("kgdb : in_kgdb_here_log in: 0 = %d, 1 = %d\n", + 
in_kgdb_here_log[0] != 0, + in_kgdb_here_log[1] != 0); +#endif + entry_state = NO_SYNC; + } else { +#if 0 + int ent = + in_kgdb_entry_log[smp_processor_id()] - + me_in_kgdb; + printk("kgdb : sync after %d entries\n", ent); +#endif + } + } else { + if (remote_debug) { + printk + ("kgdb : %d cpus, but watchdog not active\n" + "proceeding without locking down other cpus\n", + num_online_cpus()); + entry_state = NO_NMI; + } + } + } +#endif + + if (remote_debug) { + unsigned long *lp = (unsigned long *) &linux_regs; + + printk("handle_exception(exceptionVector=%d, " + "signo=%d, err_code=%d, linux_regs=%p)\n", + exceptionVector, signo, err_code, linux_regs); + if (debug_regs) { + print_regs(®s); + printk("Stk: %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[0], lp[1], lp[2], lp[3], + lp[4], lp[5], lp[6], lp[7]); + printk(" %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[8], lp[9], lp[10], lp[11], + lp[12], lp[13], lp[14], lp[15]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[16], lp[17], lp[18], lp[19], + lp[20], lp[21], lp[22], lp[23]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[24], lp[25], lp[26], lp[27], + lp[28], lp[29], lp[30], lp[31]); + } + } + + /* Disable hardware debugging while we are in kgdb */ + /* Get the debug register status register */ +/* *INDENT-OFF* */ + __asm__("movq %0,%%db7" + : /* no output */ + :"r"(0UL)); + + asm volatile ("movq %%db6, %0\n" + :"=r" (hw_breakpoint_status) + :); + +#if 0 +/* *INDENT-ON* */ + switch (exceptionVector) { + case 0: /* divide error */ + case 1: /* debug exception */ + case 2: /* NMI */ + case 3: /* breakpoint */ + case 4: /* overflow */ + case 5: /* bounds check */ + case 6: /* invalid opcode */ + case 7: /* device not available */ + case 8: /* double fault (errcode) */ + case 10: /* invalid TSS (errcode) */ + case 12: /* stack fault (errcode) */ + case 16: /* floating point error */ + case 17: /* alignment check (errcode) */ + default: /* any undocumented */ + 
break; + case 11: /* segment not present (errcode) */ + case 13: /* general protection (errcode) */ + case 14: /* page fault (special errcode) */ + case 19: /* cache flush denied */ + if (mem_err_expected) { + /* + * This fault occured because of the + * get_char or set_char routines. These + * two routines use either eax of edx to + * indirectly reference the location in + * memory that they are working with. + * For a page fault, when we return the + * instruction will be retried, so we + * have to make sure that these + * registers point to valid memory. + */ + mem_err = 1; /* set mem error flag */ + mem_err_expected = 0; + mem_err_cnt++; /* helps in debugging */ + /* make valid address */ + regs.eax = (long) &garbage_loc; + /* make valid address */ + regs.edx = (long) &garbage_loc; + if (remote_debug) + printk("Return after memory error: " + "mem_err_cnt=%d\n", mem_err_cnt); + if (debug_regs) + print_regs(®s); + goto exit_kgdb; + } + break; + } +#endif + if (remote_debug) + printk("kgdb : entered kgdb on cpu %d\n", smp_processor_id()); + + gdb_i386vector = exceptionVector; + gdb_i386errcode = err_code; + kgdb_info.called_from = __builtin_return_address(0); +#ifdef CONFIG_SMP + /* + * OK, we can now communicate, lets tell gdb about the sync. + * but only if we had a problem. + */ + switch (entry_state) { + case NO_NMI: + to_gdb("NMI not active, other cpus not stopped\n"); + break; + case NO_SYNC: + to_gdb("Some cpus not stopped, see 'kgdb_info' for details\n"); + default:; + } + +#endif +/* + * Set up the gdb function call area. 
+ */ + trap_cpu = smp_processor_id(); + OLD_esp = NEW_esp = (unsigned long) (&linux_regs->rsp); + + IF_SMP(once_again:) + /* reply to host that an exception has occurred */ + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + + putpacket(remcomOutBuffer); + + while (1 == 1) { + error = 0; + remcomOutBuffer[0] = 0; + getpacket(remcomInBuffer); + switch (remcomInBuffer[0]) { + case '?': + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + break; + case 'd': + remote_debug = !(remote_debug); /* toggle debug flag */ + printk("Remote debug %s\n", + remote_debug ? "on" : "off"); + break; + case 'g': /* return the value of the CPU registers */ + get_gdb_regs(usethread, ®s, gdb_regs); + mem2hex((char *) gdb_regs, + remcomOutBuffer, NUMREGBYTES); + break; + case 'G': /* set the value of the CPU registers - return OK */ + hex2mem(&remcomInBuffer[1], + (char *) gdb_regs, NUMREGBYTES); + if (!usethread || usethread == current) { + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "E00"); + } + break; + + case 'P':{ /* set the value of a single CPU register - + return OK */ + /* + * For some reason, gdb wants to talk about psudo + * registers (greater than 15). + */ + unsigned long regno; + + ptr = &remcomInBuffer[1]; + regs_to_gdb_regs(gdb_regs, ®s); + if ((!usethread || usethread == current) && + hexToLong(&ptr, ®no) && + *ptr++ == '=' && (regno >= 0)) { + if (regno >= NUMREGS) + break; + hex2mem(ptr, (char *) &gdb_regs[regno], + 8); + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + break; + } + strcpy(remcomOutBuffer, "E01"); + break; + } + + /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + case 'm': + /* TRY TO READ %x,%x. 
IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToLong(&ptr, &addr) && + (*(ptr++) == ',') && (hexToLong(&ptr, &length))) { + ptr = 0; + /* + * hex doubles the byte count + */ + if (length > (BUFMAX / 2)) + length = BUFMAX / 2; + if (mem2hex((char *) addr, + remcomOutBuffer, length)) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } + } + + if (ptr) { + strcpy(remcomOutBuffer, "E01"); + debug_error + ("malformed read memory command: %s\n", + remcomInBuffer); + } + break; + + /* MAA..AA,LLLL: + Write LLLL bytes at address AA.AA return OK */ + case 'M': + /* TRY TO READ '%x,%x:'. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToLong(&ptr, &addr) && + (*(ptr++) == ',') && + (hexToLong(&ptr, &length)) && (*(ptr++) == ':')) { + if (hex2mem(ptr, (char *) addr, length)) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } else { + strcpy(remcomOutBuffer, "OK"); + } + + ptr = 0; + } + if (ptr) { + strcpy(remcomOutBuffer, "E02"); + debug_error + ("malformed write memory command: %s\n", + remcomInBuffer); + } + break; + case 'S': + remcomInBuffer[0] = 's'; + case 'C': + /* Csig;AA..AA where ;AA..AA is optional + * continue with signal + * Since signals are meaning less to us, delete that + * part and then fall into the 'c' code. 
+ */ + ptr = &remcomInBuffer[1]; + length = 2; + while (*ptr && *ptr != ';') { + length++; + ptr++; + } + if (*ptr) { + do { + ptr++; + *(ptr - length++) = *ptr; + } while (*ptr); + } else { + remcomInBuffer[1] = 0; + } + + /* cAA..AA Continue at address AA..AA(optional) */ + /* sAA..AA Step one instruction from AA..AA(optional) */ + /* D detach, reply OK and then continue */ + case 'c': + case 's': + case 'D': + + /* try to read optional parameter, + pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (hexToLong(&ptr, &addr)) { + if (remote_debug) + printk("Changing EIP to 0x%lx\n", addr); + + regs.rip = addr; + } + + newPC = regs.rip; + + /* clear the trace bit */ + regs.eflags &= 0xfffffeff; + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') + regs.eflags |= 0x100; + + /* detach is a friendly version of continue. Note that + debugging is still enabled (e.g hit control C) + */ + if (remcomInBuffer[0] == 'D') { + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + } + + if (remote_debug) { + printk("Resuming execution\n"); + print_regs(®s); + } + asm volatile ("movq %%db6, %0\n":"=r" (dr6) + :); + if (!(dr6 & 0x4000)) { + for (breakno = 0; breakno < 4; ++breakno) { + if (dr6 & (1 << breakno) && + (breakinfo[breakno].type == 0)) { + /* Set restore flag */ + regs.eflags |= 0x10000; + break; + } + } + } + + if (kgdboe) + netpoll_set_trap(0); + + correct_hw_break(); + asm volatile ("movq %0, %%db6\n"::"r" (0UL)); + goto exit_kgdb; + + /* kill the program */ + case 'k': /* do nothing */ + break; + + /* query */ + case 'q': + nothreads = 0; + switch (remcomInBuffer[1]) { + case 'f': + threadid = 1; + thread_list = 2; + thread_list_start = (usethread ? 
: current); + case 's': + if (!cmp_str(&remcomInBuffer[2], + "ThreadInfo", 10)) + break; + + remcomOutBuffer[nothreads++] = 'm'; + for (; threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + nothreads += int_to_hex_v( + &remcomOutBuffer[ + nothreads], + threadid); + if (thread_min > threadid) + thread_min = threadid; + remcomOutBuffer[ + nothreads] = ','; + nothreads++; + if (nothreads > BUFMAX - 10) + break; + } + } + if (remcomOutBuffer[nothreads - 1] == 'm') { + remcomOutBuffer[nothreads - 1] = 'l'; + } else { + nothreads--; + } + remcomOutBuffer[nothreads] = 0; + break; + +#ifdef old_thread_list /* Old thread info request */ + case 'L': + /* List threads */ + thread_list = 2; + thread_list_start = (usethread ? : current); + unpack_byte(remcomInBuffer + 3, &maxthreads); + unpack_threadid(remcomInBuffer + 5, &thref); + do { + int buf_thread_limit = + (BUFMAX - 22) / BUF_THREAD_ID_SIZE; + if (maxthreads > buf_thread_limit) { + maxthreads = buf_thread_limit; + } + } while (0); + remcomOutBuffer[0] = 'q'; + remcomOutBuffer[1] = 'M'; + remcomOutBuffer[4] = '0'; + pack_threadid(remcomOutBuffer + 5, &thref); + + /* If start flag set start at 0. 
*/ + if (remcomInBuffer[2] == '1') + threadid = 0; + else + threadid = threadref_to_int(&thref); + for (nothreads = 0; + nothreads < maxthreads && + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + int_to_threadref(&thref, + threadid); + pack_threadid(remcomOutBuffer + + 21 + + nothreads * 16, + &thref); + nothreads++; + if (thread_min > threadid) + thread_min = threadid; + } + } + + if (threadid == PID_MAX + MAX_NO_CPUS) { + remcomOutBuffer[4] = '1'; + } + pack_hex_byte(remcomOutBuffer + 2, nothreads); + remcomOutBuffer[21 + nothreads * 16] = '\0'; + break; +#endif + case 'C': + /* Current thread id */ + remcomOutBuffer[0] = 'Q'; + remcomOutBuffer[1] = 'C'; + threadid = current->pid; + if (!threadid) { + /* + * idle thread + */ + for (threadid = PID_MAX; + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + if (current == + idle_task(threadid - + PID_MAX)) + break; + } + } + int_to_threadref(&thref, threadid); + pack_threadid(remcomOutBuffer + 2, &thref); + remcomOutBuffer[18] = '\0'; + break; + + case 'E': + /* Print exception info */ + printexceptioninfo(exceptionVector, + err_code, remcomOutBuffer); + break; + case 'T': + ptr = &remcomInBuffer[0]; + if (strncmp(ptr, "qThreadExtraInfo,", + strlen("qThreadExtraInfo,")) == 0) { + ptr += strlen("qThreadExtraInfo,"); + hexToLong(&ptr, &tmpid); + p = getthread(tmpid); + print_extra_info(p, lbuf); + mem2hex(lbuf, remcomOutBuffer, + strlen(lbuf)); + } + break; +#if 0 + case 'T':{ + char * nptr; + /* Thread extra info */ + if (!cmp_str(&remcomInBuffer[2], + "hreadExtraInfo,", 15)) { + break; + } + ptr = &remcomInBuffer[17]; + hexToLong(&ptr, &threadid); + thread = getthread(threadid); + nptr = &thread->comm[0]; + length = 0; + ptr = &remcomOutBuffer[0]; + do { + length++; + ptr = pack_hex_byte(ptr, *nptr++); + } while (*nptr && length < 16); + /* + * would like that 16 to be the size of + * task_struct.comm but don't know the + * syntax.. 
+ */ + *ptr = 0; + } +#endif + } + break; + + /* task related */ + case 'H': + switch (remcomInBuffer[1]) { + case 'g': + ptr = &remcomInBuffer[2]; + hexToLong(&ptr, &threadid); + thread = getthread(threadid); + if (!thread) { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + break; + } + /* + * Just in case I forget what this is all about, + * the "thread info" command to gdb causes it + * to ask for a thread list. It then switches + * to each thread and asks for the registers. + * For this (and only this) usage, we want to + * fudge the registers of tasks not on the run + * list (i.e. waiting) to show the routine that + * called schedule. Also, gdb, is a minimalist + * in that if the current thread is the last + * it will not re-read the info when done. + * This means that in this case we must show + * the real registers. So here is how we do it: + * Each entry we keep track of the min + * thread in the list (the last that gdb will) + * get info for. We also keep track of the + * starting thread. + * "thread_list" is cleared when switching back + * to the min thread if it is was current, or + * if it was not current, thread_list is set + * to 1. When the switch to current comes, + * if thread_list is 1, clear it, else do + * nothing. 
+ */ + usethread = thread; + if ((thread_list == 1) && + (thread == thread_list_start)) { + thread_list = 0; + } + if (thread_list && (threadid == thread_min)) { + if (thread == thread_list_start) { + thread_list = 0; + } else { + thread_list = 1; + } + } + /* follow through */ + case 'c': + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + break; + } + break; + + /* Query thread status */ + case 'T': + ptr = &remcomInBuffer[1]; + hexToLong(&ptr, &threadid); + thread = getthread(threadid); + if (thread) { + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + if (thread_min > threadid) + thread_min = threadid; + } else { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + } + break; + + case 'Y': /* set up a hardware breakpoint */ + ptr = &remcomInBuffer[1]; + hexToLong(&ptr, &breakno); + ptr++; + hexToLong(&ptr, &breaktype); + ptr++; + hexToLong(&ptr, &length); + ptr++; + hexToLong(&ptr, &addr); + if (set_hw_break(breakno & 0x3, + breaktype & 0x3, + length & 0x3, addr) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + /* Remove hardware breakpoint */ + case 'y': + ptr = &remcomInBuffer[1]; + hexToLong(&ptr, &breakno); + if (remove_hw_break(breakno & 0x3) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + case 'r': /* reboot */ + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + /*to_gdb("Rebooting\n"); */ + /* triplefault no return from here */ + { + static long no_idt[2]; + __asm__ __volatile__("lidt %0"::"m"(no_idt[0])); + BREAKPOINT; + } + + } /* switch */ + + /* reply to the request */ + putpacket(remcomOutBuffer); + } /* while(1==1) */ + /* + * reached by goto only. + */ + exit_kgdb: + /* + * Here is where we set up to trap a gdb function call. NEW_esp + * will be changed if we are trying to do this. 
We handle both + * adding and subtracting, thus allowing gdb to put grung on + * the stack which it removes later. + */ + if (NEW_esp != OLD_esp) { + unsigned long *ptr = END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) + ptr -= (OLD_esp - NEW_esp) / sizeof (unsigned long); + *--ptr = linux_regs->eflags; + *--ptr = linux_regs->cs; + *--ptr = linux_regs->rip; + *--ptr = linux_regs->rcx; + *--ptr = linux_regs->rbx; + *--ptr = linux_regs->rax; + linux_regs->rcx = NEW_esp - (sizeof (unsigned long) * 6); + linux_regs->rbx = (unsigned long) END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) { + linux_regs->rip = (unsigned long) fn_call_stub; + } else { + linux_regs->rip = (unsigned long) fn_rtn_stub; + linux_regs->rax = NEW_esp; + } + linux_regs->eflags &= ~(IF_BIT | TF_BIT); + } +#ifdef CONFIG_SMP + /* + * Release gdb wait locks + * Sanity check time. Must have at least one cpu to run. Also single + * step must not be done if the current cpu is on hold. + */ + if (spinlock_count == 1) { + int ss_hold = (regs.eflags & 0x100) && kgdb_info.hold_on_sstep; + int cpu_avail = 0; + int i; + + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!cpu_online(i)) + break; + if (!hold_cpu(i)) { + cpu_avail = 1; + } + } + /* + * Early in the bring up there will be NO cpus on line... + */ + if (!cpu_avail && !cpus_empty(cpu_online_map)) { + to_gdb("No cpus unblocked, see 'kgdb_info.hold_cpu'\n"); + goto once_again; + } + if (hold_cpu(smp_processor_id()) && (regs.eflags & 0x100)) { + to_gdb + ("Current cpu must be unblocked to single step\n"); + goto once_again; + } + if (!(ss_hold)) { + int i; + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!hold_cpu(i)) { + spin_unlock(&waitlocks[i]); + } + } + } else { + spin_unlock(&waitlocks[smp_processor_id()]); + } + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + /* + * If this cpu is on hold, this is where we + * do it. Note, the NMI will pull us out of here, + * but will return as the above lock is not held. 
+ * We will stay here till another cpu releases the lock for us. + */ + spin_unlock_wait(waitlocks + smp_processor_id()); + local_irq_restore(flags); + return (1); + } +#if 0 +exit_just_unlock: +#endif +#endif + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + local_irq_restore(flags); + return (1); +} + +#undef regs +static int kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) +{ + struct die_args *d = ptr; + + if (!kgdb_enabled || (cmd == DIE_DEBUG && user_mode(d->regs))) + return NOTIFY_DONE; + if (cmd == DIE_NMI_IPI) { + if (in_kgdb(d->regs)) + return NOTIFY_BAD; + } else if (kgdb_handle_exception(d->trapnr, d->signr, d->err, d->regs)) + return NOTIFY_BAD; /* skip */ + + return NOTIFY_DONE; +} + +static struct notifier_block kgdb_notifier = { + .notifier_call = kgdb_notify, + .priority = 0, +}; + +void set_debug_traps(void) +{ + static int initialized = 0; + + if (!initialized) { + initialized = 1; + notifier_chain_register(&die_chain, &kgdb_notifier); + } +} + +/* + * Provide the command line "gdb" initial break + */ +int __init kgdb_initial_break(char * str) +{ + if (*str == '\0'){ + breakpoint(); + return 1; + } + return 0; +} +__setup("gdb",kgdb_initial_break); + +/* This function will generate a breakpoint exception. It is used at the + beginning of a program to sync up with a debugger and can be used + otherwise as a quick means to stop program execution and "break" into + the debugger. */ +/* But really, just use the BREAKPOINT macro. We will handle the int stuff + */ + +void breakpoint(void) +{ + + set_debug_traps(); + kgdb_enabled = 1; +#if 0 + /* + * These calls were not enough to allow breakpoint to be + * called before trap_init(). I moved the argument parsing + * after trap_init() and it seems to work. 
+ */ + set_intr_usr_gate(3,&int3); /* disable ints on trap */ + set_intr_gate(1,&debug); + set_intr_gate(14,&page_fault); +#endif + + BREAKPOINT; +} + +#ifdef later +/* + * possibly we should not go thru the traps.c code at all? Someday. + */ +void +do_kgdb_int3(struct pt_regs *regs, long error_code) +{ + kgdb_handle_exception(3, 5, error_code, regs); + return; +} +#endif +#undef regs +#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS +asmlinkage void +bad_sys_call_exit(int stuff) +{ + struct pt_regs *regs = (struct pt_regs *) &stuff; + printk("Sys call %d return with %x preempt_count\n", + (int) regs->orig_eax, preempt_count()); +} +#endif +#ifdef CONFIG_STACK_OVERFLOW_TEST +#include +asmlinkage void +stack_overflow(void) +{ +#ifdef BREAKPOINT + BREAKPOINT; +#else + printk("Kernel stack overflow, looping forever\n"); +#endif + while (1) { + } +} +#endif + +#if defined(CONFIG_SMP) || defined(CONFIG_KGDB_CONSOLE) +char gdbconbuf[BUFMAX]; + +/* + * kgdb_gdb_message - send count bytes of s to gdb as console output. + * + * The text is hex encoded into 'O' packets built in gdbconbuf. Input + * that does not fit into one packet is sent in pieces of at most + * (BUFMAX - 2) / 2 bytes, each piece starting again right after the 'O'. + * in_kgdb_console is set while the packets are being sent. + */ +static void +kgdb_gdb_message(const char *s, unsigned count) +{ + int i; + int wcount; + char *bufptr; + /* + * This takes care of NMI while spinning out chars to gdb + */ + IF_SMP(in_kgdb_console = 1); + gdbconbuf[0] = 'O'; + while (count > 0) { + /* rewind for every packet so a later chunk cannot run past gdbconbuf */ + bufptr = gdbconbuf + 1; + if ((count << 1) > (BUFMAX - 2)) { + wcount = (BUFMAX - 2) >> 1; + } else { + wcount = count; + } + count -= wcount; + for (i = 0; i < wcount; i++) { + bufptr = pack_hex_byte(bufptr, s[i]); + } + *bufptr = '\0'; + s += wcount; + + putpacket(gdbconbuf); + + } + IF_SMP(in_kgdb_console = 0); +} +#endif +#ifdef CONFIG_SMP +/* to_gdb - send the NUL terminated string s to gdb as console output. */ +static void +to_gdb(const char *s) +{ + int count = 0; + while (s[count] && (count++ < BUFMAX)) ; + kgdb_gdb_message(s, count); +} +#endif +#ifdef CONFIG_KGDB_CONSOLE +#include +#include +#include +#include + +/* + * Console write hook: forwards count bytes of s to gdb. If gdb has not + * been contacted yet (gdb_i386vector is still -1) it first breaks into + * the debugger. + */ +void +kgdb_console_write(struct console *co, const char *s, unsigned count) +{ + + if (gdb_i386vector == -1) { + /* + * We have not yet talked to gdb. What to do... + * lets break, on continue we can do the write. + * But first tell him what's up.
Uh, well no can do, + * as this IS the console. Oh well... + * We do need to wait or the messages will be lost. + * Other option would be to tell the above code to + * ignore this breakpoint and do an auto return, + * but that might confuse gdb. Also this happens + * early enough in boot up that we don't have the traps + * set up yet, so... + */ + breakpoint(); + } + kgdb_gdb_message(s, count); +} + +/* + * ------------------------------------------------------------ + * Serial KGDB driver + * ------------------------------------------------------------ + */ + +static struct console kgdbcons = { + name:"kgdb", + write:kgdb_console_write, +#ifdef CONFIG_KGDB_USER_CONSOLE + device:kgdb_console_device, +#endif + flags:CON_PRINTBUFFER | CON_ENABLED, + index:-1, +}; + +/* + * The trick here is that this file gets linked before printk.o + * That means we get to peer at the console info in the command + * line before it does. If we are up, we register, otherwise, + * do nothing. By returning 0, we allow printk to look also. + */ +static int kgdb_console_enabled; + +int __init +kgdb_console_init(char *str) +{ + if ((strncmp(str, "kgdb", 4) == 0) || (strncmp(str, "gdb", 3) == 0)) { + register_console(&kgdbcons); + kgdb_console_enabled = 1; + } + return 0; /* let others look at the string */ +} + +__setup("console=", kgdb_console_init); + +#ifdef CONFIG_KGDB_USER_CONSOLE +static kdev_t kgdb_console_device(struct console *c); +/* This stuff sort of works, but it knocks out telnet devices + * we are leaving it here in case we (or you) find time to figure it out + * better.. + */ + +/* + * We need a real char device as well for when the console is opened for user + * space activities.
+ */ + +static int +kgdb_consdev_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static ssize_t +kgdb_consdev_write(struct file *file, const char *buf, + size_t count, loff_t * ppos) +{ + int size, ret = 0; + static char kbuf[128]; + static DECLARE_MUTEX(sem); + + /* We are not reentrant... */ + if (down_interruptible(&sem)) + return -ERESTARTSYS; + + while (count > 0) { + /* need to copy the data from user space */ + size = count; + if (size > sizeof (kbuf)) + size = sizeof (kbuf); + if (copy_from_user(kbuf, buf, size)) { + ret = -EFAULT; + break;; + } + kgdb_console_write(&kgdbcons, kbuf, size); + count -= size; + ret += size; + buf += size; + } + + up(&sem); + + return ret; +} + +struct file_operations kgdb_consdev_fops = { + open:kgdb_consdev_open, + write:kgdb_consdev_write +}; +static kdev_t +kgdb_console_device(struct console *c) +{ + return MKDEV(TTYAUX_MAJOR, 1); +} + +/* + * This routine gets called from the serial stub in the i386/lib + * This is so it is done late in bring up (just before the console open). 
+ */ +void +kgdb_console_finit(void) +{ + if (kgdb_console_enabled) { + char *cptr = cdevname(MKDEV(TTYAUX_MAJOR, 1)); + char *cp = cptr; + while (*cptr && *cptr != '(') + cptr++; + *cptr = 0; + unregister_chrdev(TTYAUX_MAJOR, cp); + register_chrdev(TTYAUX_MAJOR, "kgdb", &kgdb_consdev_fops); + } +} +#endif +#endif +#ifdef CONFIG_KGDB_TS +#include /* time stamp code */ +#include /* in_interrupt */ +#ifdef CONFIG_KGDB_TS_64 +#define DATA_POINTS 64 +#endif +#ifdef CONFIG_KGDB_TS_128 +#define DATA_POINTS 128 +#endif +#ifdef CONFIG_KGDB_TS_256 +#define DATA_POINTS 256 +#endif +#ifdef CONFIG_KGDB_TS_512 +#define DATA_POINTS 512 +#endif +#ifdef CONFIG_KGDB_TS_1024 +#define DATA_POINTS 1024 +#endif +#ifndef DATA_POINTS +#define DATA_POINTS 128 /* must be a power of two */ +#endif +#define INDEX_MASK (DATA_POINTS - 1) +#if (INDEX_MASK & DATA_POINTS) +#error "CONFIG_KGDB_TS_COUNT must be a power of 2" +#endif +struct kgdb_and_then_struct { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + int data0; + int data1; +}; +struct kgdb_and_then_struct2 { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + struct task_struct *t1; + struct task_struct *t2; +}; +struct kgdb_and_then_struct kgdb_data[DATA_POINTS]; + +struct kgdb_and_then_struct *kgdb_and_then = &kgdb_data[0]; +int kgdb_and_then_count; + +void +kgdb_tstamp(int line, char *source, int data0, int data1) +{ + static spinlock_t ts_spin = SPIN_LOCK_UNLOCKED; + unsigned long flags; /* local_irq_save() stores the full flags word; int is too narrow on x86_64 */ + local_irq_save(flags); + spin_lock(&ts_spin); + rdtscll(kgdb_and_then->at_time); +#ifdef CONFIG_SMP + kgdb_and_then->on_cpu = smp_processor_id(); +#endif + kgdb_and_then->task = current; + kgdb_and_then->from_ln = line; + kgdb_and_then->in_src = source; + kgdb_and_then->from = __builtin_return_address(0); + kgdb_and_then->with_shpf = (int *) (((flags & 
IF_BIT) >> 9) | + (preempt_count() << 8)); + kgdb_and_then->data0 = data0; + kgdb_and_then->data1 = data1; + kgdb_and_then = &kgdb_data[++kgdb_and_then_count & INDEX_MASK]; + spin_unlock(&ts_spin); + local_irq_restore(flags); +#ifdef CONFIG_PREEMPT + +#endif + return; +} +#endif +typedef int gdb_debug_hook(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs); +gdb_debug_hook *linux_debug_hook = &kgdb_handle_exception; /* histerical reasons... */ + +static int kgdb_need_breakpoint[NR_CPUS]; + +void kgdb_schedule_breakpoint(void) +{ + kgdb_need_breakpoint[smp_processor_id()] = 1; +} + +void kgdb_process_breakpoint(void) +{ + /* + * Handle a breakpoint queued from inside network driver code + * to avoid reentrancy issues + */ + if (kgdb_need_breakpoint[smp_processor_id()]) { + kgdb_need_breakpoint[smp_processor_id()] = 0; + kgdb_enabled = 1; + BREAKPOINT; + } +} + --- linux-2.6.4-rc1/arch/x86_64/kernel/Makefile 2004-02-27 16:17:20.000000000 -0800 +++ 25/arch/x86_64/kernel/Makefile 2004-02-29 13:08:01.000000000 -0800 @@ -27,6 +27,7 @@ obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_KGDB) += kgdb_stub.o obj-y += topology.o --- linux-2.6.4-rc1/arch/x86_64/kernel/mce.c 2004-02-27 16:17:20.000000000 -0800 +++ 25/arch/x86_64/kernel/mce.c 2004-02-29 13:09:19.000000000 -0800 @@ -73,7 +73,9 @@ static void print_mce(struct mce *m) printk("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->cpu, m->mcgstatus, m->bank, m->status); if (m->rip) { - printk("RIP %02x:<%016Lx> ", m->cs, m->rip); + printk("RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" 
: "", + m->cs, m->rip); if (m->cs == __KERNEL_CS) print_symbol("{%s}", m->rip); printk("\n"); @@ -133,7 +135,7 @@ void do_machine_check(struct pt_regs * r return; if (!(m.mcgstatus & MCG_STATUS_RIPV)) kill_it = 1; - if (regs && (m.mcgstatus & MCG_STATUS_EIPV)) { + if (regs) { m.rip = regs->rip; m.cs = regs->cs; } @@ -448,7 +450,7 @@ static __init int mce_init_device(void) return -EIO; err = sysdev_class_register(&mce_sysclass); if (!err) - err = sys_device_register(&device_mce); + err = sysdev_register(&device_mce); if (!err) { /* could create per CPU objects, but is not worth it. */ sysdev_create_file(&device_mce, &attr_disabled_banks); --- linux-2.6.4-rc1/arch/x86_64/kernel/mpparse.c 2004-02-27 16:17:20.000000000 -0800 +++ 25/arch/x86_64/kernel/mpparse.c 2004-02-29 13:07:39.000000000 -0800 @@ -996,7 +996,7 @@ void __init mp_parse_prt (void) continue; } if ((1<consistent_dma_mask; + dma_mask = hwdev->dev.coherent_dma_mask; } if (dma_mask == 0) --- linux-2.6.4-rc1/arch/x86_64/kernel/process.c 2004-02-27 16:17:20.000000000 -0800 +++ 25/arch/x86_64/kernel/process.c 2004-02-29 13:09:04.000000000 -0800 @@ -16,7 +16,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#define __KERNEL_SYSCALLS__ #include #include @@ -25,7 +24,6 @@ #include #include #include -#include #include #include #include --- linux-2.6.4-rc1/arch/x86_64/kernel/setup.c 2004-02-27 16:17:20.000000000 -0800 +++ 25/arch/x86_64/kernel/setup.c 2004-02-29 13:09:19.000000000 -0800 @@ -610,8 +610,7 @@ static void __init detect_ht(void) * At this point we only support two siblings per * processor package. 
*/ -#define NR_SIBLINGS 2 - if (smp_num_siblings != NR_SIBLINGS) { + if (smp_num_siblings > NR_CPUS) { printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); smp_num_siblings = 1; return; --- linux-2.6.4-rc1/arch/x86_64/kernel/smp.c 2003-11-23 19:03:00.000000000 -0800 +++ 25/arch/x86_64/kernel/smp.c 2004-02-29 13:08:01.000000000 -0800 @@ -362,6 +362,18 @@ void smp_send_reschedule(int cpu) send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } +#ifdef CONFIG_KGDB +/* + * By using the NMI code instead of a vector we just sneak thru the + * word generator coming out with just what we want. AND it does + * not matter if clustered_apic_mode is set or not. + */ +void smp_send_nmi_allbutself(void) +{ + send_IPI_allbutself(APIC_DM_NMI); +} +#endif + /* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. --- linux-2.6.4-rc1/arch/x86_64/kernel/time.c 2004-02-27 16:17:20.000000000 -0800 +++ 25/arch/x86_64/kernel/time.c 2004-02-29 13:09:19.000000000 -0800 @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -353,11 +354,11 @@ static irqreturn_t timer_interrupt(int i } if (lost) { - if (report_lost_ticks) + if (report_lost_ticks) { printk(KERN_WARNING "time.c: Lost %ld timer " - "tick(s)! (rip %016lx)\n", - (offset - vxtime.last) / hpet_tick - 1, - regs->rip); + "tick(s)! ", lost); + print_symbol("(rip %s)\n", regs->rip); + } jiffies += lost; } @@ -399,8 +400,19 @@ static irqreturn_t timer_interrupt(int i return IRQ_HANDLED; } -/* RED-PEN: calculation is done in 32bits with multiply for performance - and could overflow, it may be better (but slower)to use an 64bit division. 
*/ +static unsigned int cyc2ns_scale; +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline void set_cyc2ns_scale(unsigned long cpu_mhz) +{ + cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +} + unsigned long long sched_clock(void) { unsigned long a = 0; @@ -420,7 +432,7 @@ unsigned long long sched_clock(void) purposes. */ rdtscll(a); - return (a * vxtime.tsc_quot) >> 32; + return cycles_2_ns(a); } unsigned long get_cmos_time(void) @@ -527,6 +539,8 @@ static int time_cpufreq_notifier(struct vxtime.tsc_quot = (1000L << 32) / cpu_khz; } + set_cyc2ns_scale(cpu_khz_ref / 1000); + return 0; } @@ -725,6 +739,8 @@ void __init time_init(void) rdtscll_sync(&vxtime.last_tsc); setup_irq(0, &irq0); + set_cyc2ns_scale(cpu_khz / 1000); + #ifdef CONFIG_CPU_FREQ cpufreq_register_notifier(&time_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); @@ -788,7 +804,7 @@ static int time_init_device(void) { int error = sysdev_class_register(&pit_sysclass); if (!error) - error = sys_device_register(&device_i8253); + error = sysdev_register(&device_i8253); return error; } --- linux-2.6.4-rc1/arch/x86_64/kernel/traps.c 2004-02-27 16:17:20.000000000 -0800 +++ 25/arch/x86_64/kernel/traps.c 2004-02-29 13:09:19.000000000 -0800 @@ -45,6 +45,9 @@ #include #include +#ifdef CONFIG_KGDB +#include +#endif extern struct gate_struct idt_table[256]; @@ -351,9 +354,24 @@ void oops_end(void) void __die(const char * str, struct pt_regs * regs, long err) { + int nl = 0; static int die_counter; printk(KERN_EMERG "%s: %04lx [%u]\n", str, err & 0xffff,++die_counter); notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); + nl = 1; +#endif +#ifdef CONFIG_SMP + printk("SMP "); + nl = 1; +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); + nl = 1; +#endif + if (nl) + printk("\n"); 
show_registers(regs); /* Executive summary in case the oops scrolled away */ printk("RIP "); --- linux-2.6.4-rc1/arch/x86_64/kernel/x8664_ksyms.c 2004-02-27 16:17:20.000000000 -0800 +++ 25/arch/x86_64/kernel/x8664_ksyms.c 2004-02-29 13:08:50.000000000 -0800 @@ -63,10 +63,6 @@ EXPORT_SYMBOL(pm_idle); EXPORT_SYMBOL(pm_power_off); EXPORT_SYMBOL(get_cmos_time); -#ifdef CONFIG_IO_DEBUG -EXPORT_SYMBOL(__io_virt_debug); -#endif - EXPORT_SYMBOL_NOVERS(__down_failed); EXPORT_SYMBOL_NOVERS(__down_failed_interruptible); EXPORT_SYMBOL_NOVERS(__down_failed_trylock); @@ -196,6 +192,7 @@ EXPORT_SYMBOL(die_chain); #ifdef CONFIG_SMP EXPORT_SYMBOL(cpu_sibling_map); +EXPORT_SYMBOL(smp_num_siblings); #endif extern void do_softirq_thunk(void); --- linux-2.6.4-rc1/arch/x86_64/lib/io.c 2003-06-14 12:17:57.000000000 -0700 +++ 25/arch/x86_64/lib/io.c 2004-02-29 13:08:50.000000000 -0800 @@ -4,12 +4,12 @@ void *memcpy_toio(void *dst,const void*src,unsigned len) { - return __inline_memcpy(__io_virt(dst),src,len); + return __inline_memcpy(dst,src,len); } void *memcpy_fromio(void *dst,const void*src,unsigned len) { - return __inline_memcpy(dst,__io_virt(src),len); + return __inline_memcpy(dst,src,len); } EXPORT_SYMBOL(memcpy_toio); --- linux-2.6.4-rc1/arch/x86_64/lib/iodebug.c 2003-06-14 12:18:23.000000000 -0700 +++ /dev/null 2002-08-30 16:31:37.000000000 -0700 @@ -1,11 +0,0 @@ -#include - -void * __io_virt_debug(unsigned long x, const char *file, int line) -{ - if (x < PAGE_OFFSET) { - printk("io mapaddr 0x%05lx not valid at %s:%d!\n", x, file, line); - return __va(x); - } - return (void *)x; -} - --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/arch/x86_64/lib/kgdb_serial.c 2004-02-29 13:08:01.000000000 -0800 @@ -0,0 +1,490 @@ +/* + * Serial interface GDB stub + * + * Written (hacked together) by David Grothe (dave@gcom.com) + * Modified to allow invokation early in boot see also + * kgdb.h for instructions by George Anzinger(george@mvista.com) + * Modified to handle debugging over 
ethernet by Robert Walsh + * and wangdi , based on + * code by San Mehat. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_KGDB_USER_CONSOLE +extern void kgdb_console_finit(void); +#endif +#define PRNT_off +#define TEST_EXISTANCE +#ifdef PRNT +#define dbprintk(s) printk s +#else +#define dbprintk(s) +#endif +#define TEST_INTERRUPT_off +#ifdef TEST_INTERRUPT +#define intprintk(s) printk s +#else +#define intprintk(s) +#endif + +#define IRQ_T(info) ((info->flags & ASYNC_SHARE_IRQ) ? SA_SHIRQ : SA_INTERRUPT) + +#define GDB_BUF_SIZE 512 /* power of 2, please */ + +static char gdb_buf[GDB_BUF_SIZE]; +static int gdb_buf_in_inx; +static atomic_t gdb_buf_in_cnt; +static int gdb_buf_out_inx; + +struct async_struct *gdb_async_info; +static int gdb_async_irq; + +#define outb_px(a,b) outb_p(b,a) + +static void program_uart(struct async_struct *info); +static void write_char(struct async_struct *info, int chr); +/* + * Get a byte from the hardware data buffer and return it + */ +static int +read_data_bfr(struct async_struct *info) +{ + char it = inb_p(info->port + UART_LSR); + + if (it & UART_LSR_DR) + return (inb_p(info->port + UART_RX)); + /* + * If we have a framing error assume somebody messed with + * our uart. Reprogram it and send '-' both ways... + */ + if (it & 0xc) { + program_uart(info); + write_char(info, '-'); + return ('-'); + } + return (-1); + +} /* read_data_bfr */ + +/* + * Get a char if available, return -1 if nothing available. + * Empty the receive buffer first, then look at the interface hardware. + + * Locking here is a bit of a problem. We MUST not lock out communication + * if we are trying to talk to gdb about a kgdb entry. ON the other hand + * we can loose chars in the console pass thru if we don't lock. 
It is also + * possible that we could hold the lock or be waiting for it when kgdb + * NEEDS to talk. Since kgdb locks down the world, it does not need locks. + * We do, of course have possible issues with interrupting a uart operation, + * but we will just depend on the uart status to help keep that straight. + + */ +static spinlock_t uart_interrupt_lock = SPIN_LOCK_UNLOCKED; +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +#endif + +static int +read_char(struct async_struct *info) +{ + int chr; + unsigned long flags; + local_irq_save(flags); +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_lock(&uart_interrupt_lock); + } +#endif + if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ + chr = gdb_buf[gdb_buf_out_inx++]; + gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); + atomic_dec(&gdb_buf_in_cnt); + } else { + chr = read_data_bfr(info); + } +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_unlock(&uart_interrupt_lock); + } +#endif + local_irq_restore(flags); + return (chr); +} + +/* + * Wait until the interface can accept a char, then write it. + */ +static void +write_char(struct async_struct *info, int chr) +{ + while (!(inb_p(info->port + UART_LSR) & UART_LSR_THRE)) ; + + outb_p(chr, info->port + UART_TX); + +} /* write_char */ + +/* + * Mostly we don't need a spinlock, but since the console goes + * thru here with interrutps on, well, we need to catch those + * chars. + */ +/* + * This is the receiver interrupt routine for the GDB stub. + * It will receive a limited number of characters of input + * from the gdb host machine and save them up in a buffer. + * + * When the gdb stub routine tty_getDebugChar() is called it + * draws characters out of the buffer until it is empty and + * then reads directly from the serial port. 
+ * + * We do not attempt to write chars from the interrupt routine + * since the stubs do all of that via tty_putDebugChar() which + * writes one byte after waiting for the interface to become + * ready. + * + * The debug stubs like to run with interrupts disabled since, + * after all, they run as a consequence of a breakpoint in + * the kernel. + * + * Perhaps someone who knows more about the tty driver than I + * care to learn can make this work for any low level serial + * driver. + */ +static irqreturn_t +gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct async_struct *info; + unsigned long flags; + + info = gdb_async_info; + if (!info || !info->tty || irq != gdb_async_irq) + return IRQ_NONE; + + local_irq_save(flags); + spin_lock(&uart_interrupt_lock); + do { + int chr = read_data_bfr(info); + intprintk(("Debug char on int: %x hex\n", chr)); + if (chr < 0) + continue; + + if (chr == 3) { /* Ctrl-C means remote interrupt */ + BREAKPOINT; + continue; + } + + gdb_buf[gdb_buf_in_inx++] = chr; + gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); + if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) /* was full: oldest char overwritten; no read_char() here, it retakes uart_interrupt_lock */ + gdb_buf_out_inx = (gdb_buf_out_inx + 1) & (GDB_BUF_SIZE - 1); + else + atomic_inc(&gdb_buf_in_cnt); /* publish the new char to read_char() */ + } while (inb_p(info->port + UART_IIR) & UART_IIR_RDI); + spin_unlock(&uart_interrupt_lock); + local_irq_restore(flags); + return IRQ_HANDLED; +} /* gdb_interrupt */ + +/* + * Just a NULL routine for testing. + */ +void +gdb_null(void) +{ +} /* gdb_null */ + +/* These structure are filled in with values defined in asm/kgdb_local.h + */ +static struct serial_state state = SB_STATE; +static struct async_struct local_info = SB_INFO; +static int ok_to_enable_ints = 0; +static void kgdb_enable_ints_now(void); + +extern char *kgdb_version; +/* + * Hook an IRQ for KGDB. + * + * This routine is called from tty_putDebugChar, below. 
+ */ +static int ints_disabled = 1; +int +gdb_hook_interrupt(struct async_struct *info, int verb) +{ + struct serial_state *state = info->state; + unsigned long flags; + int port; +#ifdef TEST_EXISTANCE + int scratch, scratch2; +#endif + + /* The above fails if memory managment is not set up yet. + * Rather than fail the set up, just keep track of the fact + * and pick up the interrupt thing later. + */ + gdb_async_info = info; + port = gdb_async_info->port; + gdb_async_irq = state->irq; + if (verb) { + printk("kgdb %s : port =%x, IRQ=%d, divisor =%d\n", + kgdb_version, + port, + gdb_async_irq, gdb_async_info->state->custom_divisor); + } + local_irq_save(flags); +#ifdef TEST_EXISTANCE + /* Existance test */ + /* Should not need all this, but just in case.... */ + + scratch = inb_p(port + UART_IER); + outb_px(port + UART_IER, 0); + outb_px(0xff, 0x080); + scratch2 = inb_p(port + UART_IER); + outb_px(port + UART_IER, scratch); + if (scratch2) { + printk + ("gdb_hook_interrupt: Could not clear IER, not a UART!\n"); + local_irq_restore(flags); + return 1; /* We failed; there's nothing here */ + } + scratch2 = inb_p(port + UART_LCR); + outb_px(port + UART_LCR, 0xBF); /* set up for StarTech test */ + outb_px(port + UART_EFR, 0); /* EFR is the same as FCR */ + outb_px(port + UART_LCR, 0); + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO); + scratch = inb_p(port + UART_IIR) >> 6; + if (scratch == 1) { + printk("gdb_hook_interrupt: Undefined UART type!" + " Not a UART! \n"); + local_irq_restore(flags); + return 1; + } else { + dbprintk(("gdb_hook_interrupt: UART type " + "is %d where 0=16450, 2=16550 3=16550A\n", scratch)); + } + scratch = inb_p(port + UART_MCR); + outb_px(port + UART_MCR, UART_MCR_LOOP | scratch); + outb_px(port + UART_MCR, UART_MCR_LOOP | 0x0A); + scratch2 = inb_p(port + UART_MSR) & 0xF0; + outb_px(port + UART_MCR, scratch); + if (scratch2 != 0x90) { + printk("gdb_hook_interrupt: " + "Loop back test failed! 
Not a UART!\n"); + local_irq_restore(flags); + return scratch2 + 1000; /* force 0 to fail */ + } +#endif /* test existance */ + program_uart(info); + local_irq_restore(flags); + + return (0); + +} /* gdb_hook_interrupt */ + +static void +program_uart(struct async_struct *info) +{ + int port = info->port; + + (void) inb_p(port + UART_RX); + outb_px(port + UART_IER, 0); + + (void) inb_p(port + UART_RX); /* serial driver comments say */ + (void) inb_p(port + UART_IIR); /* this clears the interrupt regs */ + (void) inb_p(port + UART_MSR); + outb_px(port + UART_LCR, UART_LCR_WLEN8 | UART_LCR_DLAB); + outb_px(port + UART_DLL, info->state->custom_divisor & 0xff); /* LS */ + outb_px(port + UART_DLM, info->state->custom_divisor >> 8); /* MS */ + outb_px(port + UART_MCR, info->MCR); + + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1 | UART_FCR_CLEAR_XMIT | UART_FCR_CLEAR_RCVR); /* set fcr */ + outb_px(port + UART_LCR, UART_LCR_WLEN8); /* reset DLAB */ + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1); /* set fcr */ + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + return; +} + +/* + * tty_getDebugChar + * + * This is a GDB stub routine. It waits for a character from the + * serial interface and then returns it. If there is no serial + * interface connection then it returns a bogus value which will + * almost certainly cause the system to hang. 
In the + */ +int kgdb_in_isr = 0; +int kgdb_in_lsr = 0; +extern spinlock_t kgdb_spinlock; + +/* Caller takes needed protections */ + +int +tty_getDebugChar(void) +{ + volatile int chr, dum, time, end_time; + + dbprintk(("tty_getDebugChar(port %x): ", gdb_async_info->port)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + /* + * This trick says if we wait a very long time and get + * no char, return the -1 and let the upper level deal + * with it. + */ + rdtsc(dum, time); + end_time = time + 2; + while (((chr = read_char(gdb_async_info)) == -1) && + (end_time - time) > 0) { + rdtsc(dum, time); + }; + /* + * This covers our butts if some other code messes with + * our uart, hay, it happens :o) + */ + if (chr == -1) + program_uart(gdb_async_info); + + dbprintk(("%c\n", chr > ' ' && chr < 0x7F ? chr : ' ')); + return (chr); + +} /* tty_getDebugChar */ + +static int count = 3; +static spinlock_t one_at_atime = SPIN_LOCK_UNLOCKED; + +static int __init +kgdb_enable_ints(void) +{ + set_debug_traps(); + if (kgdboe) { + return 0; + } + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 1); + } + ok_to_enable_ints = 1; + kgdb_enable_ints_now(); +#ifdef CONFIG_KGDB_USER_CONSOLE + kgdb_console_finit(); +#endif + return 0; +} + +#ifdef CONFIG_SERIAL_8250 +void shutdown_for_kgdb(struct async_struct *gdb_async_info); +#endif + +#define kgdb_mem_init_done() (1) + +static void +kgdb_enable_ints_now(void) +{ + if (!spin_trylock(&one_at_atime)) + return; + if (!ints_disabled) + goto exit; + if (kgdb_mem_init_done() && + ints_disabled) { /* don't try till mem init */ +#ifdef CONFIG_SERIAL_8250 + /* + * The ifdef here allows the system to be configured + * without the serial driver. 
+ * Don't make it a module, however, it will steal the port + */ + shutdown_for_kgdb(gdb_async_info); +#endif + ints_disabled = request_irq(gdb_async_info->state->irq, + gdb_interrupt, + IRQ_T(gdb_async_info), + "KGDB-stub", NULL); + intprintk(("KGDB: request_irq returned %d\n", ints_disabled)); + } + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + exit: + spin_unlock(&one_at_atime); +} + +/* + * tty_putDebugChar + * + * This is a GDB stub routine. It waits until the interface is ready + * to transmit a char and then sends it. If there is no serial + * interface connection then it simply returns to its caller, having + * pretended to send the char. Caller takes needed protections. + */ +void +tty_putDebugChar(int chr) +{ + dbprintk(("tty_putDebugChar(port %x): chr=%02x '%c', ints_on=%d\n", + gdb_async_info->port, + chr, + chr > ' ' && chr < 0x7F ? chr : ' ', ints_disabled ? 0 : 1)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + + write_char(gdb_async_info, chr); /* this routine will wait */ + count = (chr == '#') ? 0 : count + 1; + if ((count == 2)) { /* try to enable after */ + if (ints_disabled && ok_to_enable_ints) /* logical AND: ints_disabled may hold a negative errno from request_irq() */ + kgdb_enable_ints_now(); /* try to enable after */ + + /* We do this a lot because, well we really want to get these + * interrupts. The serial driver will clear these bits when it + * initializes the chip. Every thing else it does is ok, + * but this. + */ + if (!ints_disabled) { + outb_px(gdb_async_info->port + UART_IER, + gdb_async_info->IER); + } + } + +} /* tty_putDebugChar */ + +/* + * This does nothing for the serial port, since it doesn't buffer. 
+ */ + +void tty_flushDebugChar(void) +{ +} + +module_init(kgdb_enable_ints); --- linux-2.6.4-rc1/arch/x86_64/lib/Makefile 2003-06-14 12:17:59.000000000 -0700 +++ 25/arch/x86_64/lib/Makefile 2004-02-29 13:08:50.000000000 -0800 @@ -9,5 +9,5 @@ lib-y := csum-partial.o csum-copy.o csum thunk.o io.o clear_page.o copy_page.o bitstr.o lib-y += memcpy.o memmove.o memset.o copy_user.o -lib-$(CONFIG_IO_DEBUG) += iodebug.o lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o +lib-$(CONFIG_KGDB) += kgdb_serial.o --- linux-2.6.4-rc1/arch/x86_64/Makefile 2004-02-27 16:17:20.000000000 -0800 +++ 25/arch/x86_64/Makefile 2004-02-29 13:09:19.000000000 -0800 @@ -38,7 +38,7 @@ OBJCOPYFLAGS := -O binary -R .note -R .c LDFLAGS_vmlinux := -e stext cflags-$(CONFIG_MK8) += $(call check_gcc,-march=k8,) -cflags-$(CONFIG_MPSC) += $(call check_gcc,-march=pentium4,) +cflags-$(CONFIG_MPSC) += $(call check_gcc,-march=prescott,) CFLAGS += $(cflags-y) CFLAGS += -mno-red-zone --- linux-2.6.4-rc1/CREDITS 2004-02-17 20:48:41.000000000 -0800 +++ 25/CREDITS 2004-02-29 13:09:10.000000000 -0800 @@ -289,6 +289,15 @@ S: Via Delle Palme, 9 S: Terni 05100 S: Italy +N: Krzysztof Benedyczak +E: golbi@mat.uni.torun.pl +W: http://www.mat.uni.torun.pl/~golbi +D: POSIX message queues fs (with M. Wronski) +S: ul. Podmiejska 52 +S: Radunica +S: 83-000 Pruszcz Gdanski +S: Poland + N: Randolph Bentson E: bentson@grieg.seaslug.org W: http://www.aa.net/~bentson/ @@ -3489,6 +3498,14 @@ S: 12725 SW Millikan Way, Suite 400 S: Beaverton, OR 97005 S: USA +N: Michal Wronski +E: wrona@mat.uni.torun.pl +W: http://www.mat.uni.torun.pl/~wrona +D: POSIX message queues fs (with K. Benedyczak) +S: ul. Teczowa 23/12 +S: 80-680 Gdansk-Sobieszewo +S: Poland + N: Frank Xia E: qx@math.columbia.edu D: Xiafs filesystem [defunct] --- linux-2.6.4-rc1/crypto/cipher.c 2003-08-22 19:23:40.000000000 -0700 +++ 25/crypto/cipher.c 2004-02-29 13:08:54.000000000 -0800 @@ -4,7 +4,6 @@ * Cipher operations. 
* * Copyright (c) 2002 James Morris - * Generic scatterwalk code by Adam J. Richter . * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -17,30 +16,13 @@ #include #include #include -#include -#include #include #include "internal.h" +#include "scatterwalk.h" typedef void (cryptfn_t)(void *, u8 *, const u8 *); typedef void (procfn_t)(struct crypto_tfm *, u8 *, - u8*, cryptfn_t, int enc, void *); - -struct scatter_walk { - struct scatterlist *sg; - struct page *page; - void *data; - unsigned int len_this_page; - unsigned int len_this_segment; - unsigned int offset; -}; - -enum km_type crypto_km_types[] = { - KM_USER0, - KM_USER1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, -}; + u8*, cryptfn_t, int enc, void *, int); static inline void xor_64(u8 *a, const u8 *b) { @@ -57,108 +39,6 @@ static inline void xor_128(u8 *a, const } -/* Define sg_next is an inline routine now in case we want to change - scatterlist to a linked list later. 
*/ -static inline struct scatterlist *sg_next(struct scatterlist *sg) -{ - return sg + 1; -} - -void *which_buf(struct scatter_walk *walk, unsigned int nbytes, void *scratch) -{ - if (nbytes <= walk->len_this_page && - (((unsigned long)walk->data) & (PAGE_CACHE_SIZE - 1)) + nbytes <= - PAGE_CACHE_SIZE) - return walk->data; - else - return scratch; -} - -static void memcpy_dir(void *buf, void *sgdata, size_t nbytes, int out) -{ - if (out) - memcpy(sgdata, buf, nbytes); - else - memcpy(buf, sgdata, nbytes); -} - -static void scatterwalk_start(struct scatter_walk *walk, struct scatterlist *sg) -{ - unsigned int rest_of_page; - - walk->sg = sg; - - walk->page = sg->page; - walk->len_this_segment = sg->length; - - rest_of_page = PAGE_CACHE_SIZE - (sg->offset & (PAGE_CACHE_SIZE - 1)); - walk->len_this_page = min(sg->length, rest_of_page); - walk->offset = sg->offset; -} - -static void scatterwalk_map(struct scatter_walk *walk, int out) -{ - walk->data = crypto_kmap(walk->page, out) + walk->offset; -} - -static void scatter_page_done(struct scatter_walk *walk, int out, - unsigned int more) -{ - /* walk->data may be pointing the first byte of the next page; - however, we know we transfered at least one byte. So, - walk->data - 1 will be a virutual address in the mapped page. */ - - if (out) - flush_dcache_page(walk->page); - - if (more) { - walk->len_this_segment -= walk->len_this_page; - - if (walk->len_this_segment) { - walk->page++; - walk->len_this_page = min(walk->len_this_segment, - (unsigned)PAGE_CACHE_SIZE); - walk->offset = 0; - } - else - scatterwalk_start(walk, sg_next(walk->sg)); - } -} - -static void scatter_done(struct scatter_walk *walk, int out, int more) -{ - crypto_kunmap(walk->data, out); - if (walk->len_this_page == 0 || !more) - scatter_page_done(walk, out, more); -} - -/* - * Do not call this unless the total length of all of the fragments - * has been verified as multiple of the block size. 
- */ -static int copy_chunks(void *buf, struct scatter_walk *walk, - size_t nbytes, int out) -{ - if (buf != walk->data) { - while (nbytes > walk->len_this_page) { - memcpy_dir(buf, walk->data, walk->len_this_page, out); - buf += walk->len_this_page; - nbytes -= walk->len_this_page; - - crypto_kunmap(walk->data, out); - scatter_page_done(walk, out, 1); - scatterwalk_map(walk, out); - } - - memcpy_dir(buf, walk->data, nbytes, out); - } - - walk->offset += nbytes; - walk->len_this_page -= nbytes; - walk->len_this_segment -= nbytes; - return 0; -} - /* * Generic encrypt/decrypt wrapper for ciphers, handles operations across * multiple page boundaries by using temporary blocks. In user context, @@ -191,19 +71,21 @@ static int crypt(struct crypto_tfm *tfm, scatterwalk_map(&walk_in, 0); scatterwalk_map(&walk_out, 1); - src_p = which_buf(&walk_in, bsize, tmp_src); - dst_p = which_buf(&walk_out, bsize, tmp_dst); + src_p = scatterwalk_whichbuf(&walk_in, bsize, tmp_src); + dst_p = scatterwalk_whichbuf(&walk_out, bsize, tmp_dst); nbytes -= bsize; - copy_chunks(src_p, &walk_in, bsize, 0); + scatterwalk_copychunks(src_p, &walk_in, bsize, 0); - prfn(tfm, dst_p, src_p, crfn, enc, info); + prfn(tfm, dst_p, src_p, crfn, enc, info, + scatterwalk_samebuf(&walk_in, &walk_out, + src_p, dst_p)); - scatter_done(&walk_in, 0, nbytes); + scatterwalk_done(&walk_in, 0, nbytes); - copy_chunks(dst_p, &walk_out, bsize, 1); - scatter_done(&walk_out, 1, nbytes); + scatterwalk_copychunks(dst_p, &walk_out, bsize, 1); + scatterwalk_done(&walk_out, 1, nbytes); if (!nbytes) return 0; @@ -212,8 +94,8 @@ static int crypt(struct crypto_tfm *tfm, } } -static void cbc_process(struct crypto_tfm *tfm, - u8 *dst, u8 *src, cryptfn_t fn, int enc, void *info) +static void cbc_process(struct crypto_tfm *tfm, u8 *dst, u8 *src, + cryptfn_t fn, int enc, void *info, int in_place) { u8 *iv = info; @@ -226,10 +108,9 @@ static void cbc_process(struct crypto_tf fn(crypto_tfm_ctx(tfm), dst, iv); memcpy(iv, dst, 
crypto_tfm_alg_blocksize(tfm)); } else { - const int need_stack = (src == dst); - u8 stack[need_stack ? crypto_tfm_alg_blocksize(tfm) : 0]; - u8 *buf = need_stack ? stack : dst; - + u8 stack[in_place ? crypto_tfm_alg_blocksize(tfm) : 0]; + u8 *buf = in_place ? stack : dst; + fn(crypto_tfm_ctx(tfm), buf, src); tfm->crt_u.cipher.cit_xor_block(buf, iv); memcpy(iv, src, crypto_tfm_alg_blocksize(tfm)); @@ -239,7 +120,7 @@ static void cbc_process(struct crypto_tf } static void ecb_process(struct crypto_tfm *tfm, u8 *dst, u8 *src, - cryptfn_t fn, int enc, void *info) + cryptfn_t fn, int enc, void *info, int in_place) { fn(crypto_tfm_ctx(tfm), dst, src); } --- linux-2.6.4-rc1/crypto/internal.h 2003-09-08 13:58:56.000000000 -0700 +++ 25/crypto/internal.h 2004-02-29 13:08:54.000000000 -0800 @@ -11,6 +11,7 @@ */ #ifndef _CRYPTO_INTERNAL_H #define _CRYPTO_INTERNAL_H +#include #include #include #include --- linux-2.6.4-rc1/crypto/Makefile 2003-09-08 13:58:56.000000000 -0700 +++ 25/crypto/Makefile 2004-02-29 13:08:54.000000000 -0800 @@ -4,7 +4,7 @@ proc-crypto-$(CONFIG_PROC_FS) = proc.o -obj-$(CONFIG_CRYPTO) += api.o cipher.o digest.o compress.o \ +obj-$(CONFIG_CRYPTO) += api.o scatterwalk.o cipher.o digest.o compress.o \ $(proc-crypto-y) obj-$(CONFIG_CRYPTO_HMAC) += hmac.o --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/crypto/scatterwalk.c 2004-02-29 13:08:54.000000000 -0800 @@ -0,0 +1,124 @@ +/* + * Cryptographic API. + * + * Cipher operations. + * + * Copyright (c) 2002 James Morris + * 2002 Adam J. Richter + * 2004 Jean-Luc Cooke + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + */ +#include +#include +#include +#include +#include +#include "internal.h" +#include "scatterwalk.h" + +enum km_type crypto_km_types[] = { + KM_USER0, + KM_USER1, + KM_SOFTIRQ0, + KM_SOFTIRQ1, +}; + +void *scatterwalk_whichbuf(struct scatter_walk *walk, unsigned int nbytes, void *scratch) +{ + if (nbytes <= walk->len_this_page && + (((unsigned long)walk->data) & (PAGE_CACHE_SIZE - 1)) + nbytes <= + PAGE_CACHE_SIZE) + return walk->data; + else + return scratch; +} + +static void memcpy_dir(void *buf, void *sgdata, size_t nbytes, int out) +{ + if (out) + memcpy(sgdata, buf, nbytes); + else + memcpy(buf, sgdata, nbytes); +} + +void scatterwalk_start(struct scatter_walk *walk, struct scatterlist *sg) +{ + unsigned int rest_of_page; + + walk->sg = sg; + + walk->page = sg->page; + walk->len_this_segment = sg->length; + + rest_of_page = PAGE_CACHE_SIZE - (sg->offset & (PAGE_CACHE_SIZE - 1)); + walk->len_this_page = min(sg->length, rest_of_page); + walk->offset = sg->offset; +} + +void scatterwalk_map(struct scatter_walk *walk, int out) +{ + walk->data = crypto_kmap(walk->page, out) + walk->offset; +} + +static void scatterwalk_pagedone(struct scatter_walk *walk, int out, + unsigned int more) +{ + /* walk->data may be pointing the first byte of the next page; + however, we know we transfered at least one byte. So, + walk->data - 1 will be a virutual address in the mapped page. 
*/ + + if (out) + flush_dcache_page(walk->page); + + if (more) { + walk->len_this_segment -= walk->len_this_page; + + if (walk->len_this_segment) { + walk->page++; + walk->len_this_page = min(walk->len_this_segment, + (unsigned)PAGE_CACHE_SIZE); + walk->offset = 0; + } + else + scatterwalk_start(walk, sg_next(walk->sg)); + } +} + +void scatterwalk_done(struct scatter_walk *walk, int out, int more) +{ + crypto_kunmap(walk->data, out); + if (walk->len_this_page == 0 || !more) + scatterwalk_pagedone(walk, out, more); +} + +/* + * Do not call this unless the total length of all of the fragments + * has been verified as multiple of the block size. + */ +int scatterwalk_copychunks(void *buf, struct scatter_walk *walk, + size_t nbytes, int out) +{ + if (buf != walk->data) { + while (nbytes > walk->len_this_page) { + memcpy_dir(buf, walk->data, walk->len_this_page, out); + buf += walk->len_this_page; + nbytes -= walk->len_this_page; + + crypto_kunmap(walk->data, out); + scatterwalk_pagedone(walk, out, 1); + scatterwalk_map(walk, out); + } + + memcpy_dir(buf, walk->data, nbytes, out); + } + + walk->offset += nbytes; + walk->len_this_page -= nbytes; + walk->len_this_segment -= nbytes; + return 0; +} --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/crypto/scatterwalk.h 2004-02-29 13:08:54.000000000 -0800 @@ -0,0 +1,50 @@ +/* + * Cryptographic API. + * + * Copyright (c) 2002 James Morris + * Copyright (c) 2002 Adam J. Richter + * Copyright (c) 2004 Jean-Luc Cooke + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + */ + +#ifndef _CRYPTO_SCATTERWALK_H +#define _CRYPTO_SCATTERWALK_H +#include +#include + +struct scatter_walk { + struct scatterlist *sg; + struct page *page; + void *data; + unsigned int len_this_page; + unsigned int len_this_segment; + unsigned int offset; +}; + +/* Define sg_next is an inline routine now in case we want to change + scatterlist to a linked list later. */ +static inline struct scatterlist *sg_next(struct scatterlist *sg) +{ + return sg + 1; +} + +static inline int scatterwalk_samebuf(struct scatter_walk *walk_in, + struct scatter_walk *walk_out, + void *src_p, void *dst_p) +{ + return walk_in->page == walk_out->page && + walk_in->data == src_p && walk_out->data == dst_p; +} + +void *scatterwalk_whichbuf(struct scatter_walk *walk, unsigned int nbytes, void *scratch); +void scatterwalk_start(struct scatter_walk *walk, struct scatterlist *sg); +int scatterwalk_copychunks(void *buf, struct scatter_walk *walk, size_t nbytes, int out); +void scatterwalk_map(struct scatter_walk *walk, int out); +void scatterwalk_done(struct scatter_walk *walk, int out, int more); + +#endif /* _CRYPTO_SCATTERWALK_H */ --- linux-2.6.4-rc1/Documentation/00-INDEX 2003-09-27 18:57:43.000000000 -0700 +++ 25/Documentation/00-INDEX 2004-02-29 13:08:51.000000000 -0800 @@ -1,7 +1,7 @@ This is a brief list of all the files in ./linux/Documentation and what -they contain. If you add a documentation file, please list it here in -alphabetical order as well, or risk being hunted down like a rabid dog. +they contain. If you add a documentation file, please list it here in +alphabetical order as well, or risk being hunted down like a rabid dog. Please try and keep the descriptions small enough to fit on one line. Thanks -- Paul G. @@ -26,10 +26,14 @@ DocBook/ - directory with DocBook templates etc. for kernel documentation. IO-mapping.txt - how to access I/O mapped memory from within device drivers. 
+IPMI.txt + - info on Linux Intelligent Platform Management Interface (IPMI) Driver. IRQ-affinity.txt - how to select which CPU(s) handle which interrupt events on SMP. +MSI-HOWTO.txt + - the Message Signaled Interrupts (MSI) Driver Guide HOWTO and FAQ. README.DAC960 - - info on Mylex DAC960/DAC1100 PCI RAID Controller Driver for Linux + - info on Mylex DAC960/DAC1100 PCI RAID Controller Driver for Linux. README.moxa - release notes for Moxa mutiport serial card. SAK.txt @@ -42,6 +46,10 @@ VGA-softcursor.txt - how to change your VGA cursor from a blinking underscore. arm/ - directory with info about Linux on the ARM architecture. +as-iosched.txt + - info on anticipatory IO scheduler. +basic_profiling.txt + - basic instructions for those who wants to profile Linux kernel. binfmt_misc.txt - info on the kernel support for extra binary formats. block/ @@ -53,25 +61,31 @@ cciss.txt cdrom/ - directory with information on the CD-ROM drivers that Linux has. computone.txt - - info on Computone Intelliport II/Plus Multiport Serial Driver + - info on Computone Intelliport II/Plus Multiport Serial Driver. cpqarray.txt - info on using Compaq's SMART2 Intelligent Disk Array Controllers. cpufreq/ - - info on CPU frequency and voltage scaling + - info on CPU frequency and voltage scaling. cris/ - directory with info about Linux on CRIS architecture. +debugging-modules.txt + - some notes on debugging modules after Linux 2.6.3. devices.txt - - plain ASCII listing of all the nodes in /dev/ with major minor #'s + - plain ASCII listing of all the nodes in /dev/ with major minor #'s. digiboard.txt - info on the Digiboard PC/X{i,e,eve} multiport boards. digiepca.txt - info on Digi Intl. {PC,PCI,EISA}Xx and Xem series cards. dnotify.txt - info about directory notification in Linux. -driver-model.txt - - info about Linux driver model. +driver-model/ + - directory with info about Linux driver model. +dvb/ + - info on Linux Digital Video Broadcast (DVB) subsystem. 
early-userspace/ - info about initramfs, klibc, and userspace early during boot. +eisa.txt + - info on EISA bus support. exception.txt - how Linux v2.2 handles exceptions without verify_area etc. fb/ @@ -81,47 +95,51 @@ filesystems/ floppy.txt - notes and driver options for the floppy disk driver. ftape.txt - - notes about the floppy tape device driver + - notes about the floppy tape device driver. hayes-esp.txt - info on using the Hayes ESP serial driver. highuid.txt - notes on the change from 16 bit to 32 bit user/group IDs. +hw_random.txt + - info on Linux support for random number generator in i8xx chipsets. i2c/ - - directory with info about the I2C bus/protocol (2 wire, kHz speed) + - directory with info about the I2C bus/protocol (2 wire, kHz speed). i386/ - - directory with info about Linux on intel 32 bit architecture. -i810_rng.txt - - info on Linux support for random number generator in i8xx chipsets. + - directory with info about Linux on Intel 32 bit architecture. ia64/ - - directory with info about Linux on intel 64 bit architecture. + - directory with info about Linux on Intel 64 bit architecture. ide.txt - - important info for users of ATA devices (IDE/EIDE disks and CD-ROMS) + - important info for users of ATA devices (IDE/EIDE disks and CD-ROMS). initrd.txt - how to use the RAM disk as an initial/temporary root filesystem. input/ - info on Linux input device support. ioctl-number.txt - how to implement and register device/driver ioctl calls. +iostats.txt + - info on I/O statistics Linux kernel provides. isapnp.txt - - info on Linux ISA Plug & Play support + - info on Linux ISA Plug & Play support. isdn/ - directory with info on the Linux ISDN support, and supported cards. java.txt - - info on the in-kernel binary support for Java(tm) + - info on the in-kernel binary support for Java(tm). kbuild/ - - directory with info about the kernel build process + - directory with info about the kernel build process. 
kernel-doc-nano-HOWTO.txt - mini HowTo on generation and location of kernel documentation files. kernel-docs.txt - listing of various WWW + books that document kernel internals. kernel-parameters.txt - summary listing of command line / boot prompt args for the kernel. +kobject.txt + - info of the kobject infrastructure of the Linux kernel. ldm.txt - a brief description of LDM (Windows Dynamic Disks). locks.txt - info on file locking implementations, flock() vs. fcntl(), etc. logo.gif - - Full colour GIF image of Linux logo (penguin) + - Full colour GIF image of Linux logo (penguin). logo.txt - Info on creator of above logo & site to get additional images from. m68k/ @@ -133,27 +151,27 @@ mandatory.txt mca.txt - info on supporting Micro Channel Architecture (e.g. PS/2) systems. md.txt - - info on boot arguments for the multiple devices driver + - info on boot arguments for the multiple devices driver. memory.txt - info on typical Linux memory problems. mips/ - directory with info about Linux on MIPS architecture. mkdev.cciss - - script to make /dev entries for SMART controllers (see cciss.txt) + - script to make /dev entries for SMART controllers (see cciss.txt). mkdev.ida - script to make /dev entries for Intelligent Disk Array Controllers. moxa-smartio - info on installing/using Moxa multiport serial driver. mtrr.txt - - how to use PPro Memory Type Range Registers to increase performance + - how to use PPro Memory Type Range Registers to increase performance. nbd.txt - info on a TCP implementation of a network block device. networking/ - directory with info on various aspects of networking with Linux. nfsroot.txt - - short guide on setting up a diskless box with NFS root filesystem + - short guide on setting up a diskless box with NFS root filesystem. nmi_watchdog.txt - - info on NMI watchdog for SMP systems + - info on NMI watchdog for SMP systems. oops-tracing.txt - how to decode those nasty internal kernel error dump messages. 
paride.txt @@ -165,11 +183,11 @@ parport.txt parport-lowlevel.txt - description and usage of the low level parallel port functions. pci.txt - - info on the PCI subsystem for device driver authors -pcwd-watchdog.txt - - info and sample code for using with the PC Watchdog reset card. + - info on the PCI subsystem for device driver authors. pm.txt - info on Linux power management support. +pnp.txt + - Linux Plug and Play documentation. power/ - directory with info on Linux PCI power management. powerpc/ @@ -181,29 +199,31 @@ ramdisk.txt riscom8.txt - notes on using the RISCom/8 multi-port serial driver. rocket.txt - - info on installing/using the Comtrol RocketPort multiport serial driver + - info on installing/using the Comtrol RocketPort multiport serial driver. +rpc-cache.txt + - introduction to the caching mechanisms in the sunrpc layer. rtc.txt - notes on how to use the Real Time Clock (aka CMOS clock) driver. s390/ - directory with info on using Linux on the IBM S390. -sh/ - - directory with info on porting Linux to a new architecture. +sched-design.txt + - goals, design and implementation of the Linux O(1) scheduler. scsi/ - directory with info on Linux scsi support. serial-console.txt - how to set up Linux with a serial line console as the default. sgi-visws.txt - short blurb on the SGI Visual Workstations. +sh/ + - directory with info on porting Linux to a new architecture. smart-config.txt - description of the Smart Config makefile feature. -smp.tex - - LaTeX document describing implementation of Multiprocessor Linux smp.txt - - a few more notes on symmetric multi-processing + - a few notes on symmetric multi-processing. sonypi.txt - info on Linux Sony Programmable I/O Device support. sound/ - - directory with info on sound card support + - directory with info on sound card support. sparc/ - directory with info on using Linux on Sparc architecture. specialix.txt @@ -217,9 +237,9 @@ svga.txt sx.txt - info on the Specialix SX/SI multiport serial driver. 
sysctl/ - - directory with info on the /proc/sys/* files + - directory with info on the /proc/sys/* files. sysrq.txt - - info on the magic SysRq key + - info on the magic SysRq key. telephony/ - directory with info on telephony (e.g. voice over IP) support. unicode.txt @@ -230,7 +250,7 @@ video4linux/ - directory with info regarding video/TV/radio cards and linux. vm/ - directory with info on the Linux vm code. -watchdog.txt +watchdog/ - how to auto-reboot Linux if it has "fallen and can't get up". ;-) x86_64/ - directory with info on Linux support for AMD x86-64 (Hammer) machines. @@ -238,4 +258,3 @@ xterm-linux.xpm - XPM image of penguin logo (see logo.txt) sitting on an xterm. zorro.txt - info on writing drivers for Zorro bus devices found on Amigas. - --- linux-2.6.4-rc1/Documentation/binfmt_misc.txt 2003-10-08 15:07:08.000000000 -0700 +++ 25/Documentation/binfmt_misc.txt 2004-02-29 13:08:30.000000000 -0800 @@ -15,7 +15,7 @@ First you must mount binfmt_misc: mount binfmt_misc -t binfmt_misc /proc/sys/fs/binfmt_misc To actually register a new binary type, you have to set up a string looking like -:name:type:offset:magic:mask:interpreter: (where you can choose the ':' upon +:name:type:offset:magic:mask:interpreter:flags (where you can choose the ':' upon your needs) and echo it to /proc/sys/fs/binfmt_misc/register. Here is what the fields mean: - 'name' is an identifier string. A new /proc file will be created with this @@ -34,6 +34,28 @@ Here is what the fields mean: The mask is anded with the byte sequence of the file. - 'interpreter' is the program that should be invoked with the binary as first argument (specify the full path) + - 'flags' is an optional field that controls several aspects of the invocation + of the interpreter. It is a string of capital letters, each controls a certain + aspect. The following flags are supported - + 'P' - preserve-argv[0]. Legacy behavior of binfmt_misc is to overwrite the + original argv[0] with the full path to the binary. 
When this flag is + included, binfmt_misc will add an argument to the argument vector for + this purpose, thus preserving the original argv[0]. + 'O' - open-binary. Legacy behavior of binfmt_misc is to pass the full path + of the binary to the interpreter as an argument. When this flag is + included, binfmt_misc will open the file for reading and pass its + descriptor as an argument, instead of the full path, thus allowing + the interpreter to execute non-readable binaries. This feature should + be used with care - the interpreter has to be trusted not to emit + the contents of the non-readable binary. + 'C' - credentials. Currently, the behavior of binfmt_misc is to calculate + the credentials and security token of the new process according to + the interpreter. When this flag is included, these attributes are + calculated according to the binary. It also implies the 'O' flag. + This feature should be used with care as the interpreter + will run with root permissions when a setuid binary owned by root + is run with binfmt_misc. + There are some restrictions: - the whole register string may not exceed 255 characters @@ -83,9 +105,9 @@ If you want to pass special arguments to write a wrapper script for it. See Documentation/java.txt for an example. -Your interpreter should NOT look in the PATH for the filename; the -kernel passes it the full filename to use. Using the PATH can cause -unexpected behaviour and be a security hazard. +Your interpreter should NOT look in the PATH for the filename; the kernel +passes it the full filename (or the file descriptor) to use. Using $PATH can +cause unexpected behaviour and can be a security hazard. There is a web page about binfmt_misc at --- linux-2.6.4-rc1/Documentation/DMA-mapping.txt 2003-08-22 19:23:39.000000000 -0700 +++ 25/Documentation/DMA-mapping.txt 2004-02-29 13:07:52.000000000 -0800 @@ -283,7 +283,7 @@ There are two types of DMA mappings: in order to get correct behavior on all platforms. 
- Streaming DMA mappings which are usually mapped for one DMA transfer, - unmapped right after it (unless you use pci_dma_sync below) and for which + unmapped right after it (unless you use pci_dma_sync_* below) and for which hardware can optimize for sequential accesses. This of "streaming" as "asynchronous" or "outside the coherency @@ -543,14 +543,30 @@ same bus address space) and you could re all bus addresses. If you need to use the same streaming DMA region multiple times and touch -the data in between the DMA transfers, just map it with -pci_map_{single,sg}, and after each DMA transfer call either: +the data in between the DMA transfers, the buffer needs to be synced +properly in order for the cpu and device to see the most uptodate and +correct copy of the DMA buffer. - pci_dma_sync_single(dev, dma_handle, size, direction); +So, firstly, just map it with pci_map_{single,sg}, and after each DMA +transfer call either: + + pci_dma_sync_single_for_cpu(dev, dma_handle, size, direction); or: - pci_dma_sync_sg(dev, sglist, nents, direction); + pci_dma_sync_sg_for_cpu(dev, sglist, nents, direction); + +as appropriate. + +Then, if you wish to let the device get at the DMA area again, +finish accessing the data with the cpu, and then before actually +giving the buffer to the hardware call either: + + pci_dma_sync_single_for_device(dev, dma_handle, size, direction); + +or: + + pci_dma_sync_sg_for_device(dev, sglist, nents, direction); as appropriate. @@ -590,8 +606,9 @@ to use the pci_dma_sync_*() interfaces. * the DMA transfer with the CPU first * so that we see updated contents. */ - pci_dma_sync_single(cp->pdev, cp->rx_dma, cp->rx_len, - PCI_DMA_FROMDEVICE); + pci_dma_sync_single_for_cpu(cp->pdev, cp->rx_dma, + cp->rx_len, + PCI_DMA_FROMDEVICE); /* Now it is safe to examine the buffer. */ hp = (struct my_card_header *) cp->rx_buf; @@ -601,7 +618,13 @@ to use the pci_dma_sync_*() interfaces. 
pass_to_upper_layers(cp->rx_buf); make_and_setup_new_rx_buf(cp); } else { - /* Just give the buffer back to the card. */ + /* Just sync the buffer and give it back + * to the card. + */ + pci_dma_sync_single_for_device(cp->pdev, + cp->rx_dma, + cp->rx_len, + PCI_DMA_FROMDEVICE); give_rx_buf_to_card(cp); } } @@ -709,12 +732,21 @@ interfaces. To reiterate: When the DMA transfer is complete, invoke: - void pci_dac_dma_sync_single(struct pci_dev *pdev, - dma64_addr_t dma_addr, - size_t len, int direction); + void pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, + dma64_addr_t dma_addr, + size_t len, int direction); This must be done before the CPU looks at the buffer again. -This interface behaves identically to pci_dma_sync_{single,sg}(). +This interface behaves identically to pci_dma_sync_{single,sg}_for_cpu(). + +And likewise, if you wish to let the device get back at the buffer after +the cpu has read/written it, invoke: + + void pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, + dma64_addr_t dma_addr, + size_t len, int direction); + +before letting the device access the DMA area again. If you need to get back to the PAGE/OFFSET tuple from a dma64_addr_t the following interfaces are provided: --- linux-2.6.4-rc1/Documentation/DocBook/gadget.tmpl 2003-09-27 18:57:43.000000000 -0700 +++ 25/Documentation/DocBook/gadget.tmpl 2004-02-29 13:07:49.000000000 -0800 @@ -454,6 +454,7 @@ but some optional utilities are provided !Edrivers/usb/gadget/usbstring.c +!Edrivers/usb/gadget/config.c --- linux-2.6.4-rc1/Documentation/early-userspace/README 2003-08-22 19:23:39.000000000 -0700 +++ 25/Documentation/early-userspace/README 2004-02-29 13:08:30.000000000 -0800 @@ -71,5 +71,31 @@ custom initramfs images that meet your n For questions and help, you can sign up for the early userspace mailing list at http://www.zytor.com/mailman/listinfo/klibc +How does it work? 
+================= + +The kernel currently has 3 ways to mount the root filesystem: + +a) all required device and filesystem drivers compiled into the kernel, no + initrd. init/main.c:init() will call prepare_namespace() to mount the + final root filesystem, based on the root= option and optional init= to run + some other init binary than listed at the end of init/main.c:init(). + +b) some device and filesystem drivers built as modules and stored in an + initrd. The initrd must contain a binary '/linuxrc' which is supposed to + load these driver modules. It is also possible to mount the final root + filesystem via linuxrc and use the pivot_root syscall. The initrd is + mounted and executed via prepare_namespace(). + +c) using initramfs. The call to prepare_namespace() must be skipped. + This means that a binary must do all the work. Said binary can be stored + into initramfs either via modifying usr/gen_init_cpio.c or via the new + initrd format, a cpio archive. It must be called "/init". This binary + is responsible for doing all the things prepare_namespace() would do. + + To maintain backwards compatibility, the /init binary will only run if it + comes via an initramfs cpio archive. If this is not the case, + init/main.c:init() will run prepare_namespace() to mount the final root + and exec one of the predefined init binaries. Bryan O'Sullivan --- linux-2.6.4-rc1/Documentation/filesystems/proc.txt 2004-02-27 16:17:18.000000000 -0800 +++ 25/Documentation/filesystems/proc.txt 2004-02-29 13:09:10.000000000 -0800 @@ -38,6 +38,7 @@ Table of Contents 2.8 /proc/sys/net/ipv4 - IPV4 settings 2.9 Appletalk 2.10 IPX + 2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem ------------------------------------------------------------------------------ Preface @@ -1814,6 +1815,30 @@ The /proc/net/ipx_route table holds a gives the destination network, the router node (or Directly) and the network address of the router (or Connected) for internal networks. 
+2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem +---------------------------------------------------------- + +The "mqueue" filesystem provides the necessary kernel features to enable the +creation of a user space library that implements the POSIX message queues +API (as noted by the MSG tag in the POSIX 1003.1-2001 version of the System +Interfaces specification.) + +The "mqueue" filesystem contains values for determining/setting the amount of +resources used by the file system. + +/proc/sys/fs/mqueue/queues_max is a read/write file for setting/getting the +maximum number of message queues allowed on the system. + +/proc/sys/fs/mqueue/msg_max is a read/write file for setting/getting the +maximum number of messages in a queue value. In fact it is the limiting value +for another (user) limit which is set in mq_open invocation. This attribute of +a queue must be less than or equal to msg_max. + +/proc/sys/fs/mqueue/msgsize_max is a read/write file for setting/getting the +maximum message size value (it is every message queue's attribute set during +its creation). 
+ + ------------------------------------------------------------------------------ Summary ------------------------------------------------------------------------------ --- linux-2.6.4-rc1/Documentation/filesystems/ufs.txt 2003-06-14 12:18:29.000000000 -0700 +++ 25/Documentation/filesystems/ufs.txt 2004-02-29 13:08:44.000000000 -0800 @@ -20,6 +20,9 @@ ufstype=type_of_ufs 44bsd used in FreeBSD, NetBSD, OpenBSD supported os read-write + ufs2 used in FreeBSD 5.x + supported os read-only + sun used in SunOS (Solaris) supported as read-write --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/Documentation/i386/kgdb/andthen 2004-02-29 13:07:58.000000000 -0800 @@ -0,0 +1,100 @@ + +define set_andthen + set var $thp=0 + set var $thp=(struct kgdb_and_then_struct *)&kgdb_data[0] + set var $at_size = (sizeof kgdb_data)/(sizeof *$thp) + set var $at_oc=kgdb_and_then_count + set var $at_cc=$at_oc +end + +define andthen_next + set var $at_cc=$arg0 +end + +define andthen + andthen_set_edge + if ($at_cc >= $at_oc) + printf "Outside window. Window size is %d\n",($at_oc-$at_low) + else + printf "%d: ",$at_cc + output *($thp+($at_cc++ % $at_size )) + printf "\n" + end +end +define andthen_set_edge + set var $at_oc=kgdb_and_then_count + set var $at_low = $at_oc - $at_size + if ($at_low < 0 ) + set var $at_low = 0 + end + if (( $at_cc > $at_oc) || ($at_cc < $at_low)) + printf "Count outside of window, setting count to " + if ($at_cc >= $at_oc) + set var $at_cc = $at_oc + else + set var $at_cc = $at_low + end + printf "%d\n",$at_cc + end +end + +define beforethat + andthen_set_edge + if ($at_cc <= $at_low) + printf "Outside window. Window size is %d\n",($at_oc-$at_low) + else + printf "%d: ",$at_cc-1 + output *($thp+(--$at_cc % $at_size )) + printf "\n" + end +end + +document andthen_next + andthen_next + . sets the number of the event to display next. If this event + . is not in the event pool, either andthen or beforethat will + . correct it to the nearest event pool edge. 
The event pool + . ends at the last event recorded and begins + . prior to that. If beforethat is used next, it will display + . event -1. +. + andthen commands are: set_andthen, andthen_next, andthen and beforethat +end + + +document andthen + andthen +. displays the next event in the list. sets up to display +. the oldest saved event first. +. (optional) count of the event to display. +. note the number of events saved is specified at configure time. +. if events are saved between calls to andthen the index will change +. but the displayed event will be the next one (unless the event buffer +. is overrun). +. +. andthen commands are: set_andthen, andthen_next, andthen and beforethat +end + +document set_andthen + set_andthen +. sets up to use the and commands. +. if you have defined your own struct, use the above and +. then enter the following: +. p $thp=(struct kgdb_and_then_structX *)&kgdb_data[0] +. where is the name of your structure. +. +. andthen commands are: set_andthen, andthen_next, andthen and beforethat +end + +document beforethat + beforethat +. displays the next prior event in the list. sets up to +. display the last occuring event first. +. +. note the number of events saved is specified at configure time. +. if events are saved between calls to beforethat the index will change +. but the displayed event will be the next one (unless the event buffer +. is overrun). +. +. andthen commands are: set_andthen, andthen_next, andthen and beforethat +end --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/Documentation/i386/kgdb/debug-nmi.txt 2004-02-29 13:07:58.000000000 -0800 @@ -0,0 +1,37 @@ +Subject: Debugging with NMI +Date: Mon, 12 Jul 1999 11:28:31 -0500 +From: David Grothe +Organization: Gcom, Inc +To: David Grothe + +Kernel hackers: + +Maybe this is old hat, but it is new to me -- + +On an ISA bus machine, if you short out the A1 and B1 pins of an ISA +slot you will generate an NMI to the CPU. 
This interrupts even a +machine that is hung in a loop with interrupts disabled. Used in +conjunction with kgdb < +ftp://ftp.gcom.com/pub/linux/src/kgdb-2.3.35/kgdb-2.3.35.tgz > you can +gain debugger control of a machine that is hung in the kernel! Even +without kgdb the kernel will print a stack trace so you can find out +where it was hung. + +The A1/B1 pins are directly opposite one another and the farthest pins +towards the bracket end of the ISA bus socket. You can stick a paper +clip or multi-meter probe between them to short them out. + +I had a spare ISA bus to PC104 bus adapter around. The PC104 end of the +board consists of two rows of wire wrap pins. So I wired a push button +between the A1/B1 pins and now have an ISA board that I can stick into +any ISA bus slot for debugger entry. + +Microsoft has a circuit diagram of a PCI card at +http://www.microsoft.com/hwdev/DEBUGGING/DMPSW.HTM. If you want to +build one you will have to mail them and ask for the PAL equations. +Nobody makes one comercially. + +[THIS TIP COMES WITH NO WARRANTY WHATSOEVER. It works for me, but if +your machine catches fire, it is your problem, not mine.] + +-- Dave (the kgdb guy) --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/Documentation/i386/kgdb/gdb-globals.txt 2004-02-29 13:07:58.000000000 -0800 @@ -0,0 +1,71 @@ +Sender: akale@veritas.com +Date: Fri, 23 Jun 2000 19:26:35 +0530 +From: "Amit S. Kale" +Organization: Veritas Software (India) +To: Dave Grothe , linux-kernel@vger.rutgers.edu +CC: David Milburn , + "Edouard G. Parmelan" , + ezannoni@cygnus.com, Keith Owens +Subject: Re: Module debugging using kgdb + +Dave Grothe wrote: +> +> Amit: +> +> There is a 2.4.0 version of kgdb on our ftp site: +> ftp://ftp.gcom.com/pub/linux/src/kgdb. I mirrored your version of gdb +> and loadmodule.sh there. +> +> Have a look at the README file and see if I go it right. If not, send +> me some corrections and I will update it. 
+> +> Does your version of gdb solve the global variable problem? + +Yes. +Thanks to Elena Zanoni, gdb (developement version) can now calculate +correctly addresses of dynamically loaded object files. I have not been +following gdb developement for sometime and am not sure when symbol +address calculation fix is going to appear in a gdb stable version. + +Elena, any idea when the fix will make it to a prebuilt gdb from a +redhat release? + +For the time being I have built a gdb developement version. It can be +used for module debugging with loadmodule.sh script. + +The problem with calculating of module addresses with previous versions +of gdb was as follows: +gdb did not use base address of a section while calculating address of +a symbol in the section in an object file loaded via 'add-symbol-file'. +It used address of .text segment instead. Due to this addresses of +symbols in .data, .bss etc. (e.g. global variables) were calculated incorrectly. + +Above mentioned fix allow gdb to use base address of a segment while +calculating address of a symbol in it. It adds a parameter '-s' to +'add-symbol-file' command for specifying base address of a segment. + +loadmodule.sh script works as follows. + +1. Copy a module file to target machine. +2. Load the module on the target machine using insmod with -m parameter. +insmod produces a module load map which contains base addresses of all +sections in the module and addresses of symbols in the module file. +3. Find all sections and their base addresses in the module from +the module map. +4. Generate a script that loads the module file. The script uses +'add-symbol-file' and specifies address of text segment followed by +addresses of all segments in the module. + +Here is an example gdb script produced by loadmodule.sh script. 
+ +add-symbol-file foo 0xd082c060 -s .text.lock 0xd08cbfb5 +-s .fixup 0xd08cfbdf -s .rodata 0xd08cfde0 -s __ex_table 0xd08e3b38 +-s .data 0xd08e3d00 -s .bss 0xd08ec8c0 -s __ksymtab 0xd08ee838 + +With this command gdb can calculate addresses of symbols in ANY segment +in a module file. + +Regards. +-- +Amit Kale +Veritas Software ( http://www.veritas.com ) --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/Documentation/i386/kgdb/gdbinit 2004-02-29 13:07:58.000000000 -0800 @@ -0,0 +1,14 @@ +shell echo -e "\003" >/dev/ttyS0 +set remotebaud 38400 +target remote /dev/ttyS0 +define si +stepi +printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx +printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp +x/i $eip +end +define ni +nexti +printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx +printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp +x/i $eip --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/Documentation/i386/kgdb/gdbinit.hw 2004-02-29 13:07:58.000000000 -0800 @@ -0,0 +1,117 @@ + +#Using ia-32 hardware breakpoints. +# +#4 hardware breakpoints are available in ia-32 processors. These breakpoints +#do not need code modification. They are set using debug registers. +# +#Each hardware breakpoint can be of one of the +#three types: execution, write, access. +#1. An Execution breakpoint is triggered when code at the breakpoint address is +#executed. +#2. A write breakpoint ( aka watchpoints ) is triggered when memory location +#at the breakpoint address is written. +#3. An access breakpoint is triggered when memory location at the breakpoint +#address is either read or written. +# +#As hardware breakpoints are available in limited number, use software +#breakpoints ( br command in gdb ) instead of execution hardware breakpoints. +# +#Length of an access or a write breakpoint defines length of the datatype to +#be watched. Length is 1 for char, 2 short , 3 int. 
+# +#For placing execution, write and access breakpoints, use commands +#hwebrk, hwwbrk, hwabrk +#To remove a breakpoint use hwrmbrk command. +# +#These commands take following types of arguments. For arguments associated +#with each command, use help command. +#1. breakpointno: 0 to 3 +#2. length: 1 to 3 +#3. address: Memory location in hex ( without 0x ) e.g c015e9bc +# +#Use the command exinfo to find which hardware breakpoint occurred. + +#hwebrk breakpointno address +define hwebrk + maintenance packet Y$arg0,0,0,$arg1 +end +document hwebrk + hwebrk <breakpointno> <address>
+ Places a hardware execution breakpoint + <breakpointno> = 0 - 3 + <address>
= Hex digits without leading "0x". +end + +#hwwbrk breakpointno length address +define hwwbrk + maintenance packet Y$arg0,1,$arg1,$arg2 +end +document hwwbrk + hwwbrk <breakpointno> <length> <address>
+ Places a hardware write breakpoint + <breakpointno> = 0 - 3 + <length> = 1 (1 byte), 2 (2 byte), 3 (4 byte) + <address>
= Hex digits without leading "0x". +end + +#hwabrk breakpointno length address +define hwabrk + maintenance packet Y$arg0,1,$arg1,$arg2 +end +document hwabrk + hwabrk <breakpointno> <length> <address>
+ Places a hardware access breakpoint + <breakpointno> = 0 - 3 + <length> = 1 (1 byte), 2 (2 byte), 3 (4 byte) + <address>
= Hex digits without leading "0x". +end + +#hwrmbrk breakpointno +define hwrmbrk + maintenance packet y$arg0 +end +document hwrmbrk + hwrmbrk <breakpointno> + <breakpointno> = 0 - 3 + Removes a hardware breakpoint +end + +define reboot + maintenance packet r +end +#exinfo +define exinfo + maintenance packet qE +end +document exinfo + exinfo + Gives information about a breakpoint. +end +define get_th + p $th=(struct thread_info *)((int)$esp & ~8191) +end +document get_th + get_th + Gets and prints the current thread_info pointer, Defines $th to be it. +end +define get_cu + p $cu=((struct thread_info *)((int)$esp & ~8191))->task +end +document get_cu + get_cu + Gets and prints the "current" value. Defines $cu to be it. +end +define int_off + set var $flags=$eflags + set $eflags=$eflags&~0x200 + end +define int_on + set var $eflags|=$flags&0x200 + end +document int_off + saves the current interrupt state and clears the processor interrupt + flag. Use int_on to restore the saved flag. +end +document int_on + Restores the interrupt flag saved by int_off. +end --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/Documentation/i386/kgdb/gdbinit-modules 2004-02-29 13:07:58.000000000 -0800 @@ -0,0 +1,146 @@ +# +# Useful GDB user-command to debug Linux Kernel Modules with gdbstub. +# +# This doesn't work for Linux-2.0 or older. +# +# Author Edouard G. Parmelan +# +# +# Fri Apr 30 20:33:29 CEST 1999 +# First public release. +# +# Major cleanup after experiment Linux-2.0 kernel without success. +# Symbols of a module are not in the correct order, I can't explain +# why :( +# +# Fri Mar 19 15:41:40 CET 1999 +# Initial version. +# +# Thu Jan 6 16:29:03 CST 2000 +# A little fixing by Dave Grothe +# +# Mon Jun 19 09:33:13 CDT 2000 +# Alignment changes from Edouard Parmelan +# +# The basic idea is to find where insmod loads the module and inform +# GDB to load the symbol table of the module with the GDB command +# ``add-symbol-file <object> <address>
''. +# +# The Linux kernel holds the list of all loaded modules in module_list, +# this list end with &kernel_module (exactly with module->next == NULL, +# but the last module is not a real module). +# +# Insmod allocates the struct module before the object file. Since +# Linux-2.1, this structure contain his size. The real address of +# the object file is then (char*)module + module->size_of_struct. +# +# You can use three user functions ``mod-list'', ``mod-print-symbols'' +# and ``add-module-symbols''. +# +# mod-list list all loaded modules with the format: +# +# +# As soon as you have found the address of your module, you can +# print its exported symbols (mod-print-symbols) or inform GDB to add +# symbols from your module file (mod-add-symbols). +# +# The argument that you give to mod-print-symbols or mod-add-symbols +# is the from the mod-list command. +# +# When using the mod-add-symbols command you must also give the full +# pathname of the modules object code file. +# +# The command mod-add-lis is an example of how to make this easier. +# You can edit this macro to contain the path name of your own +# favorite module and then use it as a shorthand to load it. You +# still need the module-address, however. +# +# The internal function ``mod-validate'' set the GDB variable $mod +# as a ``struct module*'' if the kernel known the module otherwise +# $mod is set to NULL. This ensure to not add symbols for a wrong +# address. +# +# Have a nice hacking day ! +# +# +define mod-list + set $mod = (struct module*)module_list + # the last module is the kernel, ignore it + while $mod != &kernel_module + printf "%p\t%s\n", (long)$mod, ($mod)->name + set $mod = $mod->next + end +end +document mod-list +List all modules in the form: +Use the as the argument for the other +mod-commands: mod-print-symbols, mod-add-symbols. 
+end + +define mod-validate + set $mod = (struct module*)module_list + while ($mod != $arg0) && ($mod != &kernel_module) + set $mod = $mod->next + end + if $mod == &kernel_module + set $mod = 0 + printf "%p is not a module\n", $arg0 + end +end +document mod-validate +mod-validate +Internal user-command used to validate the module parameter. +If is a real loaded module, set $mod to it otherwise set $mod to 0. +end + + +define mod-print-symbols + mod-validate $arg0 + if $mod != 0 + set $i = 0 + while $i < $mod->nsyms + set $sym = $mod->syms[$i] + printf "%p\t%s\n", $sym->value, $sym->name + set $i = $i + 1 + end + end +end +document mod-print-symbols +mod-print-symbols +Print all exported symbols of the module. see mod-list +end + + +define mod-add-symbols-align + mod-validate $arg0 + if $mod != 0 + set $mod_base = ($mod->size_of_struct + (long)$mod) + if ($arg2 != 0) && (($mod_base & ($arg2 - 1)) != 0) + set $mod_base = ($mod_base | ($arg2 - 1)) + 1 + end + add-symbol-file $arg1 $mod_base + end +end +document mod-add-symbols-align +mod-add-symbols-align +Load the symbols table of the module from the object file where +first section aligment is . +To retreive alignment, use `objdump -h '. +end + +define mod-add-symbols + mod-add-symbols-align $arg0 $arg1 sizeof(long) +end +document mod-add-symbols +mod-add-symbols +Load the symbols table of the module from the object file. +Default alignment is 4. See mod-add-symbols-align. 
+end + +define mod-add-lis + mod-add-symbols-align $arg0 /usr/src/LiS/streams.o 16 +end +document mod-add-lis +mod-add-lis +Does mod-add-symbols /usr/src/LiS/streams.o +end --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/Documentation/i386/kgdb/kgdbeth.txt 2004-02-29 13:07:59.000000000 -0800 @@ -0,0 +1,92 @@ +KGDB over ethernet +================== + +Authors +------- + +Robert Walsh (2.6 port) +wangdi (2.6 port) +Matt Mackall (netpoll api) +San Mehat (original 2.4 code) + + +Introduction +------------ + +KGDB supports debugging over ethernet (kgdboe) via polling of a given +network interface. Most cards should be supported automatically. +Debugging facilities are available as soon as the network driver and +kgdboe have initialized. Unfortunately, this is too late in the boot +process for debugging some issues, but works quite well for many +others. This should not interfere with normal network usage and +doesn't require a dedicated NIC. + +Terminology +----------- + +This document uses the following terms: + + TARGET: the machine being debugged. + HOST: the machine running gdb. 
+ + +Usage +----- + +You need to use the following command-line option on the TARGET kernel: + + kgdboe=[tgt-port]@/[dev],[host-port]@/[host-macaddr] + + where + tgt-port source for UDP packets (defaults to 6443) + tgt-ip source IP to use (interface address) + dev network interface (eth0) + host-port HOST UDP port (6442) (not really used) + host-ip IP address for HOST machine + host-macaddr ethernet MAC address for HOST (ff:ff:ff:ff:ff:ff) + + examples: + + kgdboe=7000@192.168.0.1/eth1,7001@192.168.0.2/00:05:3C:04:47:5D + this machine is 192.168.0.1 on eth1 + remote machine is 192.168.0.2 with MAC address 00:05:3C:04:47:5D + listen for gdb packets on port 7000 + send unsolicited gdb packets to port 7001 + + kgdboe=@192.168.0.1/,@192.168.0.2/ + this machine is 192.168.0.1 on default interface eth0 + remote machine is 192.168.0.2, use default broadcast MAC address + listen for gdb packets on default port 6443 + send unsolicited gdb packets to port 6442 + +Only packets originating from the configured HOST IP address will be +accepted by the debugger. + +On the HOST side, run gdb as normal and use a remote UDP host as the +target: + + % gdb ./vmlinux + GNU gdb Red Hat Linux (5.3post-0.20021129.18rh) + Copyright 2003 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux-gnu"... + (gdb) target remote udp:HOSTNAME:6443 + +You can now continue as if you were debugging over a serial line. + +Limitations +----------- + +The current release of this code is exclusive of using kgdb on a +serial interface, so you must boot without the kgdboe option to use +serial debugging. Trying to debug the network driver while using it +will prove interesting. 
+ +Bug reports +----------- + +Send bug reports to Robert Walsh and Matt +Mackall . --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/Documentation/i386/kgdb/kgdb.txt 2004-02-29 13:07:58.000000000 -0800 @@ -0,0 +1,775 @@ +Last edit: <20030806.1637.12> +This file has information specific to the i386 kgdb option. Other +platforms with the kgdb option may behave in a similar fashion. + +New features: +============ +20030806.1557.37 +This version was made against the 2.6.0-test2 kernel. We have made the +following changes: + +- The getthread() code in the stub calls find_task_by_pid(). It fails + if we are early in the bring up such that the pid arrays have yet to + be allocated. We have added a line to kernel/pid.c to make + "kgdb_pid_init_done" true once the arrays are allocated. This way the + getthread() code knows not to call. This is only used by the thread + debugging stuff and threads will not yet exist at this point in the + boot. + +- For some reason, gdb was not asking for a new thread list when the + "info thread" command was given. We changed to the newer version of + the thread info command and gdb now seems to ask when needed. Result, + we now get all threads in the thread list. + +- We now respond to the ThreadExtraInfo request from gdb with the thread + name from task_struct .comm. This then appears in the thread list. + Thoughts on additional options for this are welcome. Things such as + "has BKL" and "Preempted" come to mind. I think we could have a flag + word that could enable different bits of info here. + +- We now honor, sort of, the C and S commands. These are continue and + single set after delivering a signal. We ignore the signal and do the + requested action. This only happens when we told gdb that a signal + was the reason for entry, which is only done on memory faults. The + result is that you can now continue into the Oops. + +- We changed the -g to -gdwarf-2. 
This seems to be the same as -ggdb, + but it is more exact on what language to use. + +- We added two dwarf2 include files and a bit of code at the end of + entry.S. This does not yet work, so it is disabled. Still we want to + keep track of the code and "maybe" someone out there can fix it. + +- Randy Dunlap sent some fix ups for this file which are now merged. + +- Hugh Dickins sent a fix to a bit of code in traps.c that prevents a + compiler warning if CONFIG_KGDB is off (now who would do that :). + +- Andrew Morton sent a fix for the serial driver which is now merged. + +- Andrew also sent a change to the stub around the cpu managment code + which is also merged. + +- Andrew also sent a patch to make "f" as well as "g" work as SysRq + commands to enter kgdb, merged. + +- If CONFIG_KGDB and CONFIG_DEBUG_SPINLOCKS are both set we added a + "who" field to the spinlock data struct. This is filled with + "current" when ever the spinlock suceeds. Useful if you want to know + who has the lock. + +_ And last, but not least, we fixed the "get_cu" macro to properly get + the current value of "current". + +New features: +============ +20030505.1827.27 +We are starting to align with the sourceforge version, at least in +commands. To this end, the boot command string to start kgdb at +boot time has been changed from "kgdb" to "gdb". + +Andrew Morton sent a couple of patches which are now included as follows: +1.) We now return a flag to the interrupt handler. +2.) We no longer use smp_num_cpus (a conflict with the lock meter). +3.) And from William Lee Irwin III code to make + sure high-mem is set up before we attempt to register our interrupt + handler. +We now include asm/kgdb.h from config.h so you will most likely never +have to include it. It also 'NULLS' the kgdb macros you might have in +your code when CONFIG_KGDB is not defined. This allows you to just +turn off CONFIG_KGDB to turn off all the kgdb_ts() calls and such. 
+This include is conditioned on the machine being an x86 so as to not +mess with other archs. + +20020801.1129.03 +This is currently the version for the 2.4.18 (and beyond?) kernel. + +We have several new "features" beginning with this version: + +1.) Kgdb now syncs the "other" CPUs with a cross-CPU NMI. No more + waiting and it will pull that guy out of an IRQ off spin lock :) + +2.) We doctored up the code that tells where a task is waiting and + included it so that the "info thread" command will show a bit more + than "schedule()". Try it... + +3.) Added the ability to call a function from gdb. All the standard gdb + issues apply, i.e. if you hit a breakpoint in the function, you are + not allowed to call another (gdb limitation, not kgdb). To help + this capability we added a memory allocation function. Gdb does not + return this memory (it is used for strings that you pass to that function + you are calling from gdb) so we fixed up a way to allow you to + manually return the memory (see below). + +4.) Kgdb time stamps (kgdb_ts()) are enhanced to expand what was the + interrupt flag to now also include the preemption count and the + "in_interrupt" info. The flag is now called "with_pif" to indicate + the order, preempt_count, in_interrupt, flag. The preempt_count is + shifted left by 4 bits so you can read the count in hex by dropping + the low order digit. In_interrupt is in bit 1, and the flag is in + bit 0. + +5.) The command: "p kgdb_info" is now expanded and prints something + like: +(gdb) p kgdb_info +$2 = {used_malloc = 0, called_from = 0xc0107506, entry_tsc = 67468627259, + errcode = 0, vector = 3, print_debug_info = 0, hold_on_sstep = 1, + cpus_waiting = {{task = 0xc027a000, pid = 32768, hold = 0, + regs = 0xc027bf84}, {task = 0x0, pid = 0, hold = 0, regs = 0x0}}} + + Things to note here: a.) used_malloc is the amount of memory that + has been malloc'ed to do calls from gdb. 
You can reclaim this + memory like this: "p kgdb_info.used_malloc=0" Cool, huh? b.) + cpus_waiting is now "sized" by the number of CPUs you enter at + configure time in the kgdb configure section. This is NOT used + anywhere else in the system, but it is "nice" here. c.) The task's + "pid" is now in the structure. This is the pid you will need to use + to decode to the thread id to get gdb to look at that thread. + Remember that the "info thread" command prints a list of threads + wherein it numbers each thread with its reference number followed + by the thread's pid. Note that the per-CPU idle threads actually + have pids of 0 (yes, there is more than one pid 0 in an SMP system). + To avoid confusion, kgdb numbers these threads with numbers beyond + the MAX_PID. That is why you see 32768 and above. + +6.) A subtle change, we now provide the complete register set for tasks + that are active on the other CPUs. This allows better trace back on + those tasks. + + And, let's mention what we could not fix. Back-trace from all but the + thread that we trapped will, most likely, have a bogus entry in it. + The problem is that gdb does not recognize the entry code for + functions that use "current" near (at all?) the entry. The compiler + is putting the "current" decode as the first two instructions of the + function where gdb expects to find %ebp changing code. Back trace + also has trouble with interrupt frames. I am talking with Daniel + Jacobowitz about some way to fix this, but don't hold your breath. + +20011220.0050.35 +Major enhancement with this version is the ability to hold one or more +CPUs in an SMP system while allowing the others to continue. Also, by +default only the current CPU is enabled on single-step commands (please +note that gdb issues single-step commands at times other than when you +use the si command). + +Another change is to collect some useful information in +a global structure called "kgdb_info". 
You should be able to just: + +p kgdb_info + +although I have seen cases where the first time this is done gdb just +prints the first member but prints the whole structure if you then enter +CR (carriage return or enter). This also works: + +p *&kgdb_info + +Here is a sample: +(gdb) p kgdb_info +$4 = {called_from = 0xc010732c, entry_tsc = 32804123790856, errcode = 0, + vector = 3, print_debug_info = 0} + +"Called_from" is the return address from the current entry into kgdb. +Sometimes it is useful to know why you are in kgdb, for example, was +it an NMI or a real breakpoint? The simple way to interrogate this +return address is: + +l *0xc010732c + +which will print the surrounding few lines of source code. + +"Entry_tsc" is the CPU TSC on entry to kgdb (useful to compare to the +kgdb_ts entries). + +"errcode" and "vector" are other entry parameters which may be helpful on +some traps. + +"print_debug_info" is the internal debugging kgdb print enable flag. Yes, +you can modify it. + +In SMP systems kgdb_info also includes the "cpus_waiting" structure and +"hold_on_step": + +(gdb) p kgdb_info +$7 = {called_from = 0xc0112739, entry_tsc = 1034936624074, errcode = 0, + vector = 2, print_debug_info = 0, hold_on_sstep = 1, cpus_waiting = {{ + task = 0x0, hold = 0, regs = 0x0}, {task = 0xc71b8000, hold = 0, + regs = 0xc71b9f70}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, + hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, + hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, + hold = 0, regs = 0x0}}} + +"Cpus_waiting" has an entry for each CPU other than the current one that +has been stopped. Each entry contains the task_struct address for that +CPU, the address of the regs for that task and a hold flag. All these +have the proper typing so that, for example: + +p *kgdb_info.cpus_waiting[1].regs + +will print the registers for CPU 1. + +"Hold_on_sstep" is a new feature with this version and comes up set or +true. 
What this means is that whenever kgdb is asked to single-step all +other CPUs are held (i.e. not allowed to execute). The flag applies to +all but the current CPU and, again, can be changed: + +p kgdb_info.hold_on_sstep=0 + +restores the old behavior of letting all CPUs run during single-stepping. + +Likewise, each CPU has a "hold" flag, which if set, locks that CPU out +of execution. Note that this has some risk in cases where the CPUs need +to communicate with each other. If kgdb finds no CPU available on exit, +it will push a message thru gdb and stay in kgdb. Note that it is legal +to hold the current CPU as long as at least one CPU can execute. + +20010621.1117.09 +This version implements an event queue. Events are signaled by calling +a function in the kgdb stub and may be examined from gdb. See EVENTS +below for details. This version also tightens up the interrupt and SMP +handling to not allow interrupts on the way to kgdb from a breakpoint +trap. It is fine to allow these interrupts for user code, but not +system debugging. + +Version +======= + +This version of the kgdb package was developed and tested on +kernel version 2.4.16. It will not install on any earlier kernels. +It is possible that it will continue to work on later versions +of 2.4 and then versions of 2.5 (I hope). + + +Debugging Setup +=============== + +Designate one machine as the "development" machine. This is the +machine on which you run your compiles and which has your source +code for the kernel. Designate a second machine as the "target" +machine. This is the machine that will run your experimental +kernel. + +The two machines will be connected together via a serial line out +one or the other of the COM ports of the PC. You will need the +appropriate modem eliminator (null modem) cable(s) for this. + +Decide on which tty port you want the machines to communicate, then +connect them up back-to-back using the null modem cable. COM1 is +/dev/ttyS0 and COM2 is /dev/ttyS1. 
You should test this connection +with the two machines prior to trying to debug a kernel. Once you +have it working, on the TARGET machine, enter: + +setserial /dev/ttyS0 (or what ever tty you are using) + +and record the port address and the IRQ number. + +On the DEVELOPMENT machine you need to apply the patch for the kgdb +hooks. You have probably already done that if you are reading this +file. + +On your DEVELOPMENT machine, go to your kernel source directory and do +"make Xconfig" where X is one of "x", "menu", or "". If you are +configuring in the standard serial driver, it must not be a module. +Either yes or no is ok, but making the serial driver a module means it +will initialize after kgdb has set up the UART interrupt code and may +cause a failure of the control-C option discussed below. The configure +question for the serial driver is under the "Character devices" heading +and is: + +"Standard/generic (8250/16550 and compatible UARTs) serial support" + +Go down to the kernel debugging menu item and open it up. Enable the +kernel kgdb stub code by selecting that item. You can also choose to +turn on the "-ggdb -O1" compile options. The -ggdb causes the compiler +to put more debug info (like local symbols) in the object file. On the +i386 -g and -ggdb are the same so this option just reduces to "O1". The +-O1 reduces the optimization level. This may be helpful in some cases, +be aware, however, that this may also mask the problem you are looking +for. + +The baud rate. Default is 115200. What ever you choose be sure that +the host machine is set to the same speed. I recommend the default. + +The port. This is the I/O address of the serial UART that you should +have gotten using setserial as described above. The standard COM1 port +(3f8) using IRQ 4 is default. COM2 is 2f8 which by convention uses IRQ +3. + +The port IRQ (see above). + +Stack overflow test. 
This option makes a minor change in the trap, +system call and interrupt code to detect stack overflow and transfer +control to kgdb if it happens. (Some platforms have this in the +baseline code, but the i386 does not.) + +You can also configure the system to recognize the boot option +"console=kgdb" which if given will cause all console output during +booting to be put thru gdb as well as other consoles. This option +requires that gdb and kgdb be connected prior to sending console output +so, if they are not, a breakpoint is executed to force the connection. +This will happen before any kernel output (it is going thru gdb, right), +and will stall the boot until the connection is made. + +You can also configure in a patch to SysRq to enable the kGdb SysRq. +This request generates a breakpoint. Since the serial port IRQ line is +set up after any serial drivers, it is possible that this command will +work when the control-C will not. + +Save and exit the Xconfig program. Then do "make clean" , "make dep" +and "make bzImage" (or whatever target you want to make). This gets the +kernel compiled with the "-g" option set -- necessary for debugging. + +You have just built the kernel on your DEVELOPMENT machine that you +intend to run on your TARGET machine. + +To install this new kernel, use the following installation procedure. +Remember, you are on the DEVELOPMENT machine patching the kernel source +for the kernel that you intend to run on the TARGET machine. + +Copy this kernel to your target machine using your usual procedures. I +usually arrange to copy development: +/usr/src/linux/arch/i386/boot/bzImage to /vmlinuz on the TARGET machine +via a LAN based NFS access. That is, I run the cp command on the target +and copy from the development machine via the LAN. Run Lilo (see "man +lilo" for details on how to set this up) on the new kernel on the target +machine so that it will boot! Then boot the kernel on the target +machine. 
+ +On the DEVELOPMENT machine, create a file called .gdbinit in the +directory /usr/src/linux. An example .gdbinit file looks like this: + +shell echo -e "\003" >/dev/ttyS0 +set remotebaud 38400 (or what ever speed you have chosen) +target remote /dev/ttyS0 + + +Change the "echo" and "target" definition so that it specifies the tty +port that you intend to use. Change the "remotebaud" definition to +match the data rate that you are going to use for the com line. + +You are now ready to try it out. + +Boot your target machine with "kgdb" in the boot command i.e. something +like: + +lilo> test kgdb + +or if you also want console output thru gdb: + +lilo> test kgdb console=kgdb + +You should see the lilo message saying it has loaded the kernel and then +all output stops. The kgdb stub is trying to connect with gdb. Start +gdb something like this: + + +On your DEVELOPMENT machine, cd /usr/src/linux and enter "gdb vmlinux". +When gdb gets the symbols loaded it will read your .gdbinit file and, if +everything is working correctly, you should see gdb print out a few +lines indicating that a breakpoint has been taken. It will actually +show a line of code in the target kernel inside the kgdb activation +code. + +The gdb interaction should look something like this: + + linux-dev:/usr/src/linux# gdb vmlinux + GDB is free software and you are welcome to distribute copies of it + under certain conditions; type "show copying" to see the conditions. + There is absolutely no warranty for GDB; type "show warranty" for details. + GDB 4.15.1 (i486-slackware-linux), + Copyright 1995 Free Software Foundation, Inc... + breakpoint () at i386-stub.c:750 + 750 } + (gdb) + +You can now use whatever gdb commands you like to set breakpoints. +Enter "continue" to start your target machine executing again. At this +point the target system will run at full speed until it encounters +your breakpoint or gets a segment violation in the kernel, or whatever. 
+ +If you have the kgdb console enabled when you continue, gdb will print +out all the console messages. + +The above example caused a breakpoint relatively early in the boot +process. For the i386 kgdb it is possible to code a break instruction +as the first C-language point in init/main.c, i.e. as the first instruction +in start_kernel(). This could be done as follows: + +#include + breakpoint(); + +This breakpoint() is really a function that sets up the breakpoint and +single-step hardware trap cells and then executes a breakpoint. Any +early hard coded breakpoint will need to use this function. Once the +trap cells are set up they need not be set again, but doing it again +does not hurt anything, so you don't need to be concerned about which +breakpoint is hit first. Once the trap cells are set up (and the kernel +sets them up in due course even if breakpoint() is never called) the +macro: + +BREAKPOINT; + +will generate an inline breakpoint. This may be more useful as it stops +the processor at the instruction instead of in a function a step removed +from the location of interest. In either case must be +included to define both breakpoint() and BREAKPOINT. + +Triggering kgdbstub at other times +================================== + +Often you don't need to enter the debugger until much later in the boot +or even after the machine has been running for some time. Once the +kernel is booted and interrupts are on, you can force the system to +enter the debugger by sending a control-C to the debug port. This is +what the first line of the recommended .gdbinit file does. This allows +you to start gdb any time after the system is up as well as when the +system is already at a breakpoint. (In the case where the system is +already at a breakpoint the control-C is not needed, however, it will +be ignored by the target so no harm is done. Also note the the echo +command assumes that the port speed is already set. 
This will be true +once gdb has connected, but it is best to set the port speed before you +run gdb.) + +Another simple way to do this is to put the following file in you ~/bin +directory: + +#!/bin/bash +echo -e "\003" > /dev/ttyS0 + +Here, the ttyS0 should be replaced with what ever port you are using. +The "\003" is control-C. Once you are connected with gdb, you can enter +control-C at the command prompt. + +An alternative way to get control to the debugger is to enable the kGdb +SysRq command. Then you would enter Alt-SysRq-g (all three keys at the +same time, but push them down in the order given). To refresh your +memory of the available SysRq commands try Alt-SysRq-=. Actually any +undefined command could replace the "=", but I like to KNOW that what I +am pushing will never be defined. + +Debugging hints +=============== + +You can break into the target machine at any time from the development +machine by typing ^C (see above paragraph). If the target machine has +interrupts enabled this will stop it in the kernel and enter the +debugger. + +There is unfortunately no way of breaking into the kernel if it is +in a loop with interrupts disabled, so if this happens to you then +you need to place exploratory breakpoints or printk's into the kernel +to find out where it is looping. The exploratory breakpoints can be +entered either thru gdb or hard coded into the source. This is very +handy if you do something like: + +if () BREAKPOINT; + + +There is a copy of an e-mail in the Documentation/i386/kgdb/ directory +(debug-nmi.txt) which describes how to create an NMI on an ISA bus +machine using a paper clip. I have a sophisticated version of this made +by wiring a push button switch into a PC104/ISA bus adapter card. The +adapter card nicely furnishes wire wrap pins for all the ISA bus +signals. + +When you are done debugging the kernel on the target machine it is a +good idea to leave it in a running state. This makes reboots faster, +bypassing the fsck. 
So do a gdb "continue" as the last gdb command if +this is possible. To terminate gdb itself on the development machine +and leave the target machine running, first clear all breakpoints and +continue, then type ^Z to suspend gdb and then kill it with "kill %1" or +something similar. + +If gdbstub Does Not Work +======================== + +If it doesn't work, you will have to troubleshoot it. Do the easy +things first like double checking your cabling and data rates. You +might try some non-kernel based programs to see if the back-to-back +connection works properly. Just something simple like cat /etc/hosts +>/dev/ttyS0 on one machine and cat /dev/ttyS0 on the other will tell you +if you can send data from one machine to the other. Make sure it works +in both directions. There is no point in tearing out your hair in the +kernel if the line doesn't work. + +All of the real action takes place in the file +/usr/src/linux/arch/i386/kernel/kgdb_stub.c. That is the code on the target +machine that interacts with gdb on the development machine. In gdb you can +turn on a debug switch with the following command: + + set remotedebug + +This will print out the protocol messages that gdb is exchanging with +the target machine. + +Another place to look is /usr/src/arch/i386/lib/kgdb_serial.c. This is +the code that talks to the serial port on the target side. There might +be a problem there. In particular there is a section of this code that +tests the UART which will tell you what UART you have if you define +"PRNT" (just remove "_off" from the #define PRNT_off). To view this +report you will need to boot the system without any beakpoints. This +allows the kernel to run to the point where it calls kgdb to set up +interrupts. At this time kgdb will test the UART and print out the type +it finds. (You need to wait so that the printks are actually being +printed. Early in the boot they are cached, waiting for the console to +be enabled. 
Also, if kgdb is entered thru a breakpoint it is possible +to cause a deadlock by calling printk when the console is locked. The +stub thus avoids doing printks from breakpoints, especially in the +serial code.) At this time, if the UART fails to do the expected thing, +kgdb will print out (using printk) information on what failed. (These +messages will be buried in all the other boot up messages. Look for +lines that start with "gdb_hook_interrupt:". You may want to use dmesg +once the system is up to view the log.) If this fails or if you still +don't connect, review your answers for the port address. Use: + +setserial /dev/ttyS0 + +to get the current port and IRQ information. This command will also +tell you what the system found for the UART type. The stub recognizes +the following UART types: + +16450, 16550, and 16550A + +If you are really desperate you can use printk debugging in the +kgdbstub code in the target kernel until you get it working. In particular, +there is a global variable in /usr/src/linux/arch/i386/kernel/kgdb_stub.c +named "remote_debug". Compile your kernel with this set to 1, rather +than 0 and the debug stub will print out lots of stuff as it does +what it does. Likewise there are debug printks in the kgdb_serial.c +code that can be turned on with simple changes in the macro defines. + + +Debugging Loadable Modules +========================== + +This technique comes courtesy of Edouard Parmelan + + +When you run gdb, enter the command + +source gdbinit-modules + +This will read in a file of gdb macros that was installed in your +kernel source directory when kgdb was installed. This file implements +the following commands: + +mod-list + Lists the loaded modules in the form + +mod-print-symbols + Prints all the symbols in the indicated module. + +mod-add-symbols + Loads the symbols from the object file and associates them + with the indicated module.
+ +After you have loaded the module that you want to debug, use the command +mod-list to find the address of your module. Then use that +address in the mod-add-symbols command to load your module's symbols. +From that point onward you can debug your module as if it were a part +of the kernel. + +The file gdbinit-modules also contains a command named mod-add-lis as +an example of how to construct a command of your own to load your +favorite module. The idea is to "can" the pathname of the module +in the command so you don't have to type so much. + +Threads +======= + +Each process in a target machine is seen as a gdb thread. gdb thread +related commands (info threads, thread n) can be used. + +ia-32 hardware breakpoints +========================== + +kgdb stub contains support for hardware breakpoints using debugging features +of ia-32(x86) processors. These breakpoints do not need code modification. +They use debugging registers. 4 hardware breakpoints are available in ia-32 +processors. + +Each hardware breakpoint can be of one of the following three types. + +1. Execution breakpoint - An Execution breakpoint is triggered when code + at the breakpoint address is executed. + + As a limited number of hardware breakpoints are available, it is + advisable to use software breakpoints ( break command ) instead + of execution hardware breakpoints, unless modification of code + is to be avoided. + +2. Write breakpoint - A write breakpoint is triggered when memory + location at the breakpoint address is written. + + A write breakpoint can be placed for data of variable length. Length of + a write breakpoint indicates length of the datatype to be + watched. Length is 1 for 1 byte data , 2 for 2 byte data, 3 for + 4 byte data. + +3. Access breakpoint - An access breakpoint is triggered when memory + location at the breakpoint address is either read or written. + + Access breakpoints also have lengths similar to write breakpoints. + +IO breakpoints in ia-32 are not supported.
+ +Since gdb stub at present does not use the protocol used by gdb for hardware +breakpoints, hardware breakpoints are accessed through gdb macros. gdb macros +for hardware breakpoints are described below. + +hwebrk - Places an execution breakpoint + hwebrk breakpointno address +hwwbrk - Places a write breakpoint + hwwbrk breakpointno length address +hwabrk - Places an access breakpoint + hwabrk breakpointno length address +hwrmbrk - Removes a breakpoint + hwrmbrk breakpointno +exinfo - Tells whether a software or hardware breakpoint has occurred. + Prints number of the hardware breakpoint if a hardware breakpoint has + occurred. + +Arguments required by these commands are as follows +breakpointno - 0 to 3 +length - 1 to 3 +address - Memory location in hex digits ( without 0x ) e.g c015e9bc + +SMP support +========== + +When a breakpoint occurs or user issues a break ( Ctrl + C ) to gdb +client, all the processors are forced to enter the debugger. Current +thread corresponds to the thread running on the processor where +breakpoint occurred. Threads running on other processor(s) appear +similar to other non-running threads in the 'info threads' output. +Within the kgdb stub there is a structure "waiting_cpus" in which kgdb +records the values of "current" and "regs" for each CPU other than the +one that hit the breakpoint. "current" is a pointer to the task +structure for the task that CPU is running, while "regs" points to the +saved registers for the task. This structure can be examined with the +gdb "p" command. + +ia-32 hardware debugging registers on all processors are set to same +values. Hence any hardware breakpoints may occur on any processor. + +gdb troubleshooting +=================== + +1. gdb hangs +Kill it. restart gdb. Connect to target machine. + +2. 
gdb cannot connect to target machine (after killing a gdb and +restarting another) If the target machine was not inside debugger when +you killed gdb, gdb cannot connect because the target machine won't +respond. In this case echo "Ctrl+C"(ASCII 3) to the serial line. +e.g. echo -e "\003" > /dev/ttyS1 +This forces the target machine into the debugger, after which you +can connect. + +3. gdb cannot connect even after echoing Ctrl+C into serial line +Try changing serial line settings min to 1 and time to 0 +e.g. stty min 1 time 0 < /dev/ttyS1 +Try echoing again + +Check serial line speed and set it to correct value if required +e.g. stty ispeed 115200 ospeed 115200 < /dev/ttyS1 + +EVENTS +====== + +Ever want to know the order of things happening? Which CPU did what and +when? How did the spinlock get the way it is? Then events are for +you. Events are defined by calls to an event collection interface and +saved for later examination. In this case, kgdb events are saved by a +very fast bit of code in kgdb which is fully SMP and interrupt protected +and they are examined by using gdb to display them. Kgdb keeps only +the last N events, where N must be a power of two and is defined at +configure time. + + +Events are signaled to kgdb by calling: + +kgdb_ts(data0,data1) + +For each call, kgdb records an entry in an array along with other info. +Here is the array definition: + +struct kgdb_and_then_struct { +#ifdef CONFIG_SMP + int on_cpu; +#endif + long long at_time; + int from_ln; + char * in_src; + void *from; + int with_if; + int data0; + int data1; +}; + +For SMP machines the CPU is recorded, for all machines the TSC is +recorded (gets a time stamp) as well as the line number and source file +the call was made from. The address of the caller (from), the "if" (interrupt +flag) and the two data items are also recorded. The macro kgdb_ts casts +the types to int, so you can put any 32-bit values here.
There is a +configure option to select the number of events you want to keep. A +nice number might be 128, but you can keep up to 1024 if you want. The +number must be a power of two. An "andthen" macro library is provided +for gdb to help you look at these events. It is also possible to define +a different structure for the event storage and cast the data to this +structure. For example the following structure is defined in kgdb: + +struct kgdb_and_then_struct2 { +#ifdef CONFIG_SMP + int on_cpu; +#endif + long long at_time; + int from_ln; + char * in_src; + void *from; + int with_if; + struct task_struct *t1; + struct task_struct *t2; +}; + +If you use this for display, the data elements will be displayed as +pointers to task_struct entries. You may want to define your own +structure to use in casting. You should only change the last two items +and you must keep the structure size the same. Kgdb will handle these +as 32-bit ints, but within that constraint you can define a structure to +cast to any 32-bit quantity. This need only be available to gdb and is +only used for casting in the display code. + +Final Items +=========== + +I picked up this code from Amit S. Kale and enhanced it. + +If you make some really cool modification to this stuff, or if you +fix a bug, please let me know. + +George Anzinger + + +Amit S. Kale + + +(First kgdb by David Grothe ) + +(modified by Tigran Aivazian ) + Putting gdbstub into the kernel config menu. + +(modified by Scott Foehner ) + Hooks for entering gdbstub at boot time. + +(modified by Amit S. Kale ) + Threads, ia-32 hw debugging, mp support, console support, + nmi watchdog handling. + +(modified by George Anzinger ) + Extended threads to include the idle threads. + Enhancements to allow breakpoint() at first C code. + Use of module_init() and __setup() to automate the configure. + Enhanced the cpu "collection" code to work in early bring-up. 
+ Added ability to call functions from gdb + Print info thread stuff without going back to schedule() + Now collect the "other" cpus with an IPI/ NMI. --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/Documentation/i386/kgdb/loadmodule.sh 2004-02-29 13:07:58.000000000 -0800 @@ -0,0 +1,78 @@ +#/bin/sh +# This script loads a module on a target machine and generates a gdb script. +# source generated gdb script to load the module file at appropriate addresses +# in gdb. +# +# Usage: +# Loading the module on target machine and generating gdb script) +# [foo]$ loadmodule.sh +# +# Loading the module file into gdb +# (gdb) source +# +# Modify following variables according to your setup. +# TESTMACHINE - Name of the target machine +# GDBSCRIPTS - The directory where a gdb script will be generated +# +# Author: Amit S. Kale (akale@veritas.com). +# +# If you run into problems, please check files pointed to by following +# variables. +# ERRFILE - /tmp/.errs contains stderr output of insmod +# MAPFILE - /tmp/.map contains stdout output of insmod +# GDBSCRIPT - $GDBSCRIPTS/load gdb script. 
+ +TESTMACHINE=foo +GDBSCRIPTS=/home/bar + +if [ $# -lt 1 ] ; then { + echo Usage: $0 modulefile + exit +} ; fi + +MODULEFILE=$1 +MODULEFILEBASENAME=`basename $1` + +if [ $MODULEFILE = $MODULEFILEBASENAME ] ; then { + MODULEFILE=`pwd`/$MODULEFILE +} fi + +ERRFILE=/tmp/$MODULEFILEBASENAME.errs +MAPFILE=/tmp/$MODULEFILEBASENAME.map +GDBSCRIPT=$GDBSCRIPTS/load$MODULEFILEBASENAME + +function findaddr() { + local ADDR=0x$(echo "$SEGMENTS" | \ + grep "$1" | sed 's/^[^ ]*[ ]*[^ ]*[ ]*//' | \ + sed 's/[ ]*[^ ]*$//') + echo $ADDR +} + +function checkerrs() { + if [ "`cat $ERRFILE`" != "" ] ; then { + cat $ERRFILE + exit + } fi +} + +#load the module +echo Copying $MODULEFILE to $TESTMACHINE +rcp $MODULEFILE root@${TESTMACHINE}: + +echo Loading module $MODULEFILE +rsh -l root $TESTMACHINE /sbin/insmod -m ./`basename $MODULEFILE` \ + > $MAPFILE 2> $ERRFILE +checkerrs + +SEGMENTS=`head -n 11 $MAPFILE | tail -n 10` +TEXTADDR=$(findaddr "\\.text[^.]") +LOADSTRING="add-symbol-file $MODULEFILE $TEXTADDR" +SEGADDRS=`echo "$SEGMENTS" | awk '//{ + if ($1 != ".text" && $1 != ".this" && + $1 != ".kstrtab" && $1 != ".kmodtab") { + print " -s " $1 " 0x" $3 " " + } +}'` +LOADSTRING="$LOADSTRING $SEGADDRS" +echo Generating script $GDBSCRIPT +echo $LOADSTRING > $GDBSCRIPT --- linux-2.6.4-rc1/Documentation/kernel-parameters.txt 2004-02-27 16:17:18.000000000 -0800 +++ 25/Documentation/kernel-parameters.txt 2004-02-29 13:07:39.000000000 -0800 @@ -90,10 +90,13 @@ running once the system is up. Format: , default is 13 acpi= [HW,ACPI] Advanced Configuration and Power Interface - Format: { force | off | ht } + Format: { force | off | ht | strict } force -- enables ACPI for systems with default off off -- disabled ACPI for systems with default on ht -- run only enough ACPI to enable Hyper Threading + strict -- Be less tolerant of platforms that are not + strictly ACPI specification compliant. + See also Documentation/pm.txt. 
acpi_pic_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25/Documentation/laptop-mode.txt 2004-02-29 13:08:18.000000000 -0800 @@ -0,0 +1,480 @@ +How to conserve battery power using laptop-mode +----------------------------------------------- + +Document Author: Bart Samwel (bart@samwel.tk) +Date created: January 2, 2004 + +Introduction +------------ + +Laptop mode is used to minimize the time that the hard disk needs to be spun up, +to conserve battery power on laptops. It has been reported to cause significant +power savings. + +Contents +-------- + +* Introduction +* The short story +* Caveats +* The details +* Tips & Tricks +* Control script +* ACPI integration +* Monitoring tool + + +The short story +--------------- + +If you just want to use it, run the laptop_mode control script (which is included +at the end of this document) as follows: + +# laptop_mode start + +Then set your harddisk spindown time to a relatively low value with hdparm: + +hdparm -S 4 /dev/hda + +The value -S 4 means 20 seconds idle time before spindown. Your harddisk will +now only spin up when a disk cache miss occurs, or at least once every 10 +minutes to write back any pending changes. + +To stop laptop_mode, remount your filesystems with regular commit intervals +(e.g., 5 seconds), and run "laptop_mode stop". + + +Caveats +------- + +* The downside of laptop mode is that you have a chance of losing up + to 10 minutes of work. If you cannot afford this, don't use it! + +* Most desktop hard drives have a very limited lifetime measured in spindown + cycles, typically about 50,000 times (it's usually listed on the spec sheet). + Check your drive's rating, and don't wear down your drive's lifetime if you + don't need to. + +* If you mount some of your ext3/reiserfs filesystems with the -n option, then + the control script will not be able to remount them correctly.
You must set + DO_REMOUNTS=0 in the control script, otherwise it will remount them with the + wrong options -- or it will fail because it cannot write to /etc/mtab. + +* If you have your filesystems listed as type "auto" in fstab, like I did, then + the control script will not recognize them as filesystems that need remounting. + +The details +----------- + +Laptop-mode is controlled by the flag /proc/sys/vm/laptop_mode. When this +flag is set, any physical disk read operation (that might have caused the +hard disk to spin up) causes Linux to flush all dirty blocks. The result +of this is that after a disk has spun down, it will not be spun up anymore +to write dirty blocks, because those blocks had already been written +immediately after the most recent read operation. + +To increase the effectiveness of the laptop_mode strategy, the laptop_mode +control script increases dirty_expire_centisecs and dirty_writeback_centisecs in +/proc/sys/vm to about 10 minutes (by default), which means that pages that are +dirtied are not forced to be written to disk as often. The control script also +changes the dirty background ratio, so that background writeback of dirty pages +is not done anymore. Combined with a higher commit value (also 10 minutes) for +ext3 or ReiserFS filesystems (also done automatically by the control script), +this results in concentration of disk activity in a small time interval which +occurs only once every 10 minutes, or whenever the disk is forced to spin up by +a cache miss. The disk can then be spun down in the periods of inactivity. + +If you want to find out which process caused the disk to spin up, you can +gather information by setting the flag /proc/sys/vm/block_dump. When this flag +is set, Linux reports all disk read and write operations that take place, and +all block dirtyings done to files. This makes it possible to debug why a disk +needs to spin up, and to increase battery life even more.
+ +If 10 minutes is too much or too little downtime for you, you can configure +this downtime as follows. In the control script, set the MAX_AGE value to the +maximum number of seconds of disk downtime that you would like. You should +then set your filesystem's commit interval to the same value. The dirty ratio +is also configurable from the control script. + +If you don't like the idea of the control script remounting your filesystems +for you, you can change DO_REMOUNTS to 0 in the script. + +Thanks to Kiko Piris, the control script can be used to enable laptop mode on +both the Linux 2.4 and 2.6 series. + + +Tips & Tricks +------------- + +* Bartek Kania reports getting up to 50 minutes of extra battery life (on top + of his regular 3 to 3.5 hours) using very aggressive power management (hdparm + -B1) and a spindown time of 5 seconds (hdparm -S1). + +* You can spin down the disk while playing MP3, by setting the disk readahead + to 8MB (hdparm -a 16384). Effectively, the disk will read a complete MP3 at + once, and will then spin down while the MP3 is playing. (Thanks to Bartek + Kania.) + +* Drew Scott Daniels observed: "I don't know why, but when I decrease the number + of colours that my display uses it consumes less battery power. I've seen + this on powerbooks too. I hope that this is a piece of information that + might be useful to the Laptop Mode patch or it's users." + + +Control script +-------------- + +Please note that this control script works for the Linux 2.4 and 2.6 series. 
+ +--------------------CONTROL SCRIPT BEGIN------------------------------------------ +#!/bin/sh + +# start or stop laptop_mode, best run by a power management daemon when +# ac gets connected/disconnected from a laptop +# +# install as /sbin/laptop_mode +# +# Contributors to this script: Kiko Piris +# Bart Samwel +# Dax Kelson +# Original Linux 2.4 version by: Jens Axboe + +parse_mount_opts () { + echo "$*" | \ + sed 's/commit=[0-9]*//g' | \ + sed 's/,,*/,/g' | \ + sed 's/^,//' | \ + sed 's/,$//' | \ + cat - +} + +KLEVEL="$(uname -r | cut -c1-3)" +case "$KLEVEL" in + "2.4") + true + ;; + "2.6") + true + ;; + *) + echo "Unhandled kernel level: $KLEVEL ('uname -r' = '$(uname -r)')" + exit 1 + ;; +esac + +# Shall we remount journaled fs. with appropiate commit interval? (1=yes) +DO_REMOUNTS=1 + +# age time, in seconds. should be put into a sysconfig file +MAX_AGE=600 + +# Allowed dirty ratio, in pct. should be put into a sysconfig file as well. +DIRTY_RATIO=40 + +# kernel default dirty buffer age +DEF_AGE=30 +DEF_UPDATE=5 +DEF_DIRTY_BACKGROUND_RATIO=10 +DEF_DIRTY_RATIO=40 + + +if [ ! -e /proc/sys/vm/laptop_mode ]; then + echo "Kernel is not patched with laptop_mode patch." + exit 1 +fi + +if [ ! -w /proc/sys/vm/laptop_mode ]; then + echo "You do not have enough privileges to enable laptop_mode." 
+ exit 1 +fi + +case "$1" in + start) + AGE=$((100*$MAX_AGE)) + echo -n "Starting laptop_mode" + case "$KLEVEL" in + "2.4") + echo "1" > /proc/sys/vm/laptop_mode + echo "30 500 0 0 $AGE $AGE 60 20 0" > /proc/sys/vm/bdflush + ;; + "2.6") + echo "1" > /proc/sys/vm/laptop_mode + echo "$AGE" > /proc/sys/vm/dirty_writeback_centisecs + echo "$AGE" > /proc/sys/vm/dirty_expire_centisecs + echo "$DIRTY_RATIO" > /proc/sys/vm/dirty_ratio + echo "$DIRTY_RATIO" > /proc/sys/vm/dirty_background_ratio + ;; + esac + if [ $DO_REMOUNTS -eq 1 ]; then + cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do + PARSEDOPTS="$(parse_mount_opts "$OPTS")" + case "$FST" in + "ext3") mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE ;; + "reiserfs") mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE ;; + "xfs") mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE ;; + esac + done + fi + echo "." + ;; + stop) + U_AGE=$((100*$DEF_UPDATE)) + B_AGE=$((100*$DEF_AGE)) + echo -n "Stopping laptop_mode" + case "$KLEVEL" in + "2.4") + echo "0" > /proc/sys/vm/laptop_mode + echo "30 500 0 0 $U_AGE $B_AGE 60 20 0" > /proc/sys/vm/bdflush + ;; + "2.6") + echo "0" > /proc/sys/vm/laptop_mode + echo "$U_AGE" > /proc/sys/vm/dirty_writeback_centisecs + echo "$B_AGE" > /proc/sys/vm/dirty_expire_centisecs + echo "$DEF_DIRTY_RATIO" > /proc/sys/vm/dirty_ratio + echo "$DEF_DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio + ;; + esac + if [ $DO_REMOUNTS -eq 1 ]; then + cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do + PARSEDOPTS="$(parse_mount_opts "$OPTS")" + case "$FST" in + "ext3") mount $DEV -t $FST $MP -o remount,$PARSEDOPTS ;; + "reiserfs") mount $DEV -t $FST $MP -o remount,$PARSEDOPTS ;; + "xfs") mount $DEV -t $FST $MP -o remount,$PARSEDOPTS ;; + esac + done + fi + echo "." 
+ ;; + *) + echo "$0 {start|stop}" + ;; + +esac + +exit 0 + +--------------------CONTROL SCRIPT END-------------------------------------------- + + +ACPI integration +---------------- + +Dax Kelson submitted this so that the ACPI acpid daemon will +kick off the laptop_mode script and run hdparm. + +---------------------------/etc/acpi/events/ac_adapter BEGIN------------------------------------------- +event=ac_adapter +action=/etc/acpi/actions/battery.sh +---------------------------/etc/acpi/events/ac_adapter END------------------------------------------- + +---------------------------/etc/acpi/actions/battery.sh BEGIN------------------------------------------- +#!/bin/sh + +# cpu throttling +# cat /proc/acpi/processor/CPU0/throttling for more info +ACAD_THR=0 +BATT_THR=2 + +# spindown time for HD (man hdparm for valid values) +# I prefer 2 hours for acad and 20 seconds for batt +ACAD_HD=244 +BATT_HD=4 + +# ac/battery event handler + +status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/AC/state` + +case $status in + "on-line") + echo "Setting HD spindown to 2 hours" + /sbin/laptop-mode stop + /sbin/hdparm -S $ACAD_HD /dev/hda > /dev/null 2>&1 + /sbin/hdparm -B 255 /dev/hda > /dev/null 2>&1 + #echo -n $ACAD_CPU:$ACAD_THR > /proc/acpi/processor/CPU0/limit + exit 0 + ;; + "off-line") + echo "Setting HD spindown to 20 seconds" + /sbin/laptop-mode start + /sbin/hdparm -S $BATT_HD /dev/hda > /dev/null 2>&1 + /sbin/hdparm -B 1 /dev/hda > /dev/null 2>&1 + #echo -n $BATT_CPU:$BATT_THR > /proc/acpi/processor/CPU0/limit + exit 0 + ;; +esac +---------------------------/etc/acpi/actions/battery.sh END------------------------------------------- + +Monitoring tool +--------------- + +Bartek Kania submitted this, it can be used to measure how much time your disk +spends spun up/down. 
+ +---------------------------dslm.c BEGIN------------------------------------------- +/* + * Simple Disk SLeep Monitor + * by Bartek Kania + * Licenced under the GPL + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DEBUG +#define D(x) x +#else +#define D(x) +#endif + +int endit = 0; + +/* Check if the disk is in powersave-mode + * Most of the code is stolen from hdparm. + * 1 = active, 0 = standby/sleep, -1 = unknown */ +int check_powermode(int fd) +{ + unsigned char args[4] = {WIN_CHECKPOWERMODE1,0,0,0}; + int state; + + if (ioctl(fd, HDIO_DRIVE_CMD, &args) + && (args[0] = WIN_CHECKPOWERMODE2) /* try again with 0x98 */ + && ioctl(fd, HDIO_DRIVE_CMD, &args)) { + if (errno != EIO || args[0] != 0 || args[1] != 0) { + state = -1; /* "unknown"; */ + } else + state = 0; /* "sleeping"; */ + } else { + state = (args[2] == 255) ? 1 : 0; + } + D(printf(" drive state is: %s\n", state)); + + return state; +} + +char *state_name(int i) +{ + if (i == -1) return "unknown"; + if (i == 0) return "sleeping"; + if (i == 1) return "active"; + + return "internal error"; +} + +char *myctime(time_t time) +{ + char *ts = ctime(&time); + ts[strlen(ts) - 1] = 0; + + return ts; +} + +void measure(int fd) +{ + time_t start_time; + int last_state; + time_t last_time; + int curr_state; + time_t curr_time = 0; + time_t time_diff; + time_t active_time = 0; + time_t sleep_time = 0; + time_t unknown_time = 0; + time_t total_time = 0; + int changes = 0; + float tmp; + + printf("Starting measurements\n"); + + last_state = check_powermode(fd); + start_time = last_time = time(0); + printf(" System is in state %s\n\n", state_name(last_state)); + + while(!endit) { + sleep(1); + curr_state = check_powermode(fd); + + if (curr_state != last_state || endit) { + changes++; + curr_time = time(0); + time_diff = curr_time - last_time; + + if (last_state == 1) active_time += time_diff; + else if (last_state == 0) sleep_time += time_diff; + else 
unknown_time += time_diff; + + last_state = curr_state; + last_time = curr_time; + + printf("%s: State-change to %s\n", myctime(curr_time), + state_name(curr_state)); + } + } + changes--; /* Compensate for SIGINT */ + + total_time = time(0) - start_time; + printf("\nTotal running time: %lus\n", curr_time - start_time); + printf(" State changed %d times\n", changes); + + tmp = (float)sleep_time / (float)total_time * 100; + printf(" Time in sleep state: %lus (%.2f%%)\n", sleep_time, tmp); + tmp = (float)active_time / (float)total_time * 100; + printf(" Time in active state: %lus (%.2f%%)\n", active_time, tmp); + tmp = (float)unknown_time / (float)total_time * 100; + printf(" Time in unknown state: %lus (%.2f%%)\n", unknown_time, tmp); +} + +void ender(int s) +{ + endit = 1; +} + +void usage() +{ + puts("usage: dslm [-w