--- linux-2.6.5-rc3/arch/alpha/kernel/setup.c 2003-11-23 19:03:00.000000000 -0800 +++ 25/arch/alpha/kernel/setup.c 2004-03-31 00:55:03.061328936 -0800 @@ -56,6 +56,7 @@ static struct notifier_block alpha_panic #include #include #include +#include #include "proto.h" #include "pci_impl.h" @@ -115,7 +116,6 @@ static void get_sysnames(unsigned long, char **, char **); static char command_line[COMMAND_LINE_SIZE]; -char saved_command_line[COMMAND_LINE_SIZE]; /* * The format of "screen_info" is strange, and due to early @@ -527,8 +527,8 @@ setup_arch(char **cmdline_p) } else { strlcpy(command_line, COMMAND_LINE, sizeof command_line); } - strcpy(saved_command_line, command_line); *cmdline_p = command_line; + parse_early_options(cmdline_p); /* * Process command-line arguments. --- linux-2.6.5-rc3/arch/alpha/kernel/sys_nautilus.c 2003-08-08 22:55:10.000000000 -0700 +++ 25/arch/alpha/kernel/sys_nautilus.c 2004-03-30 20:21:43.000000000 -0800 @@ -225,11 +225,13 @@ nautilus_init_pci(void) if (request_resource(&iomem_resource, bus->resource[1]) < 0) printk(KERN_ERR "Failed to request MEM on hose 0\n"); - if (pci_mem < memtop && pci_mem > alpha_mv.min_mem_address) { + if (pci_mem < memtop) + memtop = pci_mem; + if (memtop > alpha_mv.min_mem_address) { free_reserved_mem(__va(alpha_mv.min_mem_address), - __va(pci_mem)); + __va(memtop)); printk("nautilus_init_pci: %ldk freed\n", - (pci_mem - alpha_mv.min_mem_address) >> 10); + (memtop - alpha_mv.min_mem_address) >> 10); } if ((IRONGATE0->dev_vendor >> 16) > 0x7006) /* Albacore? */ --- linux-2.6.5-rc3/arch/alpha/kernel/vmlinux.lds.S 2003-08-22 19:23:40.000000000 -0700 +++ 25/arch/alpha/kernel/vmlinux.lds.S 2004-03-31 00:55:03.061328936 -0800 @@ -45,6 +45,11 @@ SECTIONS __setup_end = .; . = ALIGN(8); + __early_begin = .; + __early_param : { *(__early_param) } + __early_end = .; + + . = ALIGN(8); __start___param = .; __param : { *(__param) } __stop___param = .; --- linux-2.6.5-rc3/arch/arm26/kernel/setup.c 2003-09-27 18:57:43.000000000 -0700 +++ 25/arch/arm26/kernel/setup.c 2004-03-31 00:55:03.225304008 -0800 @@ -76,7 +76,6 @@ struct processor processor; unsigned char aux_device_present; char elf_platform[ELF_PLATFORM_SIZE]; -char saved_command_line[COMMAND_LINE_SIZE]; unsigned long phys_initrd_start __initdata = 0; unsigned long phys_initrd_size __initdata = 0; @@ -154,53 +153,38 @@ static void __init setup_processor(void) } /* - * Initial parsing of the command line. We need to pick out the - * memory size. We look for mem=size@start, where start and size - * are "size[KkMm]" + * Pick out the memory size. We look for mem=size@start, + * where start and size are "size[KkMm]" */ -static void __init -parse_cmdline(struct meminfo *mi, char **cmdline_p, char *from) +static int __init early_mem(char *p) { - char c = ' ', *to = command_line; - int usermem = 0, len = 0; + static int usermem __initdata = 0; + unsigned long size, start; - for (;;) { - if (c == ' ' && !memcmp(from, "mem=", 4)) { - unsigned long size, start; - - if (to != command_line) - to -= 1; - - /* - * If the user specifies memory size, we - * blow away any automatically generated - * size. - */ - if (usermem == 0) { - usermem = 1; - mi->nr_banks = 0; - } - - start = PHYS_OFFSET; - size = memparse(from + 4, &from); - if (*from == '@') - start = memparse(from + 1, &from); - - mi->bank[mi->nr_banks].start = start; - mi->bank[mi->nr_banks].size = size; - mi->bank[mi->nr_banks].node = PHYS_TO_NID(start); - mi->nr_banks += 1; - } - c = *from++; - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *to++ = c; + /* + * The user has specified the memory size for the first time, + * so we blow away any automatically generated size. + */ + if (usermem == 0) { + usermem = 1; + mi->nr_banks = 0; } - *to = '\0'; - *cmdline_p = command_line; + + unsigned long size, start; + + start = PHYS_OFFSET; + size = memparse(p, &p); + if (*p == '@') + start = memparse(p + 1, &p); + + mi->bank[mi->nr_banks].start = start; + mi->bank[mi->nr_banks].size = size; + mi->bank[mi->nr_banks].node = PHYS_TO_NID(start); + mi->nr_banks += 1; + + return 0; } +__early_param("mem=", early_mem); static void __init setup_ramdisk(int doload, int prompt, int image_start, unsigned int rd_sz) @@ -495,9 +479,8 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) &_edata; init_mm.brk = (unsigned long) &_end; - memcpy(saved_command_line, from, COMMAND_LINE_SIZE); - saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; - parse_cmdline(&meminfo, cmdline_p, from); + *cmdline_p = from; + parse_early_options(cmdline_p); bootmem_init(&meminfo); paging_init(&meminfo); request_standard_resources(&meminfo); --- linux-2.6.5-rc3/arch/arm26/machine/small_page.c 2003-06-14 12:17:56.000000000 -0700 +++ 25/arch/arm26/machine/small_page.c 2004-03-31 00:56:27.733456816 -0800 @@ -95,7 +95,7 @@ again: offset = ffz(USED_MAP(page)); SET_USED(page, offset); if (USED_MAP(page) == order->all_used) - list_del_init(&page->list); + list_del_init(&page->lru); spin_unlock_irqrestore(&small_page_lock, flags); return (unsigned long) page_address(page) + (offset << order->shift); @@ -110,7 +110,7 @@ need_new_page: goto no_page; SetPageReserved(page); USED_MAP(page) = 0; - list_add(&page->list, &order->queue); + list_add(&page->lru, &order->queue); goto again; } @@ -151,7 +151,7 @@ static void __free_small_page(unsigned l spin_lock_irqsave(&small_page_lock, flags); if (USED_MAP(page) == order->all_used) - list_add(&page->list, &order->queue); + list_add(&page->lru, &order->queue); if (!TEST_AND_CLEAR_USED(page, spage)) goto already_free; @@ -167,7 +167,7 @@ free_page: /* * unlink the page from the small page queue and free it */ - list_del_init(&page->list); + list_del_init(&page->lru); spin_unlock_irqrestore(&small_page_lock, flags); ClearPageReserved(page); __free_page(page); --- linux-2.6.5-rc3/arch/arm/configs/neponset_defconfig 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/arm/configs/neponset_defconfig 2004-03-30 20:21:49.000000000 -0800 @@ -848,7 +848,6 @@ CONFIG_USB_MOUSE=m # CONFIG_USB_TIGL is not set # CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set -# CONFIG_USB_BRLVGER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_TEST is not set --- linux-2.6.5-rc3/arch/arm/kernel/setup.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/arm/kernel/setup.c 2004-03-31 00:55:02.898353712 -0800 @@ -81,7 +81,6 @@ struct cpu_cache_fns cpu_cache; unsigned char aux_device_present; char elf_platform[ELF_PLATFORM_SIZE]; -char saved_command_line[COMMAND_LINE_SIZE]; unsigned long phys_initrd_start __initdata = 0; unsigned long phys_initrd_size __initdata = 0; @@ -330,17 +329,19 @@ static struct machine_desc * __init setu return list; } -static void __init early_initrd(char **p) +static int __init early_initrd(char *p) { unsigned long start, size; - start = memparse(*p, p); - if (**p == ',') { - size = memparse((*p) + 1, p); + start = memparse(p, &p); + if (*p == ',') { + size = memparse(p++, &p); phys_initrd_start = start; phys_initrd_size = size; } + + return 0; } __early_param("initrd=", early_initrd); @@ -348,15 +349,14 @@ __early_param("initrd=", early_initrd); * Pick out the memory size. We look for mem=size@start, * where start and size are "size[KkMm]" */ -static void __init early_mem(char **p) +static int __init early_mem(char *p) { static int usermem __initdata = 0; unsigned long size, start; /* - * If the user specifies memory size, we - * blow away any automatically generated - * size. + * The user has specified the memory size for the first time, + * so we blow away any automatically generated size. */ if (usermem == 0) { usermem = 1; @@ -364,55 +364,18 @@ static void __init early_mem(char **p) } start = PHYS_OFFSET; - size = memparse(*p, p); - if (**p == '@') - start = memparse(*p + 1, p); + size = memparse(p, &p); + if (*p == '@') + start = memparse(p + 1, &p); meminfo.bank[meminfo.nr_banks].start = start; meminfo.bank[meminfo.nr_banks].size = size; meminfo.bank[meminfo.nr_banks].node = PHYS_TO_NID(start); meminfo.nr_banks += 1; -} -__early_param("mem=", early_mem); - -/* - * Initial parsing of the command line. - */ -static void __init parse_cmdline(char **cmdline_p, char *from) -{ - char c = ' ', *to = command_line; - int len = 0; - for (;;) { - if (c == ' ') { - extern struct early_params __early_begin, __early_end; - struct early_params *p; - - for (p = &__early_begin; p < &__early_end; p++) { - int len = strlen(p->arg); - - if (memcmp(from, p->arg, len) == 0) { - if (to != command_line) - to -= 1; - from += len; - p->fn(&from); - - while (*from != ' ' && *from != '\0') - from++; - break; - } - } - } - c = *from++; - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *to++ = c; - } - *to = '\0'; - *cmdline_p = command_line; + return 0; } +__early_param("mem=", early_mem); static void __init setup_ramdisk(int doload, int prompt, int image_start, unsigned int rd_sz) @@ -704,9 +667,8 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) &_edata; init_mm.brk = (unsigned long) &_end; - memcpy(saved_command_line, from, COMMAND_LINE_SIZE); - saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; - parse_cmdline(cmdline_p, from); + *cmdline_p = from; + parse_early_options(cmdline_p); bootmem_init(&meminfo); paging_init(&meminfo, mdesc); request_standard_resources(&meminfo, mdesc); --- linux-2.6.5-rc3/arch/arm/mm/mm-armv.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/arm/mm/mm-armv.c 2004-03-31 00:55:02.899353560 -0800 @@ -80,18 +80,18 @@ static struct cachepolicy cache_policies * writebuffer to be turned off. (Note: the write * buffer should not be on and the cache off). */ -static void __init early_cachepolicy(char **p) +static int __init early_cachepolicy(char *p) { int i; for (i = 0; i < ARRAY_SIZE(cache_policies); i++) { int len = strlen(cache_policies[i].policy); - if (memcmp(*p, cache_policies[i].policy, len) == 0) { + if (memcmp(p, cache_policies[i].policy, len) == 0) { cachepolicy = i; cr_alignment &= ~cache_policies[i].cr_mask; cr_no_alignment &= ~cache_policies[i].cr_mask; - *p += len; + p += len; break; } } @@ -99,31 +99,36 @@ static void __init early_cachepolicy(cha printk(KERN_ERR "ERROR: unknown or unsupported cache policy\n"); flush_cache_all(); set_cr(cr_alignment); + + return 0; } -static void __init early_nocache(char **__unused) +static int __init early_nocache(char *__unused) { char *p = "buffered"; printk(KERN_WARNING "nocache is deprecated; use cachepolicy=%s\n", p); - early_cachepolicy(&p); + early_cachepolicy(p); + + return 0; } -static void __init early_nowrite(char **__unused) +static int __init early_nowrite(char *__unused) { char *p = "uncached"; printk(KERN_WARNING "nowb is deprecated; use cachepolicy=%s\n", p); - early_cachepolicy(&p); + early_cachepolicy(p); + + return 0; } -static void __init early_ecc(char **p) +static int __init early_ecc(char *p) { - if (memcmp(*p, "on", 2) == 0) { + if (memcmp(p, "on", 2) == 0) ecc_mask = PMD_PROTECTION; - *p += 2; - } else if (memcmp(*p, "off", 3) == 0) { + else if (memcmp(p, "off", 3) == 0) ecc_mask = 0; - *p += 3; - } + + return 0; } __early_param("nocache", early_nocache); --- linux-2.6.5-rc3/arch/cris/kernel/setup.c 2003-07-10 18:50:30.000000000 -0700 +++ 25/arch/cris/kernel/setup.c 2004-03-31 00:55:03.370281968 -0800 @@ -16,6 +16,7 @@ #include #include #include +#include /* * Setup options @@ -28,10 +29,7 @@ unsigned char aux_device_present; extern int root_mountflags; extern char _etext, _edata, _end; -#define COMMAND_LINE_SIZE 256 - static char command_line[COMMAND_LINE_SIZE] = { 0, }; - char saved_command_line[COMMAND_LINE_SIZE]; extern const unsigned long text_start, edata; /* set by the linker script */ extern unsigned long dram_start, dram_end; @@ -160,6 +158,7 @@ setup_arch(char **cmdline_p) sizeof(command_line)); #endif command_line[COMMAND_LINE_SIZE - 1] = '\0'; + parse_early_options(cmdline_p); /* give credit for the CRIS port */ --- linux-2.6.5-rc3/arch/h8300/kernel/setup.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/h8300/kernel/setup.c 2004-03-31 00:55:03.525258408 -0800 @@ -33,6 +33,7 @@ #include #include +#include #ifdef CONFIG_BLK_DEV_INITRD #include @@ -60,8 +61,7 @@ unsigned long memory_end; struct task_struct *_current_task; -char command_line[512]; -char saved_command_line[512]; +char command_line[COMMAND_LINE_SIZE]; extern int _stext, _etext, _sdata, _edata, _sbss, _ebss, _end; extern int _ramstart, _ramend; @@ -166,8 +166,7 @@ void __init setup_arch(char **cmdline_p) #endif /* Keep a copy of command line */ *cmdline_p = &command_line[0]; - memcpy(saved_command_line, command_line, sizeof(saved_command_line)); - saved_command_line[sizeof(saved_command_line)-1] = 0; + parse_early_options(cmdline_p); #ifdef DEBUG if (strlen(*cmdline_p)) --- linux-2.6.5-rc3/arch/h8300/kernel/vmlinux.lds.S 2003-08-22 19:23:40.000000000 -0700 +++ 25/arch/h8300/kernel/vmlinux.lds.S 2004-03-31 00:55:03.525258408 -0800 @@ -131,6 +131,11 @@ SECTIONS *(.init.setup) . = ALIGN(0x4) ; ___setup_end = .; + + __early_begin = .; + *(__early_param) + __early_end = .; + ___start___param = .; *(__param) ___stop___param = .; --- linux-2.6.5-rc3/arch/i386/boot/setup.S 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/boot/setup.S 2004-03-31 00:56:34.200473680 -0800 @@ -156,7 +156,7 @@ cmd_line_ptr: .long 0 # (Header versio # can be located anywhere in # low memory 0x10000 or higher. -ramdisk_max: .long MAXMEM-1 # (Header version 0x0203 or later) +ramdisk_max: .long __MAXMEM-1 # (Header version 0x0203 or later) # The highest safe address for # the contents of an initrd --- linux-2.6.5-rc3/arch/i386/Kconfig 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/Kconfig 2004-03-31 00:56:34.381446168 -0800 @@ -70,7 +70,7 @@ config X86_NUMAQ multiquad box. This changes the way that processors are bootstrapped, and uses Clustered Logical APIC addressing mode instead of Flat Logical. You will need a new lynxer.elf file to flash your firmware with - send - email to Martin.Bligh@us.ibm.com + email to . config X86_SUMMIT bool "Summit/EXA (IBM x440)" @@ -322,9 +322,13 @@ endchoice config X86_GENERIC bool "Generic x86 support" help - Including some tuning for non selected x86 CPUs too. - when it has moderate overhead. This is intended for generic - distributions kernels. + Instead of just including optimizations for the selected + x86 variant (e.g. PII, Crusoe or Athlon), include some more + generic optimizations as well. This will make the kernel + perform better on x86 CPUs other than that selected. + + This is really intended for distributors who need more + generic optimizations. endif @@ -418,6 +422,54 @@ config X86_OOSTORE depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR default y +config X86_4G + bool "4 GB kernel-space and 4 GB user-space virtual memory support" + help + This option is only useful for systems that have more than 1 GB + of RAM. + + The default kernel VM layout leaves 1 GB of virtual memory for + kernel-space mappings, and 3 GB of VM for user-space applications. + This option ups both the kernel-space VM and the user-space VM to + 4 GB. + + The cost of this option is additional TLB flushes done at + system-entry points that transition from user-mode into kernel-mode. + I.e. system calls and page faults, and IRQs that interrupt user-mode + code. There's also additional overhead to kernel operations that copy + memory to/from user-space. The overhead from this is hard to tell and + depends on the workload - it can be anything from no visible overhead + to 20-30% overhead. A good rule of thumb is to count with a runtime + overhead of 20%. + + The upside is the much increased kernel-space VM, which more than + quadruples the maximum amount of RAM supported. Kernels compiled with + this option boot on 64GB of RAM and still have more than 3.1 GB of + 'lowmem' left. Another bonus is that highmem IO bouncing decreases, + if used with drivers that still use bounce-buffers. + + There's also a 33% increase in user-space VM size - database + applications might see a boost from this. + + But the cost of the TLB flushes and the runtime overhead has to be + weighed against the bonuses offered by the larger VM spaces. The + dividing line depends on the actual workload - there might be 4 GB + systems that benefit from this option. Systems with less than 4 GB + of RAM will rarely see a benefit from this option - but it's not + out of question, the exact circumstances have to be considered. + +config X86_SWITCH_PAGETABLES + def_bool X86_4G + +config X86_4G_VM_LAYOUT + def_bool X86_4G + +config X86_UACCESS_INDIRECT + def_bool X86_4G + +config X86_HIGH_ENTRY + def_bool X86_4G + config HPET_TIMER bool "HPET Timer Support" help @@ -475,6 +527,16 @@ config NR_CPUS This is purely to save memory - each supported CPU adds approximately eight kilobytes to the kernel image. +config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on SMP + default off + help + SMT scheduler support improves the CPU scheduler's decision making + when dealing with Intel Pentium 4 chips with HyperThreading at a + cost of slightly increased overhead in some places. If unsure say + N here. + config PREEMPT bool "Preemptible Kernel" help @@ -598,7 +660,7 @@ config I8K For information on utilities to make use of this driver see the I8K Linux utilities web site at: - + Say Y if you intend to run this kernel on a Dell Inspiron 8000. Say N otherwise. @@ -705,7 +767,7 @@ config X86_PAE # Common NUMA Features config NUMA - bool "Numa Memory Allocation Support" + bool "Numa Memory Allocation and Scheduler Support" depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI)) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT) @@ -809,8 +871,8 @@ config EFI This option is only useful on systems that have EFI firmware and will result in a kernel image that is ~8k larger. In addition, you must use the latest ELILO loader available at - ftp.hpl.hp.com/pub/linux-ia64/ in order to take advantage of kernel - initialization using EFI information (neither GRUB nor LILO know + in order to take advantage of + kernel initialization using EFI information (neither GRUB nor LILO know anything about EFI). However, even with this option, the resultant kernel should continue to boot on existing non-EFI platforms. @@ -1260,6 +1322,15 @@ config DEBUG_PAGEALLOC This results in a large slowdown, but helps to find certain types of memory corruptions. +config SPINLINE + bool "Spinlock inlining" + depends on DEBUG_KERNEL + help + This will change spinlocks from out of line to inline, making them + account cost to the callers in readprofile, rather than the lock + itself (as ".text.lock.filename"). This can be helpful for finding + the callers of locks. + config DEBUG_HIGHMEM bool "Highmem debugging" depends on DEBUG_KERNEL && HIGHMEM @@ -1276,20 +1347,211 @@ config DEBUG_INFO Say Y here only if you plan to use gdb to debug the kernel. If you don't debug the kernel, you can say N. +config LOCKMETER + bool "Kernel lock metering" + depends on SMP + help + Say Y to enable kernel lock metering, which adds overhead to SMP locks, + but allows you to see various statistics using the lockstat command. + config DEBUG_SPINLOCK_SLEEP bool "Sleep-inside-spinlock checking" help If you say Y here, various routines which may sleep will become very noisy if they are called with a spinlock held. +config KGDB + bool "Include kgdb kernel debugger" + depends on DEBUG_KERNEL + help + If you say Y here, the system will be compiled with the debug + option (-g) and a debugging stub will be included in the + kernel. This stub communicates with gdb on another (host) + computer via a serial port. The host computer should have + access to the kernel binary file (vmlinux) and a serial port + that is connected to the target machine. Gdb can be made to + configure the serial port or you can use stty and setserial to + do this. See the 'target' command in gdb. This option also + configures in the ability to request a breakpoint early in the + boot process. To request the breakpoint just include 'kgdb' + as a boot option when booting the target machine. The system + will then break as soon as it looks at the boot options. This + option also installs a breakpoint in panic and sends any + kernel faults to the debugger. For more information see the + Documentation/i386/kgdb/kgdb.txt file. + +choice + depends on KGDB + prompt "Debug serial port BAUD" + default KGDB_115200BAUD + help + Gdb and the kernel stub need to agree on the baud rate to be + used. Some systems (x86 family at this writing) allow this to + be configured. + +config KGDB_9600BAUD + bool "9600" + +config KGDB_19200BAUD + bool "19200" + +config KGDB_38400BAUD + bool "38400" + +config KGDB_57600BAUD + bool "57600" + +config KGDB_115200BAUD + bool "115200" +endchoice + +config KGDB_PORT + hex "hex I/O port address of the debug serial port" + depends on KGDB + default 3f8 + help + Some systems (x86 family at this writing) allow the port + address to be configured. The number entered is assumed to be + hex, don't put 0x in front of it. The standard address are: + COM1 3f8 , irq 4 and COM2 2f8 irq 3. Setserial /dev/ttySx + will tell you what you have. It is good to test the serial + connection with a live system before trying to debug. + +config KGDB_IRQ + int "IRQ of the debug serial port" + depends on KGDB + default 4 + help + This is the irq for the debug port. If everything is working + correctly and the kernel has interrupts on a control C to the + port should cause a break into the kernel debug stub. + +config DEBUG_INFO + bool + depends on KGDB + default y + +config KGDB_MORE + bool "Add any additional compile options" + depends on KGDB + default n + help + Saying yes here turns on the ability to enter additional + compile options. + + +config KGDB_OPTIONS + depends on KGDB_MORE + string "Additional compile arguments" + default "-O1" + help + This option allows you enter additional compile options for + the whole kernel compile. Each platform will have a default + that seems right for it. For example on PPC "-ggdb -O1", and + for i386 "-O1". Note that by configuring KGDB "-g" is already + turned on. In addition, on i386 platforms + "-fomit-frame-pointer" is deleted from the standard compile + options. + +config NO_KGDB_CPUS + int "Number of CPUs" + depends on KGDB && SMP + default NR_CPUS + help + + This option sets the number of cpus for kgdb ONLY. It is used + to prune some internal structures so they look "nice" when + displayed with gdb. This is to overcome possibly larger + numbers that may have been entered above. Enter the real + number to get nice clean kgdb_info displays. + +config KGDB_TS + bool "Enable kgdb time stamp macros?" + depends on KGDB + default n + help + Kgdb event macros allow you to instrument your code with calls + to the kgdb event recording function. The event log may be + examined with gdb at a break point. Turning on this + capability also allows you to choose how many events to + keep. Kgdb always keeps the lastest events. + +choice + depends on KGDB_TS + prompt "Max number of time stamps to save?" + default KGDB_TS_128 + +config KGDB_TS_64 + bool "64" + +config KGDB_TS_128 + bool "128" + +config KGDB_TS_256 + bool "256" + +config KGDB_TS_512 + bool "512" + +config KGDB_TS_1024 + bool "1024" + +endchoice + +config STACK_OVERFLOW_TEST + bool "Turn on kernel stack overflow testing?" + depends on KGDB + default n + help + This option enables code in the front line interrupt handlers + to check for kernel stack overflow on interrupts and system + calls. This is part of the kgdb code on x86 systems. + +config KGDB_CONSOLE + bool "Enable serial console thru kgdb port" + depends on KGDB + default n + help + This option enables the command line "console=kgdb" option. + When the system is booted with this option in the command line + all kernel printk output is sent to gdb (as well as to other + consoles). For this to work gdb must be connected. For this + reason, this command line option will generate a breakpoint if + gdb has not yet connected. After the gdb continue command is + given all pent up console output will be printed by gdb on the + host machine. Neither this option, nor KGDB require the + serial driver to be configured. + +config KGDB_SYSRQ + bool "Turn on SysRq 'G' command to do a break?" + depends on KGDB + default y + help + This option includes an option in the SysRq code that allows + you to enter SysRq G which generates a breakpoint to the KGDB + stub. This will work if the keyboard is alive and can + interrupt the system. Because of constraints on when the + serial port interrupt can be enabled, this code may allow you + to interrupt the system before the serial port control C is + available. Just say yes here. + config FRAME_POINTER bool "Compile the kernel with frame pointers" + default KGDB help If you say Y here the resulting kernel image will be slightly larger and slower, but it will give very useful debugging information. If you don't debug the kernel, you can say N, but we may not be able to solve problems without frame pointers. +config MAGIC_SYSRQ + bool + depends on KGDB_SYSRQ + default y + +config 4KSTACKS + def_bool y + config X86_FIND_SMP_CONFIG bool depends on X86_LOCAL_APIC || X86_VOYAGER --- linux-2.6.5-rc3/arch/i386/kernel/acpi/boot.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/kernel/acpi/boot.c 2004-03-31 00:56:34.382446016 -0800 @@ -67,6 +67,10 @@ int acpi_sci_override_gsi __initdata; static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #endif +#ifndef __HAVE_ARCH_CMPXCHG +#warning ACPI uses CMPXCHG, i486 and later hardware +#endif + /* -------------------------------------------------------------------------- Boot-time Configuration -------------------------------------------------------------------------- */ @@ -434,7 +438,7 @@ acpi_scan_rsdp ( * RSDP signature. */ for (offset = 0; offset < length; offset += 16) { - if (strncmp((char *) (start + offset), "RSD PTR ", sig_len)) + if (strncmp((char *) __va(start + offset), "RSD PTR ", sig_len)) continue; return (start + offset); } --- linux-2.6.5-rc3/arch/i386/kernel/acpi/sleep.c 2003-08-08 22:55:10.000000000 -0700 +++ 25/arch/i386/kernel/acpi/sleep.c 2004-03-31 00:56:34.382446016 -0800 @@ -19,13 +19,29 @@ extern void zap_low_mappings(void); extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); -static void init_low_mapping(pgd_t *pgd, int pgd_limit) +static void map_low(pgd_t *pgd_base, unsigned long start, unsigned long end) { - int pgd_ofs = 0; - - while ((pgd_ofs < pgd_limit) && (pgd_ofs + USER_PTRS_PER_PGD < PTRS_PER_PGD)) { - set_pgd(pgd, *(pgd+USER_PTRS_PER_PGD)); - pgd_ofs++, pgd++; + unsigned long vaddr; + pmd_t *pmd; + pgd_t *pgd; + int i, j; + + pgd = pgd_base; + + for (i = 0; i < PTRS_PER_PGD; pgd++, i++) { + vaddr = i*PGDIR_SIZE; + if (end && (vaddr >= end)) + break; + pmd = pmd_offset(pgd, 0); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE; + if (end && (vaddr >= end)) + break; + if (vaddr < start) + continue; + set_pmd(pmd, __pmd(_KERNPG_TABLE + _PAGE_PSE + + vaddr - start)); + } } } @@ -39,7 +55,9 @@ int acpi_save_state_mem (void) { if (!acpi_wakeup_address) return 1; - init_low_mapping(swapper_pg_dir, USER_PTRS_PER_PGD); + if (!cpu_has_pse) + return 1; + map_low(swapper_pg_dir, 0, LOW_MAPPINGS_SIZE); memcpy((void *) acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start); acpi_copy_wakeup_routine(acpi_wakeup_address); --- linux-2.6.5-rc3/arch/i386/kernel/apic.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/i386/kernel/apic.c 2004-03-31 00:55:02.388431232 -0800 @@ -616,7 +616,7 @@ static int __init lapic_disable(char *st clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); return 0; } -__setup("nolapic", lapic_disable); +__early_param("nolapic", lapic_disable); static int __init lapic_enable(char *str) { --- linux-2.6.5-rc3/arch/i386/kernel/asm-offsets.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/kernel/asm-offsets.c 2004-03-31 00:56:34.383445864 -0800 @@ -31,5 +31,19 @@ void foo(void) DEFINE(RT_SIGFRAME_sigcontext, offsetof (struct rt_sigframe, uc.uc_mcontext)); + DEFINE(TI_task, offsetof (struct thread_info, task)); + DEFINE(TI_exec_domain, offsetof (struct thread_info, exec_domain)); + DEFINE(TI_flags, offsetof (struct thread_info, flags)); + DEFINE(TI_preempt_count, offsetof (struct thread_info, preempt_count)); + DEFINE(TI_addr_limit, offsetof (struct thread_info, addr_limit)); + DEFINE(TI_real_stack, offsetof (struct thread_info, real_stack)); + DEFINE(TI_virtual_stack, offsetof (struct thread_info, virtual_stack)); + DEFINE(TI_user_pgd, offsetof (struct thread_info, user_pgd)); + + DEFINE(FIX_ENTRY_TRAMPOLINE_0_addr, + __fix_to_virt(FIX_ENTRY_TRAMPOLINE_0)); + DEFINE(FIX_VSYSCALL_addr, __fix_to_virt(FIX_VSYSCALL)); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); + DEFINE(task_thread_db7, + offsetof (struct task_struct, thread.debugreg[7])); } --- linux-2.6.5-rc3/arch/i386/kernel/cpu/common.c 2004-01-09 00:04:30.000000000 -0800 +++ 25/arch/i386/kernel/cpu/common.c 2004-03-31 00:56:35.426287328 -0800 @@ -196,7 +196,6 @@ int __init have_cpuid_p(void) void __init generic_identify(struct cpuinfo_x86 * c) { u32 tfms, xlvl; - int junk; if (have_cpuid_p()) { /* Get vendor name */ @@ -211,8 +210,8 @@ void __init generic_identify(struct cpui /* Intel-defined flags: level 0x00000001 */ if ( c->cpuid_level >= 0x00000001 ) { - u32 capability, excap; - cpuid(0x00000001, &tfms, &junk, &excap, &capability); + u32 capability, excap, misc; + cpuid(0x00000001, &tfms, &misc, &excap, &capability); c->x86_capability[0] = capability; c->x86_capability[4] = excap; c->x86 = (tfms >> 8) & 15; @@ -222,6 +221,9 @@ void __init generic_identify(struct cpui c->x86_model += ((tfms >> 16) & 0xF) << 4; } c->x86_mask = tfms & 15; + + if (c->x86_capability[0] & (1<<19)) + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; } else { /* Have CPUID level 0 only - unheard of */ c->x86 = 4; @@ -261,16 +263,13 @@ static int __init x86_serial_nr_setup(ch } __setup("serialnumber", x86_serial_nr_setup); - - /* * This does the hard work of actually picking apart the CPU stuff... */ -void __init identify_cpu(struct cpuinfo_x86 *c) +void __init early_identify_cpu(struct cpuinfo_x86 *c) { int i; - c->loops_per_jiffy = loops_per_jiffy; c->x86_cache_size = -1; c->x86_vendor = X86_VENDOR_UNKNOWN; c->cpuid_level = -1; /* CPUID not detected */ @@ -279,6 +278,7 @@ void __init identify_cpu(struct cpuinfo_ c->x86_model_id[0] = '\0'; /* Unset */ memset(&c->x86_capability, 0, sizeof c->x86_capability); + c->x86_clflush_size = 0; if (!have_cpuid_p()) { /* First of all, decide if this is a 486 or higher */ /* It's a 486 if we can modify the AC flag */ @@ -299,12 +299,12 @@ void __init identify_cpu(struct cpuinfo_ if (this_cpu->c_identify) { this_cpu->c_identify(c); - printk(KERN_DEBUG "CPU: After vendor identify, caps: %08lx %08lx %08lx %08lx\n", - c->x86_capability[0], - c->x86_capability[1], - c->x86_capability[2], - c->x86_capability[3]); -} + printk(KERN_DEBUG "CPU: After vendor identify, caps: %08lx %08lx %08lx %08lx\n", + c->x86_capability[0], + c->x86_capability[1], + c->x86_capability[2], + c->x86_capability[3]); + } /* * Vendor-specific initialization. In this section we @@ -360,6 +360,16 @@ void __init identify_cpu(struct cpuinfo_ c->x86_capability[2], c->x86_capability[3]); + if (!c->x86_clflush_size) { + /* No cache line size autodetected - manual estimate: */ + if (c->x86 <= 4) + c->x86_clflush_size = 16; + else + c->x86_clflush_size = 32; + } + printk(KERN_DEBUG "CPU: Cache line size %d.\n", c->x86_clflush_size); + + /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -371,12 +381,28 @@ void __init identify_cpu(struct cpuinfo_ for ( i = 0 ; i < NCAPINTS ; i++ ) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } +} +void __init late_identify_cpu(struct cpuinfo_x86 *c) +{ /* Init Machine Check Exception if available. */ #ifdef CONFIG_X86_MCE mcheck_init(c); #endif + /* + * The timer is not yet running when identify cpu is called for the + * first cpu - check_bugs() calls late_identify_cpu and transfers + * loops_per_jiffy from calibrate_delay into the cpu data area. + */ + c->loops_per_jiffy = loops_per_jiffy; } + +void __init identify_cpu(struct cpuinfo_x86 *c) +{ + early_identify_cpu(c); + late_identify_cpu(c); +} + /* * Perform early boot up checks for a valid TSC. See arch/i386/kernel/time.c */ @@ -514,12 +540,16 @@ void __init cpu_init (void) set_tss_desc(cpu,t); cpu_gdt_table[cpu][GDT_ENTRY_TSS].b &= 0xfffffdff; load_TR_desc(); - load_LDT(&init_mm.context); + if (cpu) + load_LDT(&init_mm.context); /* Set up doublefault TSS pointer in the GDT */ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); cpu_gdt_table[cpu][GDT_ENTRY_DOUBLEFAULT_TSS].b &= 0xfffffdff; + if (cpu) + trap_init_virtual_GDT(); + /* Clear %fs and %gs. */ asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); --- linux-2.6.5-rc3/arch/i386/kernel/cpu/cpufreq/Kconfig 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/i386/kernel/cpu/cpufreq/Kconfig 2004-03-30 23:39:45.029174224 -0800 @@ -11,8 +11,8 @@ config CPU_FREQ fly. This is a nice method to save battery power on notebooks, because the lower the clock speed, the less power the CPU consumes. - For more information, take a look at linux/Documentation/cpu-freq or - at + For more information, take a look at + or at If in doubt, say N. @@ -38,7 +38,7 @@ config X86_ACPI_CPUFREQ This driver adds a CPUFreq driver which utilizes the ACPI Processor Performance States. - For details, take a look at linux/Documentation/cpu-freq. + For details, take a look at . If in doubt, say N. @@ -63,7 +63,7 @@ config ELAN_CPUFREQ parameter: elanfreq=maxspeed (in kHz) or as module parameter "max_freq". - For details, take a look at linux/Documentation/cpu-freq. + For details, take a look at . If in doubt, say N. @@ -74,7 +74,7 @@ config X86_POWERNOW_K6 This adds the CPUFreq driver for mobile AMD K6-2+ and mobile AMD K6-3+ processors. - For details, take a look at linux/Documentation/cpu-freq. + For details, take a look at . If in doubt, say N. @@ -84,7 +84,7 @@ config X86_POWERNOW_K7 help This adds the CPUFreq driver for mobile AMD K7 mobile processors. - For details, take a look at linux/Documentation/cpu-freq. + For details, take a look at . If in doubt, say N. @@ -94,7 +94,7 @@ config X86_POWERNOW_K8 help This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors. - For details, take a look at linux/Documentation/cpu-freq. + For details, take a look at . If in doubt, say N. @@ -105,7 +105,7 @@ config X86_GX_SUSPMOD This add the CPUFreq driver for NatSemi Geode processors which support suspend modulation. - For details, take a look at linux/Documentation/cpu-freq. + For details, take a look at . If in doubt, say N. @@ -116,10 +116,25 @@ config X86_SPEEDSTEP_CENTRINO This adds the CPUFreq driver for Enhanced SpeedStep enabled mobile CPUs. This means Intel Pentium M (Centrino) CPUs. - For details, take a look at linux/Documentation/cpu-freq. + For details, take a look at . If in doubt, say N. +config X86_SPEEDSTEP_CENTRINO_TABLE + bool + depends on X86_SPEEDSTEP_CENTRINO + default y + +config X86_SPEEDSTEP_CENTRINO_ACPI + bool "Use ACPI tables to decode valid frequency/voltage pairs (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on ((X86_SPEEDSTEP_CENTRINO = "m" && ACPI_PROCESSOR) || (X86_SPEEDSTEP_CENTRINO = "y" && ACPI_PROCESSOR = "y")) + help + Use primarily the information provided in the BIOS ACPI tables + to determine valid CPU frequency and voltage pairings. + + If in doubt, say Y. + config X86_SPEEDSTEP_ICH tristate "Intel Speedstep on ICH-M chipsets (ioport interface)" depends on CPU_FREQ_TABLE @@ -129,7 +144,7 @@ config X86_SPEEDSTEP_ICH mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2, ICH3 or ICH4 southbridge. - For details, take a look at linux/Documentation/cpu-freq. + For details, take a look at . If in doubt, say N. @@ -141,7 +156,7 @@ config X86_SPEEDSTEP_SMI (Coppermine), all mobile Intel Pentium III-M (Tualatin) on systems which have an Intel 440BX/ZX/MX southbridge. - For details, take a look at linux/Documentation/cpu-freq. + For details, take a look at . If in doubt, say N. @@ -152,7 +167,7 @@ config X86_P4_CLOCKMOD This adds the CPUFreq driver for Intel Pentium 4 / XEON processors. - For details, take a look at linux/Documentation/cpu-freq. + For details, take a look at . If in doubt, say N. @@ -161,6 +176,16 @@ config X86_SPEEDSTEP_LIB depends on (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD) default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD) +config X86_SPEEDSTEP_RELAXED_CAP_CHECK + bool "Relaxed speedstep capability checks" + depends on (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH) + help + Don't perform all checks for a speedstep capable system which would + normally be done. Some ancient or strange systems, though speedstep + capable, don't always indicate that they are speedstep capable. This + option let's the probing code bypass some of those checks if the + parameter "relaxed_check=1" is passed to the module. + config X86_LONGRUN tristate "Transmeta LongRun" depends on CPU_FREQ @@ -168,7 +193,7 @@ config X86_LONGRUN This adds the CPUFreq driver for Transmeta Crusoe processors which support LongRun. - For details, take a look at linux/Documentation/cpu-freq. + For details, take a look at . If in doubt, say N. @@ -180,7 +205,7 @@ config X86_LONGHAUL VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T processors. - For details, take a look at linux/Documentation/cpu-freq. + For details, take a look at . If in doubt, say N. --- linux-2.6.5-rc3/arch/i386/kernel/cpu/cpufreq/longhaul.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/i386/kernel/cpu/cpufreq/longhaul.c 2004-03-30 20:21:49.000000000 -0800 @@ -458,7 +458,7 @@ static int __init longhaul_cpu_init (str return 0; } -static int longhaul_cpu_exit(struct cpufreq_policy *policy) +static int __exit longhaul_cpu_exit(struct cpufreq_policy *policy) { cpufreq_frequency_table_put_attr(policy->cpu); return 0; --- linux-2.6.5-rc3/arch/i386/kernel/cpu/cpufreq/longrun.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/i386/kernel/cpu/cpufreq/longrun.c 2004-03-30 20:21:49.000000000 -0800 @@ -33,7 +33,7 @@ static unsigned int longrun_low_freq, lo * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS * and MSR_TMTA_LONGRUN_CTRL */ -static void longrun_get_policy(struct cpufreq_policy *policy) +static void __init longrun_get_policy(struct cpufreq_policy *policy) { u32 msr_lo, msr_hi; --- linux-2.6.5-rc3/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2004-03-31 00:54:25.009113752 -0800 @@ -57,8 +57,7 @@ static int cpufreq_p4_setdc(unsigned int u32 l, h; cpumask_t cpus_allowed, affected_cpu_map; struct cpufreq_freqs freqs; - int hyperthreading = 0; - int sibling = 0; + int j; if (!cpu_online(cpu) || (newstate > DC_DISABLE) || (newstate == DC_RESV)) @@ -68,13 +67,10 @@ static int cpufreq_p4_setdc(unsigned int cpus_allowed = current->cpus_allowed; /* only run on CPU to be set, or on its sibling */ - affected_cpu_map = cpumask_of_cpu(cpu); -#ifdef CONFIG_X86_HT - hyperthreading = ((cpu_has_ht) && (smp_num_siblings == 2)); - if (hyperthreading) { - sibling = cpu_sibling_map[cpu]; - cpu_set(sibling, affected_cpu_map); - } +#ifdef CONFIG_SMP + affected_cpu_map = cpu_sibling_map[cpu]; +#else + affected_cpu_map = cpumask_of_cpu(cpu); #endif set_cpus_allowed(current, affected_cpu_map); BUG_ON(!cpu_isset(smp_processor_id(), affected_cpu_map)); @@ -97,11 +93,11 @@ static int cpufreq_p4_setdc(unsigned int /* notifiers */ freqs.old = stock_freq * l / 8; freqs.new = stock_freq * newstate / 8; - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - if (hyperthreading) { - freqs.cpu = sibling; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + for_each_cpu(j) { + if (cpu_isset(j, affected_cpu_map)) { + freqs.cpu = j; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + } } rdmsr(MSR_IA32_THERM_STATUS, l, h); @@ -132,10 +128,11 @@ static int cpufreq_p4_setdc(unsigned int set_cpus_allowed(current, cpus_allowed); /* notifiers */ - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - if (hyperthreading) { - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + for_each_cpu(j) { + if (cpu_isset(j, affected_cpu_map)) { + freqs.cpu = j; + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } } return 0; @@ -181,21 +178,24 @@ static unsigned int cpufreq_p4_get_frequ { if ((c->x86 == 0x06) && (c->x86_model == 0x09)) { /* Pentium M */ - printk(KERN_DEBUG PFX "Warning: Pentium M detected. The speedstep_centrino module\n"); - printk(KERN_DEBUG PFX "offers voltage scaling in addition of frequency scaling. You\n"); - printk(KERN_DEBUG PFX "should use that instead of p4-clockmod, if possible.\n"); + printk(KERN_WARNING PFX "Warning: Pentium M detected. " + "The speedstep_centrino module offers voltage scaling" + " in addition of frequency scaling. You should use " + "that instead of p4-clockmod, if possible.\n"); return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PM); } if (c->x86 != 0xF) { - printk(KERN_DEBUG PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to \n"); + printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to \n"); return 0; } if (speedstep_detect_processor() == SPEEDSTEP_PROCESSOR_P4M) { - printk(KERN_DEBUG PFX "Warning: Pentium 4-M detected. The speedstep-ich or acpi cpufreq \n"); - printk(KERN_DEBUG PFX "modules offers voltage scaling in addition of frequency scaling. You\n"); - printk(KERN_DEBUG PFX "should use either one instead of p4-clockmod, if possible.\n"); + printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. " + "The speedstep-ich or acpi cpufreq modules offers " + "voltage scaling in addition of frequency scaling. " + "You should use either one instead of p4-clockmod, " + "if possible.\n"); return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_P4M); } --- linux-2.6.5-rc3/arch/i386/kernel/cpu/cpufreq/powernow-k8.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/i386/kernel/cpu/cpufreq/powernow-k8.c 2004-03-31 00:54:50.187286088 -0800 @@ -1,22 +1,24 @@ /* - * (c) 2003 Advanced Micro Devices, Inc. + * (c) 2003, 2004 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "../../../COPYING" or + * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html * * Support : paul.devriendt@amd.com * * Based on the powernow-k7.c module written by Dave Jones. - * (C) 2003 Dave Jones on behalf of SuSE Labs + * (C) 2003 Dave Jones on behalf of SuSE Labs * (C) 2004 Dominik Brodowski * (C) 2004 Pavel Machek * Licensed under the terms of the GNU GPL License version 2. * Based upon datasheets & sample CPUs kindly provided by AMD. * + * Valuable input gratefully received from Dave Jones, Pavel Machek, Dominik + * Brodowski, and others. + * * Processor information obtained from Chapter 9 (Power and Thermal Management) * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD - * Opteron Processors", revision 3.03, available for download from www.amd.com - * + * Opteron Processors" available for download from www.amd.com */ #include @@ -31,55 +33,47 @@ #include #include +#ifdef CONFIG_ACPI +#define CONFIG_X86_POWERNOW_K8_ACPI +#endif +#ifdef CONFIG_X86_POWERNOW_K8_ACPI + +#include +#include + +#endif /* CONFIG_X86_POWERNOW_K8_ACPI */ + + #define PFX "powernow-k8: " -#define BFX PFX "BIOS error: " -#define VERSION "version 1.00.08a" +#define VERSION "version 1.20.08b - March 20, 2004" #include "powernow-k8.h" -static u32 vstable; /* voltage stabalization time, from PSB, units 20 us */ -static u32 plllock; /* pll lock time, from PSB, units 1 us */ -static u32 numps; /* number of p-states, from PSB */ -static u32 rvo; /* ramp voltage offset, from PSB */ -static u32 irt; /* isochronous relief time, from PSB */ -static u32 vidmvs; /* usable value calculated from mvs, from PSB */ -static u32 currvid; /* keep track of the current fid / vid */ -static u32 currfid; +/* serialize freq changes */ +static DECLARE_MUTEX(fidvid_sem); -static struct cpufreq_frequency_table *powernow_table; +static struct powernow_k8_data *powernow_data[NR_CPUS]; -/* -The PSB table supplied by BIOS allows for the definition of the number of -p-states that can be used when running on a/c, and the number of p-states -that can be used when running on battery. This allows laptop manufacturers -to force the system to save power when running from battery. The relationship -is : - 1 <= number_of_battery_p_states <= maximum_number_of_p_states - -This driver does NOT have the support in it to detect transitions from -a/c power to battery power, and thus trigger the transition to a lower -p-state if required. This is because I need ACPI and the 2.6 kernel to do -this, and this is a 2.4 kernel driver. Check back for a new improved driver -for the 2.6 kernel soon. - -This code therefore assumes it is on battery at all times, and thus -restricts performance to number_of_battery_p_states. For desktops, - number_of_battery_p_states == maximum_number_of_pstates, -so this is not actually a restriction. -*/ -static u32 batps; /* limit on the number of p states when on battery */ - /* - set by BIOS in the PSB/PST */ +/* Return a frequency in MHz, given an input fid */ +static inline u32 find_freq_from_fid(u32 fid) +{ + return 800 + (fid * 100); +} - /* Return a frequency in MHz, given an input fid */ -static u32 find_freq_from_fid(u32 fid) +/* Return a frequency in KHz, given an input fid */ +static inline u32 find_khz_freq_from_fid(u32 fid) { - return 800 + (fid * 100); + return 1000 * find_freq_from_fid(fid); } +/* Return a voltage in miliVolts, given an input vid */ +static inline u32 find_milivolts_from_vid(struct powernow_k8_data *data, u32 vid) +{ + return 1550-vid*25; +} /* Return the vco fid for an input fid */ -static u32 -convert_fid_to_vco_fid(u32 fid) +static u32 convert_fid_to_vco_fid(u32 fid) { if (fid < HI_FID_TABLE_BOTTOM) { return 8 + (2 * fid); @@ -89,11 +83,10 @@ convert_fid_to_vco_fid(u32 fid) } /* - * Return 1 if the pending bit is set. Unless we are actually just told the - * processor to transition a state, seeing this bit set is really bad news. + * Return 1 if the pending bit is set. Unless we just instructed the processor + * to transition to a new state, seeing this bit set is really bad news. */ -static inline int -pending_bit_stuck(void) +static inline int pending_bit_stuck(void) { u32 lo, hi; @@ -102,11 +95,10 @@ pending_bit_stuck(void) } /* - * Update the global current fid / vid values from the status msr. Returns 1 - * on error. + * Update the global current fid / vid values from the status msr. Returns + * 1 on error. */ -static int -query_current_values_with_pending_wait(void) +static int query_current_values_with_pending_wait(struct powernow_k8_data *data) { u32 lo, hi; u32 i = 0; @@ -120,63 +112,69 @@ query_current_values_with_pending_wait(v rdmsr(MSR_FIDVID_STATUS, lo, hi); } - currvid = hi & MSR_S_HI_CURRENT_VID; - currfid = lo & MSR_S_LO_CURRENT_FID; + data->currvid = hi & MSR_S_HI_CURRENT_VID; + data->currfid = lo & MSR_S_LO_CURRENT_FID; return 0; } /* the isochronous relief time */ -static inline void -count_off_irt(void) +static inline void count_off_irt(struct powernow_k8_data *data) { - udelay((1 << irt) * 10); + udelay((1 << data->irt) * 10); return; } -/* the voltage stabalization time */ -static inline void -count_off_vst(void) +/* the voltage stabilization time */ +static inline void count_off_vst(struct powernow_k8_data *data) { - udelay(vstable * VST_UNITS_20US); + udelay(data->vstable * VST_UNITS_20US); return; } +/* need to init the control msr to a safe value (for each cpu) */ +static void fidvid_msr_init(void) +{ + u32 lo, hi; + u8 fid, vid; + + rdmsr(MSR_FIDVID_STATUS, lo, hi); + vid = hi & MSR_S_HI_CURRENT_VID; + fid = lo & MSR_S_LO_CURRENT_FID; + lo = fid | (vid << MSR_C_LO_VID_SHIFT); + hi = MSR_C_HI_STP_GNT_BENIGN; + dprintk(PFX "cpu%d, init lo %x, hi %x\n", smp_processor_id(), lo, hi); + wrmsr(MSR_FIDVID_CTL, lo, hi); +} + /* write the new fid value along with the other control fields to the msr */ -static int -write_new_fid(u32 fid) +static int write_new_fid(struct powernow_k8_data *data, u32 fid) { u32 lo; - u32 savevid = currvid; + u32 savevid = data->currvid; - if ((fid & INVALID_FID_MASK) || (currvid & INVALID_VID_MASK)) { + if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) { printk(KERN_ERR PFX "internal error - overflow on fid write\n"); return 1; } - lo = fid | (currvid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID; - + lo = fid | (data->currvid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID; dprintk(KERN_DEBUG PFX "writing fid %x, lo %x, hi %x\n", - fid, lo, plllock * PLL_LOCK_CONVERSION); - - wrmsr(MSR_FIDVID_CTL, lo, plllock * PLL_LOCK_CONVERSION); - - if (query_current_values_with_pending_wait()) + fid, lo, data->plllock * PLL_LOCK_CONVERSION); + wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION); + if (query_current_values_with_pending_wait(data)) return 1; + count_off_irt(data); - count_off_irt(); - - if (savevid != currvid) { - printk(KERN_ERR PFX - "vid changed on fid transition, save %x, currvid %x\n", - savevid, currvid); + if (savevid != data->currvid) { + printk(KERN_ERR PFX "vid change on fid trans, old %x, new %x\n", + savevid, data->currvid); return 1; } - if (fid != currfid) { - printk(KERN_ERR PFX - "fid transition failed, fid %x, currfid %x\n", - fid, currfid); + if (fid != data->currfid) { + printk(KERN_ERR PFX "fid trans failed, fid %x, curr %x\n", fid, + data->currfid); return 1; } @@ -184,40 +182,33 @@ write_new_fid(u32 fid) } /* Write a new vid to the hardware */ -static int -write_new_vid(u32 vid) +static int write_new_vid(struct powernow_k8_data *data, u32 vid) { u32 lo; - u32 savefid = currfid; + u32 savefid = data->currfid; - if ((currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) { + if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) { printk(KERN_ERR PFX "internal error - overflow on vid write\n"); return 1; } - lo = currfid | (vid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID; - + lo = data->currfid | (vid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID; dprintk(KERN_DEBUG PFX "writing vid %x, lo %x, hi %x\n", vid, lo, STOP_GRANT_5NS); - wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS); - - if (query_current_values_with_pending_wait()) { + if (query_current_values_with_pending_wait(data)) return 1; - } - if (savefid != currfid) { - printk(KERN_ERR PFX - "fid changed on vid transition, save %x currfid %x\n", - savefid, currfid); + if (savefid != data->currfid) { + printk(KERN_ERR PFX "fid changed on vid trans, old %x new %x\n", + savefid, data->currfid); return 1; } - if (vid != currvid) { - printk(KERN_ERR PFX - "vid transition failed, vid %x, currvid %x\n", - vid, currvid); - return 1; + if (vid != data->currvid) { + printk(KERN_ERR PFX "vid trans failed, vid %x, curr %x\n", vid, + data->currvid); + return 1; } return 0; @@ -228,300 +219,279 @@ write_new_vid(u32 vid) * Decreasing vid codes represent increasing voltages: * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of 0x1f is off. */ -static int -decrease_vid_code_by_step(u32 reqvid, u32 step) +static int decrease_vid_code_by_step(struct powernow_k8_data *data, u32 reqvid, u32 step) { - if ((currvid - reqvid) > step) - reqvid = currvid - step; - - if (write_new_vid(reqvid)) + if ((data->currvid - reqvid) > step) + reqvid = data->currvid - step; + if (write_new_vid(data, reqvid)) return 1; - - count_off_vst(); - + count_off_vst(data); return 0; } /* Change the fid and vid, by the 3 phases. */ -static inline int -transition_fid_vid(u32 reqfid, u32 reqvid) +static inline int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 reqvid) { - if (core_voltage_pre_transition(reqvid)) + if (core_voltage_pre_transition(data, reqvid)) return 1; - - if (core_frequency_transition(reqfid)) + if (core_frequency_transition(data, reqfid)) return 1; - - if (core_voltage_post_transition(reqvid)) + if (core_voltage_post_transition(data, reqvid)) return 1; - - if (query_current_values_with_pending_wait()) + if (query_current_values_with_pending_wait(data)) return 1; - if ((reqfid != currfid) || (reqvid != currvid)) { - printk(KERN_ERR PFX "failed: req 0x%x 0x%x, curr 0x%x 0x%x\n", - reqfid, reqvid, currfid, currvid); + if ((reqfid != data->currfid) || (reqvid != data->currvid)) { + printk(KERN_ERR PFX "failed (cpu%d): req %x %x, curr %x %x\n", + smp_processor_id(), + reqfid, reqvid, data->currfid, data->currvid); return 1; } - dprintk(KERN_INFO PFX - "transitioned: new fid 0x%x, vid 0x%x\n", currfid, currvid); - + dprintk(KERN_INFO PFX "transitioned (cpu%d): new fid %x, vid %x\n", + smp_processor_id(), data->currfid, data->currvid); return 0; } -/* - * Phase 1 - core voltage transition ... setup appropriate voltage for the - * fid transition. - */ -static inline int -core_voltage_pre_transition(u32 reqvid) +/* Phase 1 - core voltage transition ... setup voltage */ +static inline int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid) { - u32 rvosteps = rvo; - u32 savefid = currfid; + u32 rvosteps = data->rvo; + u32 savefid = data->currfid; dprintk(KERN_DEBUG PFX - "ph1: start, currfid 0x%x, currvid 0x%x, reqvid 0x%x, rvo %x\n", - currfid, currvid, reqvid, rvo); - - while (currvid > reqvid) { - dprintk(KERN_DEBUG PFX "ph1: curr 0x%x, requesting vid 0x%x\n", - currvid, reqvid); - if (decrease_vid_code_by_step(reqvid, vidmvs)) + "ph1 (cpu%d): start, currfid %x, currvid %x, reqvid %x, rvo %x\n", + smp_processor_id(), + data->currfid, data->currvid, reqvid, data->rvo); + + while (data->currvid > reqvid) { + dprintk(KERN_DEBUG PFX "ph1: curr %x, req vid %x\n", + data->currvid, reqvid); + if (decrease_vid_code_by_step(data, reqvid, data->vidmvs)) return 1; } while (rvosteps > 0) { - if (currvid == 0) { + if (data->currvid == 0) { rvosteps = 0; } else { dprintk(KERN_DEBUG PFX - "ph1: changing vid for rvo, requesting 0x%x\n", - currvid - 1); - if (decrease_vid_code_by_step(currvid - 1, 1)) + "ph1: changing vid for rvo, req %x\n", + data->currvid - 1); + if (decrease_vid_code_by_step(data, data->currvid - 1, 1)) return 1; rvosteps--; } } - if (query_current_values_with_pending_wait()) + if (query_current_values_with_pending_wait(data)) return 1; - if (savefid != currfid) { - printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n", currfid); + if (savefid != data->currfid) { + printk(KERN_ERR PFX "ph1: err, currfid changed %x\n", data->currfid); return 1; } - dprintk(KERN_DEBUG PFX "ph1 complete, currfid 0x%x, currvid 0x%x\n", - currfid, currvid); + dprintk(KERN_DEBUG PFX "ph1: complete, currfid %x, currvid %x\n", + data->currfid, data->currvid); return 0; } /* Phase 2 - core frequency transition */ -static inline int -core_frequency_transition(u32 reqfid) +static inline int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) { u32 vcoreqfid; u32 vcocurrfid; u32 vcofiddiff; - u32 savevid = currvid; + u32 savevid = data->currvid; - if ((reqfid < HI_FID_TABLE_BOTTOM) && (currfid < HI_FID_TABLE_BOTTOM)) { - printk(KERN_ERR PFX "ph2 illegal lo-lo transition 0x%x 0x%x\n", - reqfid, currfid); + if ((reqfid < HI_FID_TABLE_BOTTOM) && (data->currfid < HI_FID_TABLE_BOTTOM)) { + printk(KERN_ERR PFX "ph2: illegal lo-lo transition %x %x\n", + reqfid, data->currfid); return 1; } - if (currfid == reqfid) { - printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", currfid); + if (data->currfid == reqfid) { + printk(KERN_ERR PFX "ph2: null fid transition %x\n", data->currfid); return 0; } dprintk(KERN_DEBUG PFX - "ph2 starting, currfid 0x%x, currvid 0x%x, reqfid 0x%x\n", - currfid, currvid, reqfid); + "ph2 (cpu%d): starting, currfid %x, currvid %x, reqfid %x\n", + smp_processor_id(), + data->currfid, data->currvid, reqfid); vcoreqfid = convert_fid_to_vco_fid(reqfid); - vcocurrfid = convert_fid_to_vco_fid(currfid); + vcocurrfid = convert_fid_to_vco_fid(data->currfid); vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid : vcoreqfid - vcocurrfid; while (vcofiddiff > 2) { - if (reqfid > currfid) { - if (currfid > LO_FID_TABLE_TOP) { - if (write_new_fid(currfid + 2)) { + if (reqfid > data->currfid) { + if (data->currfid > LO_FID_TABLE_TOP) { + if (write_new_fid(data, data->currfid + 2)) { return 1; } } else { if (write_new_fid - (2 + convert_fid_to_vco_fid(currfid))) { + (data, 2 + convert_fid_to_vco_fid(data->currfid))) { return 1; } } } else { - if (write_new_fid(currfid - 2)) + if (write_new_fid(data, data->currfid - 2)) return 1; } - vcocurrfid = convert_fid_to_vco_fid(currfid); + vcocurrfid = convert_fid_to_vco_fid(data->currfid); vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid : vcoreqfid - vcocurrfid; } - if (write_new_fid(reqfid)) + if (write_new_fid(data, reqfid)) return 1; - - if (query_current_values_with_pending_wait()) + if (query_current_values_with_pending_wait(data)) return 1; - if (currfid != reqfid) { + if (data->currfid != reqfid) { printk(KERN_ERR PFX - "ph2 mismatch, failed fid transition, curr %x, req %x\n", - currfid, reqfid); + "ph2: mismatch, failed fid transition, curr %x, req %x\n", + data->currfid, reqfid); return 1; } - if (savevid != currvid) { - printk(KERN_ERR PFX - "ph2 vid changed, save %x, curr %x\n", savevid, - currvid); + if (savevid != data->currvid) { + printk(KERN_ERR PFX "ph2: vid changed, save %x, curr %x\n", + savevid, data->currvid); return 1; } - dprintk(KERN_DEBUG PFX "ph2 complete, currfid 0x%x, currvid 0x%x\n", - currfid, currvid); + dprintk(KERN_DEBUG PFX "ph2: complete, currfid %x, currvid %x\n", + data->currfid, data->currvid); return 0; } /* Phase 3 - core voltage transition flow ... jump to the final vid. */ -static inline int -core_voltage_post_transition(u32 reqvid) +static inline int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid) { - u32 savefid = currfid; + u32 savefid = data->currfid; u32 savereqvid = reqvid; - dprintk(KERN_DEBUG PFX "ph3 starting, currfid 0x%x, currvid 0x%x\n", - currfid, currvid); + dprintk(KERN_DEBUG PFX "ph3 (cpu%d): starting, currfid %x, currvid %x\n", + smp_processor_id(), + data->currfid, data->currvid); - if (reqvid != currvid) { - if (write_new_vid(reqvid)) + if (reqvid != data->currvid) { + if (write_new_vid(data, reqvid)) return 1; - if (savefid != currfid) { + if (savefid != data->currfid) { printk(KERN_ERR PFX "ph3: bad fid change, save %x, curr %x\n", - savefid, currfid); + savefid, data->currfid); return 1; } - if (currvid != reqvid) { + if (data->currvid != reqvid) { printk(KERN_ERR PFX "ph3: failed vid transition\n, req %x, curr %x", - reqvid, currvid); + reqvid, data->currvid); return 1; } } - if (query_current_values_with_pending_wait()) + if (query_current_values_with_pending_wait(data)) return 1; - if (savereqvid != currvid) { - dprintk(KERN_ERR PFX "ph3 failed, currvid 0x%x\n", currvid); + if (savereqvid != data->currvid) { + printk(KERN_ERR PFX "ph3: failed, currvid %x\n", data->currvid); return 1; } - if (savefid != currfid) { - dprintk(KERN_ERR PFX "ph3 failed, currfid changed 0x%x\n", - currfid); + if (savefid != data->currfid) { + printk(KERN_ERR PFX "ph3: failed, currfid changed %x\n", + data->currfid); return 1; } - dprintk(KERN_DEBUG PFX "ph3 complete, currfid 0x%x, currvid 0x%x\n", - currfid, currvid); - + dprintk(KERN_DEBUG PFX "ph3: complete, currfid %x, currvid %x\n", + data->currfid, data->currvid); return 0; } -static inline int -check_supported_cpu(void) +static inline int check_supported_cpu(unsigned int cpu) { - struct cpuinfo_x86 *c = cpu_data; + cpumask_t oldmask = CPU_MASK_ALL; u32 eax, ebx, ecx, edx; + unsigned int rc = 0; - if (num_online_cpus() != 1) { - printk(KERN_INFO PFX "multiprocessor systems not supported\n"); - return 0; - } + oldmask = current->cpus_allowed; + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + schedule(); - if (c->x86_vendor != X86_VENDOR_AMD) { -#ifdef MODULE - printk(KERN_INFO PFX "Not an AMD processor\n"); -#endif - return 0; + if (smp_processor_id() != cpu) { + printk(KERN_ERR "limiting to cpu %u failed\n", cpu); + goto out; } + if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) + goto out; + eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); if ((eax & CPUID_XFAM_MOD) == ATHLON64_XFAM_MOD) { dprintk(KERN_DEBUG PFX "AMD Althon 64 Processor found\n"); - if ((eax & CPUID_F1_STEP) < ATHLON64_REV_C0) { - printk(KERN_INFO PFX "Revision C0 or better " - "AMD Athlon 64 processor required\n"); - return 0; - } } else if ((eax & CPUID_XFAM_MOD) == OPTERON_XFAM_MOD) { dprintk(KERN_DEBUG PFX "AMD Opteron Processor found\n"); } else { printk(KERN_INFO PFX "AMD Athlon 64 or AMD Opteron processor required\n"); - return 0; + goto out; } eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { - printk(KERN_INFO PFX - "No frequency change capabilities detected\n"); - return 0; + printk(KERN_INFO PFX "No freq change capabilities\n"); + goto out; } cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); if ((edx & P_STATE_TRANSITION_CAPABLE) != P_STATE_TRANSITION_CAPABLE) { printk(KERN_INFO PFX "Power state transitions not supported\n"); - return 0; + goto out; } - printk(KERN_INFO PFX "Found AMD64 processor supporting PowerNow (" VERSION ")\n"); - return 1; + rc = 1; + + out: + set_cpus_allowed(current, oldmask); + schedule(); + return rc; } -static int check_pst_table(struct pst_s *pst, u8 maxvid) +static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid) { unsigned int j; - u8 lastfid = 0xFF; + u8 lastfid = 0xff; - for (j = 0; j < numps; j++) { + for (j = 0; j < data->numps; j++) { if (pst[j].vid > LEAST_VID) { - printk(KERN_ERR PFX "vid %d invalid : 0x%x\n", j, pst[j].vid); + printk(KERN_ERR PFX "vid %d bad: %x\n", j, pst[j].vid); return -EINVAL; } - if (pst[j].vid < rvo) { /* vid + rvo >= 0 */ - printk(KERN_ERR PFX - "BIOS error - 0 vid exceeded with pstate %d\n", - j); + if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */ + printk(KERN_ERR PFX "0 vid exceeded with pst %d\n", j); return -ENODEV; } - if (pst[j].vid < maxvid + rvo) { /* vid + rvo >= maxvid */ - printk(KERN_ERR PFX - "BIOS error - maxvid exceeded with pstate %d\n", - j); + if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */ + printk(KERN_ERR PFX "maxvid exceeded with pst %d\n", j); return -ENODEV; } if ((pst[j].fid > MAX_FID) || (pst[j].fid & 1) || (j && (pst[j].fid < HI_FID_TABLE_BOTTOM))) { - /* Only first fid is allowed to be in "low" range */ - printk(KERN_ERR PFX "fid %d invalid : 0x%x\n", j, pst[j].fid); + printk(KERN_ERR PFX "fid %d bad: %x\n", j, pst[j].fid); return -EINVAL; } if (pst[j].fid < lastfid) @@ -531,20 +501,87 @@ static int check_pst_table(struct pst_s printk(KERN_ERR PFX "lastfid invalid\n"); return -EINVAL; } - if (lastfid > LO_FID_TABLE_TOP) { - printk(KERN_INFO PFX "first fid not from lo freq table\n"); + if (lastfid > LO_FID_TABLE_TOP) + printk(KERN_INFO PFX "first fid not from lo freq table\n"); + + return 0; +} + +static void print_basics(struct powernow_k8_data *data) +{ + int j; + for (j = 0; j < data->numps; j++) { + printk(KERN_INFO PFX " %d : fid %x (%d MHz), vid %x (%d mV)\n", j, + data->powernow_table[j].index & 0xff, + data->powernow_table[j].frequency/1000, + data->powernow_table[j].index >> 8, + find_milivolts_from_vid(data, data->powernow_table[j].index >> 8)); + } + if (data->batps) + printk(KERN_INFO PFX "Only %d pstates on battery\n", data->batps); +} + +static inline int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid) +{ + struct cpufreq_frequency_table *powernow_table; + unsigned int j; + + if (data->batps) { /* use ACPI support to get full speed on mains power */ + printk(KERN_WARNING PFX "Only %d pstates usable (use ACPI driver for full range\n", data->batps); + data->numps = data->batps; + } + + for ( j=1; jnumps; j++ ) + if (pst[j-1].fid >= pst[j].fid) { + printk(KERN_ERR PFX "PST out of sequence\n"); + return -EINVAL; + } + + if (data->numps < 2) { + printk(KERN_ERR PFX "no p states to transition\n"); + return -ENODEV; + } + + if (check_pst_table(data, pst, maxvid)) + return -EINVAL; + + powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) + * (data->numps + 1)), GFP_KERNEL); + if (!powernow_table) { + printk(KERN_ERR PFX "powernow_table memory alloc failure\n"); + return -ENOMEM; + } + + for (j = 0; j < data->numps; j++) { + powernow_table[j].index = pst[j].fid; /* lower 8 bits */ + powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */ + powernow_table[j].frequency = find_khz_freq_from_fid(pst[j].fid); + } + powernow_table[data->numps].frequency = CPUFREQ_TABLE_END; + powernow_table[data->numps].index = 0; + + if (query_current_values_with_pending_wait(data)) { + kfree(powernow_table); + return -EIO; } + dprintk(KERN_INFO PFX "cfid %x, cvid %x\n", data->currfid, data->currvid); + data->powernow_table = powernow_table; + print_basics(data); + + for (j = 0; j < data->numps; j++) + if ((pst[j].fid==data->currfid) && (pst[j].vid==data->currvid)) + return 0; + + dprintk(KERN_ERR PFX "currfid/vid do not match PST, ignoring\n"); return 0; } /* Find and validate the PSB/PST table in BIOS. */ -static inline int -find_psb_table(void) +static inline int find_psb_table(struct powernow_k8_data *data) { struct psb_s *psb; - struct pst_s *pst; - unsigned int i, j; + unsigned int i; u32 mvs; u8 maxvid; @@ -556,276 +593,427 @@ find_psb_table(void) if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0) continue; - dprintk(KERN_DEBUG PFX "found PSB header at 0x%p\n", psb); - - dprintk(KERN_DEBUG PFX "table vers: 0x%x\n", psb->tableversion); + dprintk(KERN_DEBUG PFX "found PSB header at %p\n", psb); + dprintk(KERN_DEBUG PFX "table version: %x\n", + psb->tableversion); if (psb->tableversion != PSB_VERSION_1_4) { - printk(KERN_INFO BFX "PSB table is not v1.4\n"); + printk(KERN_INFO PFX "PSB table is not v1.4\n"); return -ENODEV; } - dprintk(KERN_DEBUG PFX "flags: 0x%x\n", psb->flags1); + dprintk(KERN_DEBUG PFX "flags: %x\n", psb->flags1); if (psb->flags1) { - printk(KERN_ERR BFX "unknown flags\n"); + printk(KERN_ERR PFX "unknown flags\n"); return -ENODEV; } - vstable = psb->voltagestabilizationtime; - dprintk(KERN_DEBUG PFX "flags2: 0x%x\n", psb->flags2); - rvo = psb->flags2 & 3; - irt = ((psb->flags2) >> 2) & 3; + data->vstable = psb->voltagestabilizationtime; + dprintk(KERN_INFO PFX "voltage stabilization time: %d(*20us)\n", data->vstable); + + dprintk(KERN_DEBUG PFX "flags2: %x\n", psb->flags2); + + data->rvo = psb->flags2 & 3; + data->irt = ((psb->flags2) >> 2) & 3; mvs = ((psb->flags2) >> 4) & 3; - vidmvs = 1 << mvs; - batps = ((psb->flags2) >> 6) & 3; + data->vidmvs = 1 << mvs; + data->batps = ((psb->flags2) >> 6) & 3; - printk(KERN_INFO PFX "voltage stable in %d usec", vstable * 20); - if (batps) - printk(", only %d lowest states on battery", batps); - printk(", ramp voltage offset: %d", rvo); - printk(", isochronous relief time: %d", irt); - printk(", maximum voltage step: %d\n", mvs); + dprintk(KERN_INFO PFX "ramp voltage offset: %d\n", data->rvo); + dprintk(KERN_INFO PFX "isochronous relief time: %d\n", data->irt); + dprintk(KERN_INFO PFX "maximum voltage step: %d - %x\n", + mvs, data->vidmvs); dprintk(KERN_DEBUG PFX "numpst: 0x%x\n", psb->numpst); if (psb->numpst != 1) { - printk(KERN_ERR BFX "numpst must be 1\n"); + printk(KERN_ERR PFX "numpst must be 1\n"); return -ENODEV; } - dprintk(KERN_DEBUG PFX "cpuid: 0x%x\n", psb->cpuid); - - plllock = psb->plllocktime; - printk(KERN_INFO PFX "pll lock time: 0x%x, ", plllock); - + data->plllock = psb->plllocktime; + dprintk(KERN_INFO PFX "plllocktime: %x (units 1us)\n", + psb->plllocktime); + dprintk(KERN_INFO PFX "maxfid: %x\n", psb->maxfid); + dprintk(KERN_INFO PFX "maxvid: %x\n", psb->maxvid); maxvid = psb->maxvid; - printk("maxfid 0x%x (%d MHz), maxvid 0x%x\n", - psb->maxfid, find_freq_from_fid(psb->maxfid), maxvid); - numps = psb->numpstates; - if (numps < 2) { - printk(KERN_ERR BFX "no p states to transition\n"); - return -ENODEV; - } + data->numps = psb->numpstates; + dprintk(KERN_INFO PFX "numpstates: %x\n", data->numps); + return fill_powernow_table(data, (struct pst_s *)(psb+1), maxvid); + } - if (batps == 0) { - batps = numps; - } else if (batps > numps) { - printk(KERN_ERR BFX "batterypstates > numpstates\n"); - batps = numps; - } else { - printk(KERN_ERR PFX - "Restricting operation to %d p-states\n", batps); - printk(KERN_ERR PFX - "Check for an updated driver to access all " - "%d p-states\n", numps); - } + /* + * If you see this message, complain to BIOS manufacturer. If + * he tells you "we do not support Linux" or some similar + * nonsense, remember that Windows 2000 uses the same legacy + * mechanism that the old Linux PSB driver uses. Tell them it + * is broken with Windows 2000. + * + * The reference to the AMD documentation is chapter 9 in the + * BIOS and Kernel Developer's Guide, which is available on + * www.amd.com + */ + printk(KERN_ERR PFX "BIOS error - no PSB\n"); + return -ENODEV; +} - if (numps <= 1) { - printk(KERN_ERR PFX "only 1 p-state to transition\n"); - return -ENODEV; - } - pst = (struct pst_s *) (psb + 1); - if (check_pst_table(pst, maxvid)) - return -EINVAL; +#ifdef CONFIG_X86_POWERNOW_K8_ACPI - powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) * (numps + 1)), GFP_KERNEL); - if (!powernow_table) { - printk(KERN_ERR PFX "powernow_table memory alloc failure\n"); - return -ENOMEM; - } +static inline void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) +{ + if (!data->acpi_data.state_count) + return; - for (j = 0; j < psb->numpstates; j++) { - powernow_table[j].index = pst[j].fid; /* lower 8 bits */ - powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */ - } + data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK; + data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK; + data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; + data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK); + data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK; +} - /* If you want to override your frequency tables, this - is right place. */ +static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) +{ + int i; + int cntlofreq = 0; + struct cpufreq_frequency_table *powernow_table; - for (j = 0; j < numps; j++) { - powernow_table[j].frequency = find_freq_from_fid(powernow_table[j].index & 0xff)*1000; - printk(KERN_INFO PFX " %d : fid 0x%x (%d MHz), vid 0x%x\n", j, - powernow_table[j].index & 0xff, - powernow_table[j].frequency/1000, - powernow_table[j].index >> 8); + if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { + dprintk(KERN_DEBUG PFX "register performance failed\n"); + return -EIO; + } + + /* verify the data contained in the ACPI structures */ + if (data->acpi_data.state_count <= 1) { + dprintk(KERN_DEBUG PFX "No ACPI P-States\n"); + goto err_out; + } + + if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || + (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { + dprintk(KERN_DEBUG PFX "Invalid control/status registers\n"); + goto err_out; + } + + /* fill in data->powernow_table */ + powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) + * (data->acpi_data.state_count + 1)), GFP_KERNEL); + if (!powernow_table) { + dprintk(KERN_ERR PFX "powernow_table memory alloc failure\n"); + goto err_out; + } + + for (i = 0; i < data->acpi_data.state_count; i++) { + u32 fid = data->acpi_data.states[i].control & FID_MASK; + u32 vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK; + + dprintk(KERN_INFO PFX " %d : fid %x, vid %x\n", i, fid, vid); + + powernow_table[i].index = fid; /* lower 8 bits */ + powernow_table[i].index |= (vid << 8); /* upper 8 bits */ + powernow_table[i].frequency = find_khz_freq_from_fid(fid); + + /* verify frequency is OK */ + if ((powernow_table[i].frequency > (MAX_FREQ * 1000)) || + (powernow_table[i].frequency < (MIN_FREQ * 1000))) { + dprintk(KERN_INFO PFX "invalid freq %u kHz\n", powernow_table[i].frequency); + powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; + continue; } - powernow_table[numps].frequency = CPUFREQ_TABLE_END; - powernow_table[numps].index = 0; + /* verify only 1 entry from the lo frequency table */ + if ((fid < HI_FID_TABLE_BOTTOM) && (cntlofreq++)) { + printk(KERN_ERR PFX "Too many lo freq table entries\n"); + goto err_out; + } - if (query_current_values_with_pending_wait()) { - kfree(powernow_table); - return -EIO; + if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) { + printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n", + powernow_table[i].frequency, + (unsigned int) (data->acpi_data.states[i].core_frequency * 1000)); + powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; + continue; } + } + powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END; + powernow_table[data->acpi_data.state_count].index = 0; + data->powernow_table = powernow_table; + + /* fill in data */ + data->numps = data->acpi_data.state_count; + print_basics(data); - printk(KERN_INFO PFX "currfid 0x%x (%d MHz), currvid 0x%x\n", - currfid, find_freq_from_fid(currfid), currvid); + powernow_k8_acpi_pst_values(data, 0); - for (j = 0; j < numps; j++) - if ((pst[j].fid==currfid) && (pst[j].vid==currvid)) - return 0; + return 0; - printk(KERN_ERR BFX "currfid/vid do not match PST, ignoring\n"); - return 0; - } + err_out: + acpi_processor_unregister_performance(&data->acpi_data, data->cpu); + + /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */ + data->acpi_data.state_count = 0; - printk(KERN_ERR BFX "no PSB\n"); return -ENODEV; } +static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) +{ + if (data->acpi_data.state_count) + acpi_processor_unregister_performance(&data->acpi_data, data->cpu); +} + +#else +static inline int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; } +static inline void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; } +static inline void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; } +#endif /* CONFIG_X86_POWERNOW_K8_ACPI */ + + /* Take a frequency, and issue the fid/vid transition command */ -static inline int -transition_frequency(unsigned int index) +static inline int transition_frequency(struct powernow_k8_data *data, unsigned int index) { u32 fid; u32 vid; int res; struct cpufreq_freqs freqs; + dprintk(KERN_DEBUG PFX "cpu %d transition to index %u\n", + smp_processor_id(), index ); + /* fid are the lower 8 bits of the index we stored into * the cpufreq frequency table in find_psb_table, vid are * the upper 8 bits. */ - fid = powernow_table[index].index & 0xFF; - vid = (powernow_table[index].index & 0xFF00) >> 8; + fid = data->powernow_table[index].index & 0xFF; + vid = (data->powernow_table[index].index & 0xFF00) >> 8; - dprintk(KERN_DEBUG PFX "table matched fid 0x%x, giving vid 0x%x\n", - fid, vid); + dprintk(KERN_DEBUG PFX "matched fid %x, giving vid %x\n", fid, vid); - if (query_current_values_with_pending_wait()) + if (query_current_values_with_pending_wait(data)) return 1; - if ((currvid == vid) && (currfid == fid)) { - dprintk(KERN_DEBUG PFX - "target matches current values (fid 0x%x, vid 0x%x)\n", + if ((data->currvid == vid) && (data->currfid == fid)) { + dprintk(KERN_DEBUG PFX "target matches curr (fid %x, vid %x)\n", fid, vid); return 0; } - if ((fid < HI_FID_TABLE_BOTTOM) && (currfid < HI_FID_TABLE_BOTTOM)) { + if ((fid < HI_FID_TABLE_BOTTOM) && (data->currfid < HI_FID_TABLE_BOTTOM)) { printk(KERN_ERR PFX "ignoring illegal change in lo freq table-%x to %x\n", - currfid, fid); + data->currfid, fid); return 1; } - dprintk(KERN_DEBUG PFX "changing to fid 0x%x, vid 0x%x\n", fid, vid); - - freqs.cpu = 0; /* only true because SMP not supported */ + dprintk(KERN_DEBUG PFX "cpu %d, changing to fid %x, vid %x\n", + smp_processor_id(), fid, vid); + freqs.cpu = data->cpu; - freqs.old = find_freq_from_fid(currfid); - freqs.new = find_freq_from_fid(fid); + freqs.old = find_khz_freq_from_fid(data->currfid); + freqs.new = find_khz_freq_from_fid(fid); cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - res = transition_fid_vid(fid, vid); + down(&fidvid_sem); + res = transition_fid_vid(data, fid, vid); + up(&fidvid_sem); - freqs.new = find_freq_from_fid(currfid); + freqs.new = find_khz_freq_from_fid(data->currfid); cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); return res; } /* Driver entry point to switch to the target frequency */ -static int -powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) +static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) { - u32 checkfid = currfid; - u32 checkvid = currvid; + cpumask_t oldmask = CPU_MASK_ALL; + struct powernow_k8_data *data = powernow_data[pol->cpu]; + u32 checkfid = data->currfid; + u32 checkvid = data->currvid; unsigned int newstate; + int ret = -EIO; + + /* only run on specific CPU from here on */ + oldmask = current->cpus_allowed; + set_cpus_allowed(current, cpumask_of_cpu(pol->cpu)); + schedule(); + + if (smp_processor_id() != pol->cpu) { + printk(KERN_ERR "limiting to cpu %u failed\n", pol->cpu); + goto err_out; + } + + /* from this point, do not exit without restoring preempt and cpu */ + preempt_disable(); if (pending_bit_stuck()) { - printk(KERN_ERR PFX "drv targ fail: change pending bit set\n"); - return -EIO; + printk(KERN_ERR PFX "failing targ, change pending bit set\n"); + goto err_out; } - dprintk(KERN_DEBUG PFX "targ: %d kHz, min %d, max %d, relation %d\n", - targfreq, pol->min, pol->max, relation); + dprintk(KERN_DEBUG PFX "targ: cpu %d, %d kHz, min %d, max %d, relation %d\n", + pol->cpu, targfreq, pol->min, pol->max, relation); - if (query_current_values_with_pending_wait()) - return -EIO; + if (query_current_values_with_pending_wait(data)) { + ret = -EIO; + goto err_out; + } - dprintk(KERN_DEBUG PFX "targ: curr fid 0x%x, vid 0x%x\n", - currfid, currvid); + dprintk(KERN_DEBUG PFX "targ: curr fid %x, vid %x\n", + data->currfid, data->currvid); - if ((checkvid != currvid) || (checkfid != currfid)) { - printk(KERN_ERR PFX - "error - out of sync, fid 0x%x 0x%x, vid 0x%x 0x%x\n", - checkfid, currfid, checkvid, currvid); + if ((checkvid != data->currvid) || (checkfid != data->currfid)) { + printk(KERN_ERR PFX "out of sync, fid %x %x, vid %x %x\n", + checkfid, data->currfid, checkvid, data->currvid); } - if (cpufreq_frequency_table_target(pol, powernow_table, targfreq, relation, &newstate)) - return -EINVAL; - - if (transition_frequency(newstate)) + if (cpufreq_frequency_table_target(pol, data->powernow_table, targfreq, relation, &newstate)) + goto err_out; + + powernow_k8_acpi_pst_values(data, newstate); + + if (transition_frequency(data, newstate)) { printk(KERN_ERR PFX "transition frequency failed\n"); - return 1; + ret = 1; + goto err_out; } - pol->cur = 1000 * find_freq_from_fid(currfid); + pol->cur = find_khz_freq_from_fid(data->currfid); + ret = 0; - return 0; + err_out: + preempt_enable_no_resched(); + set_cpus_allowed(current, oldmask); + schedule(); + + return ret; } /* Driver entry point to verify the policy and range of frequencies */ -static int -powernowk8_verify(struct cpufreq_policy *pol) +static int powernowk8_verify(struct cpufreq_policy *pol) { - if (pending_bit_stuck()) { - printk(KERN_ERR PFX "failing verify, change pending bit set\n"); - return -EIO; - } + struct powernow_k8_data *data = powernow_data[pol->cpu]; - return cpufreq_frequency_table_verify(pol, powernow_table); + return cpufreq_frequency_table_verify(pol, data->powernow_table); } /* per CPU init entry point to the driver */ -static int __init -powernowk8_cpu_init(struct cpufreq_policy *pol) +static int __init powernowk8_cpu_init(struct cpufreq_policy *pol) { - if (pol->cpu != 0) { - printk(KERN_ERR PFX "init not cpu 0\n"); + cpumask_t oldmask = CPU_MASK_ALL; + int rc; + struct powernow_k8_data *data; + + if (!check_supported_cpu(pol->cpu)) return -ENODEV; + + data = kmalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); + if (!data) { + printk(KERN_ERR PFX "unable to alloc powernow_k8_data"); + return -ENOMEM; + } + memset(data,0,sizeof(struct powernow_k8_data)); + + data->cpu = pol->cpu; + + if (powernow_k8_cpu_init_acpi(data)) { + /* + * Use the PSB BIOS structure. This is only availabe on + * an UP version, and is deprecated by AMD. + */ + + if (pol->cpu != 0) { + printk(KERN_ERR PFX "init - cpu 0\n"); + kfree(data); + return -ENODEV; + } + + if ((num_online_cpus() != 1) || (num_possible_cpus() != 1)) { + printk(KERN_INFO PFX "MP systems not supported by PSB BIOS structure\n"); + kfree(data); + return 0; + } + + rc = find_psb_table(data); + if (rc) { + kfree(data); + return -ENODEV; + } } + /* only run on specific CPU from here on */ + oldmask = current->cpus_allowed; + set_cpus_allowed(current, cpumask_of_cpu(pol->cpu)); + schedule(); + + if (smp_processor_id() != pol->cpu) { + printk(KERN_ERR "limiting to cpu %u failed\n", pol->cpu); + goto err_out; + } + + if (pending_bit_stuck()) { + printk(KERN_ERR PFX "failing init, change pending bit set\n"); + goto err_out; + } + + if (query_current_values_with_pending_wait(data)) { + goto err_out; + } + + fidvid_msr_init(); + + + /* run on any CPU again */ + set_cpus_allowed(current, oldmask); + schedule(); + pol->governor = CPUFREQ_DEFAULT_GOVERNOR; /* Take a crude guess here. - * That guess was in microseconds, so multply with 1000 */ - pol->cpuinfo.transition_latency = (((rvo + 8) * vstable * VST_UNITS_20US) - + (3 * (1 << irt) * 10)) * 1000; - - if (query_current_values_with_pending_wait()) - return -EIO; + * That guess was in microseconds, so multiply with 1000 */ + pol->cpuinfo.transition_latency = (((data->rvo + 8) * data->vstable * VST_UNITS_20US) + + (3 * (1 << data->irt) * 10)) * 1000; - pol->cur = 1000 * find_freq_from_fid(currfid); + pol->cur = find_khz_freq_from_fid(data->currfid); dprintk(KERN_DEBUG PFX "policy current frequency %d kHz\n", pol->cur); /* min/max the cpu is capable of */ - if (cpufreq_frequency_table_cpuinfo(pol, powernow_table)) { + if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { printk(KERN_ERR PFX "invalid powernow_table\n"); - kfree(powernow_table); + kfree(data->powernow_table); + kfree(data); return -EINVAL; } + cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); - cpufreq_frequency_table_get_attr(powernow_table, pol->cpu); + dprintk(KERN_INFO PFX "init, curr fid %x vid %x\n", data->currfid, data->currvid); - printk(KERN_INFO PFX "cpu_init done, current fid 0x%x, vid 0x%x\n", - currfid, currvid); + powernow_data[pol->cpu] = data; return 0; + + err_out: + set_cpus_allowed(current, oldmask); + schedule(); + + kfree(data); + return -ENODEV; } static int __exit powernowk8_cpu_exit (struct cpufreq_policy *pol) { - if (pol->cpu != 0) + struct powernow_k8_data *data = powernow_data[pol->cpu]; + + if (!data) return -EINVAL; + powernow_k8_cpu_exit_acpi(data); + cpufreq_frequency_table_put_attr(pol->cpu); - if (powernow_table) - kfree(powernow_table); + kfree(data->powernow_table); + kfree(data); return 0; } @@ -845,33 +1033,31 @@ static struct cpufreq_driver cpufreq_amd .attr = powernow_k8_attr, }; - /* driver entry point for init */ -static int __init -powernowk8_init(void) +static int __init powernowk8_init(void) { - int rc; - - if (check_supported_cpu() == 0) - return -ENODEV; + unsigned int i, supported_cpus = 0; - rc = find_psb_table(); - if (rc) - return rc; + for (i=0; itarget() call + * when ACPI is used */ + u32 rvo; /* ramp voltage offset */ + u32 irt; /* isochronous relief time */ + u32 vidmvs; /* usable value calculated from mvs */ + u32 vstable; /* voltage stabilization time, units 20 us */ + u32 plllock; /* pll lock time, units 1 us */ + + /* keep track of the current fid / vid */ + u32 currvid; + u32 currfid; + + /* the powernow_table includes all frequency and vid/fid pairings: + * fid are the lower 8 bits of the index, vid are the upper 8 bits. + * frequency is in kHz */ + struct cpufreq_frequency_table *powernow_table; + +#ifdef CONFIG_X86_POWERNOW_K8_ACPI + /* the acpi table needs to be kept. it's only available if ACPI was + * used to determine valid frequency/vid/fid states */ + struct acpi_processor_performance acpi_data; +#endif +}; + /* processor's cpuid instruction support */ -#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ -#define CPUID_F1_FAM 0x00000f00 /* family mask */ -#define CPUID_F1_XFAM 0x0ff00000 /* extended family mask */ -#define CPUID_F1_MOD 0x000000f0 /* model mask */ -#define CPUID_F1_STEP 0x0000000f /* stepping level mask */ -#define CPUID_XFAM_MOD 0x0ff00ff0 /* xtended fam, fam + model */ -#define ATHLON64_XFAM_MOD 0x00000f40 /* xtended fam, fam + model */ -#define OPTERON_XFAM_MOD 0x00000f50 /* xtended fam, fam + model */ -#define ATHLON64_REV_C0 8 +#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ +#define CPUID_XFAM_MOD 0x0ff00ff0 /* extended fam, fam + model */ +#define ATHLON64_XFAM_MOD 0x00000f40 /* extended fam, fam + model */ +#define OPTERON_XFAM_MOD 0x00000f50 /* extended fam, fam + model */ #define CPUID_GET_MAX_CAPABILITIES 0x80000000 #define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 #define P_STATE_TRANSITION_CAPABLE 6 @@ -23,7 +49,6 @@ /* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */ /* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */ /* the register number is placed in ecx, and the data is returned in edx:eax. */ - #define MSR_FIDVID_CTL 0xc0010041 #define MSR_FIDVID_STATUS 0xc0010042 @@ -47,10 +72,24 @@ #define MSR_S_HI_MAX_WORKING_VID 0x001f0000 #define MSR_S_HI_START_VID 0x00001f00 #define MSR_S_HI_CURRENT_VID 0x0000001f +#define MSR_C_HI_STP_GNT_BENIGN 0x00000001 + +/* + There are restrictions frequencies have to follow: + - only 1 entry in the low fid table ( <=1.4GHz ) + - lowest entry in the high fid table must be >= 2 * the + entry in the low fid table + - lowest entry in the high fid table must be a <= 200MHz + + 2 * the entry in the low fid table + - the parts can only step at 200 MHz intervals, so 1.9 GHz is + never valid + - lowest frequency must be >= interprocessor hypertransport link + speed (only applies to MP systems obviously) + */ /* fids (frequency identifiers) are arranged in 2 tables - lo and hi */ -#define LO_FID_TABLE_TOP 6 -#define HI_FID_TABLE_BOTTOM 8 +#define LO_FID_TABLE_TOP 6 /* fid values marking the boundary */ +#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */ #define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */ #define HI_VCOFREQ_TABLE_BOTTOM 1600 @@ -58,33 +97,44 @@ #define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */ #define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */ - #define LEAST_VID 0x1e /* Lowest (numerically highest) useful vid value */ - #define MIN_FREQ 800 /* Min and max freqs, per spec */ #define MAX_FREQ 5000 -#define INVALID_FID_MASK 0xffffffc1 /* not a valid fid if these bits are set */ - -#define INVALID_VID_MASK 0xffffffe0 /* not a valid vid if these bits are set */ +#define INVALID_FID_MASK 0xffffffc1 /* not a valid fid if these bits are set */ +#define INVALID_VID_MASK 0xffffffe0 /* not a valid vid if these bits are set */ #define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */ - #define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */ - #define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */ +#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */ -#define VST_UNITS_20US 20 /* Voltage Stabalization Time is in units of 20us */ +/* + * Most values of interest are enocoded in a single field of the _PSS + * entries: the "control" value. + */ + +#define IRT_SHIFT 30 +#define RVO_SHIFT 28 +#define PLL_L_SHIFT 20 +#define MVS_SHIFT 18 +#define VST_SHIFT 11 +#define VID_SHIFT 6 +#define IRT_MASK 3 +#define RVO_MASK 3 +#define PLL_L_MASK 0x7f +#define MVS_MASK 3 +#define VST_MASK 0x7f +#define VID_MASK 0x1f +#define FID_MASK 0x3f /* -Version 1.4 of the PSB table. This table is constructed by BIOS and is -to tell the OS's power management driver which VIDs and FIDs are -supported by this particular processor. This information is obtained from -the data sheets for each processor model by the system vendor and -incorporated into the BIOS. -If the data in the PSB / PST is wrong, then this driver will program the -wrong values into hardware, which is very likely to lead to a crash. -*/ + * Version 1.4 of the PSB table. This table is constructed by BIOS and is + * to tell the OS's power management driver which VIDs and FIDs are + * supported by this particular processor. + * If the data in the PSB / PST is wrong, then this driver will program the + * wrong values into hardware, which is very likely to lead to a crash. + */ #define PSB_ID_STRING "AMDK7PNOW!" #define PSB_ID_STRING_LEN 10 @@ -117,6 +167,7 @@ struct pst_s { #define dprintk(msg...) do { } while(0) #endif -static inline int core_voltage_pre_transition(u32 reqvid); -static inline int core_voltage_post_transition(u32 reqvid); -static inline int core_frequency_transition(u32 reqfid); +static inline int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid); +static inline int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); +static inline int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); +static inline void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index); --- linux-2.6.5-rc3/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c 2004-01-09 00:04:30.000000000 -0800 +++ 25/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c 2004-03-30 20:21:49.000000000 -0800 @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -46,7 +47,9 @@ struct cpu_model }; /* Operating points for current CPU */ -static const struct cpu_model *centrino_model; +static struct cpu_model *centrino_model; + +#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE /* Computes the correct form for IA32_PERF_CTL MSR for a particular frequency/voltage operating point; frequency in MHz, volts in mV. @@ -172,7 +175,7 @@ static struct cpufreq_frequency_table op /* CPU models, their operating frequency range, and freq/voltage operating points */ -static const struct cpu_model models[] = +static struct cpu_model models[] = { _CPU( 900, " 900"), CPU(1000), @@ -187,6 +190,48 @@ static const struct cpu_model models[] = }; #undef CPU +static int centrino_cpu_init_table(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *cpu = &cpu_data[policy->cpu]; + struct cpu_model *model; + + if (!cpu_has(cpu, X86_FEATURE_EST)) + return -ENODEV; + + /* Only Intel Pentium M stepping 5 for now - add new CPUs as + they appear after making sure they use PERF_CTL in the same + way. */ + if (cpu->x86_vendor != X86_VENDOR_INTEL || + cpu->x86 != 6 || + cpu->x86_model != 9 || + cpu->x86_mask != 5) { + printk(KERN_INFO PFX "found unsupported CPU with Enhanced SpeedStep: " + "send /proc/cpuinfo to " MAINTAINER "\n"); + return -ENODEV; + } + + for(model = models; model->model_name != NULL; model++) + if (strcmp(cpu->x86_model_id, model->model_name) == 0) + break; + if (model->model_name == NULL) { + printk(KERN_INFO PFX "no support for CPU model \"%s\": " + "send /proc/cpuinfo to " MAINTAINER "\n", + cpu->x86_model_id); + return -ENOENT; + } + + centrino_model = model; + + printk(KERN_INFO PFX "found \"%s\": max frequency: %dkHz\n", + model->model_name, model->max_freq); + + return 0; +} + +#else +static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; } +#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ + /* Extract clock in kHz from PERF_CTL value */ static unsigned extract_clock(unsigned msr) { @@ -203,13 +248,148 @@ static unsigned get_cur_freq(void) return extract_clock(l); } + +#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI + +static struct acpi_processor_performance p; + +#include +#include + +#define ACPI_PDC_CAPABILITY_ENHANCED_SPEEDSTEP 0x1 + +/* + * centrino_cpu_init_acpi - register with ACPI P-States library + * + * Register with the ACPI P-States library (part of drivers/acpi/processor.c) + * in order to determine correct frequency and voltage pairings by reading + * the _PSS of the ACPI DSDT or SSDT tables. + */ +static int centrino_cpu_init_acpi(struct cpufreq_policy *policy) +{ + union acpi_object arg0 = {ACPI_TYPE_BUFFER}; + u32 arg0_buf[3]; + struct acpi_object_list arg_list = {1, &arg0}; + unsigned long cur_freq; + int result = 0, i; + + /* _PDC settings */ + arg0.buffer.length = 12; + arg0.buffer.pointer = (u8 *) arg0_buf; + arg0_buf[0] = ACPI_PDC_REVISION_ID; + arg0_buf[1] = 1; + arg0_buf[2] = ACPI_PDC_CAPABILITY_ENHANCED_SPEEDSTEP; + + p.pdc = &arg_list; + + /* register with ACPI core */ + if (acpi_processor_register_performance(&p, 0)) + return -EIO; + + /* verify the acpi_data */ + if (p.state_count <= 1) { + printk(KERN_DEBUG "No P-States\n"); + result = -ENODEV; + goto err_unreg; + } + + if ((p.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || + (p.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { + printk(KERN_DEBUG "Invalid control/status registers\n"); + result = -EIO; + goto err_unreg; + } + + for (i=0; imodel_name=NULL; + centrino_model->max_freq = p.states[0].core_frequency * 1000; + centrino_model->op_points = kmalloc(sizeof(struct cpufreq_frequency_table) * + (p.state_count + 1), GFP_KERNEL); + if (!centrino_model->op_points) { + result = -ENOMEM; + goto err_kfree; + } + + cur_freq = get_cur_freq(); + + for (i=0; iop_points[i].index = p.states[i].control; + centrino_model->op_points[i].frequency = p.states[i].core_frequency * 1000; + if (cur_freq == centrino_model->op_points[i].frequency) + p.state = i; + } + centrino_model->op_points[p.state_count].frequency = CPUFREQ_TABLE_END; + + return 0; + + err_kfree: + kfree(centrino_model); + err_unreg: + acpi_processor_unregister_performance(&p, 0); + return (result); +} +#else +static inline int centrino_cpu_init_acpi(struct cpufreq_policy *policy) { return -ENODEV; } +#endif + static int centrino_cpu_init(struct cpufreq_policy *policy) { unsigned freq; + unsigned l, h; + int ret; - if (policy->cpu != 0 || centrino_model == NULL) + if (policy->cpu != 0) return -ENODEV; + if (centrino_cpu_init_acpi(policy)) { + if (centrino_cpu_init_table(policy)) { + return -ENODEV; + } + } + + /* Check to see if Enhanced SpeedStep is enabled, and try to + enable it if not. */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + + if (!(l & (1<<16))) { + l |= (1<<16); + wrmsr(MSR_IA32_MISC_ENABLE, l, h); + + /* check to see if it stuck */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + if (!(l & (1<<16))) { + printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); + return -ENODEV; + } + } + freq = get_cur_freq(); policy->governor = CPUFREQ_DEFAULT_GOVERNOR; @@ -219,7 +399,33 @@ static int centrino_cpu_init(struct cpuf dprintk(KERN_INFO PFX "centrino_cpu_init: policy=%d cur=%dkHz\n", policy->policy, policy->cur); - return cpufreq_frequency_table_cpuinfo(policy, centrino_model->op_points); + ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model->op_points); + if (ret) + return (ret); + + cpufreq_frequency_table_get_attr(centrino_model->op_points, policy->cpu); + + return 0; +} + +static int centrino_cpu_exit(struct cpufreq_policy *policy) +{ + if (!centrino_model) + return -ENODEV; + + cpufreq_frequency_table_put_attr(policy->cpu); + +#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI + if (!centrino_model->model_name) { + acpi_processor_unregister_performance(&p, 0); + kfree(centrino_model->op_points); + kfree(centrino_model); + } +#endif + + centrino_model = NULL; + + return 0; } /** @@ -295,12 +501,19 @@ static int centrino_target (struct cpufr return 0; } +static struct freq_attr* centrino_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + static struct cpufreq_driver centrino_driver = { .name = "centrino", /* should be speedstep-centrino, but there's a 16 char limit */ .init = centrino_cpu_init, + .exit = centrino_cpu_exit, .verify = centrino_verify, .target = centrino_target, + .attr = centrino_attr, .owner = THIS_MODULE, }; @@ -322,55 +535,10 @@ static struct cpufreq_driver centrino_dr static int __init centrino_init(void) { struct cpuinfo_x86 *cpu = cpu_data; - const struct cpu_model *model; - unsigned l, h; if (!cpu_has(cpu, X86_FEATURE_EST)) return -ENODEV; - /* Only Intel Pentium M stepping 5 for now - add new CPUs as - they appear after making sure they use PERF_CTL in the same - way. */ - if (cpu->x86_vendor != X86_VENDOR_INTEL || - cpu->x86 != 6 || - cpu->x86_model != 9 || - cpu->x86_mask != 5) { - printk(KERN_INFO PFX "found unsupported CPU with Enhanced SpeedStep: " - "send /proc/cpuinfo to " MAINTAINER "\n"); - return -ENODEV; - } - - /* Check to see if Enhanced SpeedStep is enabled, and try to - enable it if not. */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - - if (!(l & (1<<16))) { - l |= (1<<16); - wrmsr(MSR_IA32_MISC_ENABLE, l, h); - - /* check to see if it stuck */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - if (!(l & (1<<16))) { - printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); - return -ENODEV; - } - } - - for(model = models; model->model_name != NULL; model++) - if (strcmp(cpu->x86_model_id, model->model_name) == 0) - break; - if (model->model_name == NULL) { - printk(KERN_INFO PFX "no support for CPU model \"%s\": " - "send /proc/cpuinfo to " MAINTAINER "\n", - cpu->x86_model_id); - return -ENOENT; - } - - centrino_model = model; - - printk(KERN_INFO PFX "found \"%s\": max frequency: %dkHz\n", - model->model_name, model->max_freq); - return cpufreq_register_driver(¢rino_driver); } @@ -383,5 +551,5 @@ MODULE_AUTHOR ("Jeremy Fitzhardinge #include +#include #include #include #include @@ -30,6 +31,12 @@ #define dprintk(msg...) do { } while(0) #endif +#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK +static int relaxed_check = 0; +#else +#define relaxed_check 0 +#endif + /********************************************************************* * GET PROCESSOR CORE SPEED IN KHZ * *********************************************************************/ @@ -120,7 +127,7 @@ static unsigned int pentiumM_get_frequen msr_tmp = (msr_lo >> 22) & 0x1f; dprintk(KERN_DEBUG "speedstep-lib: bits 22-26 are 0x%x\n", msr_tmp); - return (msr_tmp * 100 * 10000); + return (msr_tmp * 100 * 1000); } @@ -265,6 +272,7 @@ unsigned int speedstep_detect_processor ebx = cpuid_ebx(0x00000001); ebx &= 0x000000FF; + if (ebx != 0x06) return 0; @@ -292,7 +300,7 @@ unsigned int speedstep_detect_processor */ rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi); dprintk(KERN_DEBUG "cpufreq: Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n", msr_lo, msr_hi); - if ((msr_hi & (1<<18)) && (msr_hi & (3<<24))) { + if ((msr_hi & (1<<18)) && (relaxed_check ? 1 : (msr_hi & (3<<24)))) { if (c->x86_mask == 0x01) return SPEEDSTEP_PROCESSOR_PIII_C_EARLY; else @@ -362,6 +370,11 @@ unsigned int speedstep_get_freqs(unsigne } EXPORT_SYMBOL_GPL(speedstep_get_freqs); +#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK +module_param(relaxed_check, int, 0444); +MODULE_PARM_DESC(relaxed_check, "Don't do all checks for speedstep capability."); +#endif + MODULE_AUTHOR ("Dominik Brodowski "); MODULE_DESCRIPTION ("Library for Intel SpeedStep 1 or 2 cpufreq drivers."); MODULE_LICENSE ("GPL"); --- linux-2.6.5-rc3/arch/i386/kernel/cpu/intel.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/i386/kernel/cpu/intel.c 2004-03-31 00:56:35.427287176 -0800 @@ -10,6 +10,7 @@ #include #include #include +#include #include "cpu.h" @@ -19,8 +20,6 @@ #include #endif -extern int trap_init_f00f_bug(void); - #ifdef CONFIG_X86_INTEL_USERCOPY /* * Alignment at which movsl is preferred for bulk memory copies. @@ -148,30 +147,34 @@ static void __init Intel_errata_workarou } -static void __init init_intel(struct cpuinfo_x86 *c) -{ - char *p = NULL; - unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ - #ifdef CONFIG_X86_F00F_BUG +static int __init check_f00f(void) +{ /* * All current models of Pentium and Pentium with MMX technology CPUs * have the F0 0F bug, which lets nonprivileged users lock up the system. - * Note that the workaround only should be initialized once... + * + * Only the boot cpu is checked - there are no mixed Pentium and P6 systems. */ - c->f00f_bug = 0; - if ( c->x86 == 5 ) { - static int f00f_workaround_enabled = 0; - - c->f00f_bug = 1; - if ( !f00f_workaround_enabled ) { - trap_init_f00f_bug(); - printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); - f00f_workaround_enabled = 1; - } + boot_cpu_data.f00f_bug = 0; + if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 5 ) { + boot_cpu_data.f00f_bug = 1; + + trap_init_virtual_IDT(); + printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); } + return 0; +} +__initcall(check_f00f); #endif + +static void __init init_intel(struct cpuinfo_x86 *c) +{ + char *p = NULL; + unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ + select_idle_routine(c); if (c->cpuid_level > 1) { /* supports eax=2 call */ @@ -250,6 +253,12 @@ static void __init init_intel(struct cpu /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) clear_bit(X86_FEATURE_SEP, c->x86_capability); + /* + * FIXME: SEP is disabled for 4G/4G for now: + */ +#ifdef CONFIG_X86_HIGH_ENTRY + clear_bit(X86_FEATURE_SEP, c->x86_capability); +#endif /* Names for the Pentium II/Celeron processors detectable only by also checking the cache size. --- linux-2.6.5-rc3/arch/i386/kernel/doublefault.c 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/kernel/doublefault.c 2004-03-31 00:56:34.384445712 -0800 @@ -7,12 +7,13 @@ #include #include #include +#include #define DOUBLEFAULT_STACKSIZE (1024) static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; #define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) -#define ptr_ok(x) ((x) > 0xc0000000 && (x) < 0xc1000000) +#define ptr_ok(x) (((x) > __PAGE_OFFSET && (x) < (__PAGE_OFFSET + 0x01000000)) || ((x) >= FIXADDR_START)) static void doublefault_fn(void) { @@ -38,8 +39,8 @@ static void doublefault_fn(void) printk("eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", t->eax, t->ebx, t->ecx, t->edx); - printk("esi = %08lx, edi = %08lx\n", - t->esi, t->edi); + printk("esi = %08lx, edi = %08lx, ebp = %08lx\n", + t->esi, t->edi, t->ebp); } } --- linux-2.6.5-rc3/arch/i386/kernel/entry.S 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/i386/kernel/entry.S 2004-03-31 00:56:34.387445256 -0800 @@ -43,11 +43,25 @@ #include #include #include +#include #include #include +#include #include #include #include "irq_vectors.h" + /* We do not recover from a stack overflow, but at least + * we know it happened and should be able to track it down. + */ +#ifdef CONFIG_STACK_OVERFLOW_TEST +#define STACK_OVERFLOW_TEST \ + testl $(THREAD_SIZE - 512),%esp; \ + jnz 10f; \ + call stack_overflow; \ +10: +#else +#define STACK_OVERFLOW_TEST +#endif #define nr_syscalls ((syscall_table_size)/4) @@ -87,7 +101,102 @@ TSS_ESP0_OFFSET = (4 - 0x200) #define resume_kernel restore_all #endif -#define SAVE_ALL \ +#ifdef CONFIG_X86_HIGH_ENTRY + +#ifdef CONFIG_X86_SWITCH_PAGETABLES + +#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) +/* + * If task is preempted in __SWITCH_KERNELSPACE, and moved to another cpu, + * __switch_to repoints %esp to the appropriate virtual stack; but %ebp is + * left stale, so we must check whether to repeat the real stack calculation. + */ +#define repeat_if_esp_changed \ + xorl %esp, %ebp; \ + testl $-THREAD_SIZE, %ebp; \ + jnz 0b +#else +#define repeat_if_esp_changed +#endif + +/* clobbers ebx, edx and ebp */ + +#define __SWITCH_KERNELSPACE \ + cmpl $0xff000000, %esp; \ + jb 1f; \ + \ + /* \ + * switch pagetables and load the real stack, \ + * keep the stack offset: \ + */ \ + \ + movl $swapper_pg_dir-__PAGE_OFFSET, %edx; \ + \ + /* GET_THREAD_INFO(%ebp) intermixed */ \ +0: \ + movl %esp, %ebp; \ + movl %esp, %ebx; \ + andl $(-THREAD_SIZE), %ebp; \ + andl $(THREAD_SIZE-1), %ebx; \ + orl TI_real_stack(%ebp), %ebx; \ + repeat_if_esp_changed; \ + \ + movl %edx, %cr3; \ + movl %ebx, %esp; \ +1: + +#endif + + +#define __SWITCH_USERSPACE \ + /* interrupted any of the user return paths? */ \ + \ + movl EIP(%esp), %eax; \ + \ + cmpl $int80_ret_start_marker, %eax; \ + jb 33f; /* nope - continue with sysexit check */\ + cmpl $int80_ret_end_marker, %eax; \ + jb 22f; /* yes - switch to virtual stack */ \ +33: \ + cmpl $sysexit_ret_start_marker, %eax; \ + jb 44f; /* nope - continue with user check */ \ + cmpl $sysexit_ret_end_marker, %eax; \ + jb 22f; /* yes - switch to virtual stack */ \ + /* return to userspace? */ \ +44: \ + movl EFLAGS(%esp),%ecx; \ + movb CS(%esp),%cl; \ + testl $(VM_MASK | 3),%ecx; \ + jz 2f; \ +22: \ + /* \ + * switch to the virtual stack, then switch to \ + * the userspace pagetables. \ + */ \ + \ + GET_THREAD_INFO(%ebp); \ + movl TI_virtual_stack(%ebp), %edx; \ + movl TI_user_pgd(%ebp), %ecx; \ + \ + movl %esp, %ebx; \ + andl $(THREAD_SIZE-1), %ebx; \ + orl %ebx, %edx; \ +int80_ret_start_marker: \ + movl %edx, %esp; \ + movl %ecx, %cr3; \ + \ + __RESTORE_ALL; \ +int80_ret_end_marker: \ +2: + +#else /* !CONFIG_X86_HIGH_ENTRY */ + +#define __SWITCH_KERNELSPACE +#define __SWITCH_USERSPACE + +#endif + +#define __SAVE_ALL \ cld; \ pushl %es; \ pushl %ds; \ @@ -102,7 +211,7 @@ TSS_ESP0_OFFSET = (4 - 0x200) movl %edx, %ds; \ movl %edx, %es; -#define RESTORE_INT_REGS \ +#define __RESTORE_INT_REGS \ popl %ebx; \ popl %ecx; \ popl %edx; \ @@ -111,29 +220,28 @@ TSS_ESP0_OFFSET = (4 - 0x200) popl %ebp; \ popl %eax -#define RESTORE_REGS \ - RESTORE_INT_REGS; \ -1: popl %ds; \ -2: popl %es; \ +#define __RESTORE_REGS \ + __RESTORE_INT_REGS; \ +111: popl %ds; \ +222: popl %es; \ .section .fixup,"ax"; \ -3: movl $0,(%esp); \ - jmp 1b; \ -4: movl $0,(%esp); \ - jmp 2b; \ +444: movl $0,(%esp); \ + jmp 111b; \ +555: movl $0,(%esp); \ + jmp 222b; \ .previous; \ .section __ex_table,"a";\ .align 4; \ - .long 1b,3b; \ - .long 2b,4b; \ + .long 111b,444b;\ + .long 222b,555b;\ .previous - -#define RESTORE_ALL \ - RESTORE_REGS \ +#define __RESTORE_ALL \ + __RESTORE_REGS \ addl $4, %esp; \ -1: iret; \ +333: iret; \ .section .fixup,"ax"; \ -2: sti; \ +666: sti; \ movl $(__USER_DS), %edx; \ movl %edx, %ds; \ movl %edx, %es; \ @@ -142,10 +250,19 @@ TSS_ESP0_OFFSET = (4 - 0x200) .previous; \ .section __ex_table,"a";\ .align 4; \ - .long 1b,2b; \ + .long 333b,666b;\ .previous +#define SAVE_ALL \ + __SAVE_ALL; \ + __SWITCH_KERNELSPACE; \ + STACK_OVERFLOW_TEST; + +#define RESTORE_ALL \ + __SWITCH_USERSPACE; \ + __RESTORE_ALL; +.section .entry.text,"ax" ENTRY(lcall7) pushfl # We get a different stack layout with call @@ -163,7 +280,7 @@ do_lcall: movl %edx,EIP(%ebp) # Now we move them to their "normal" places movl %ecx,CS(%ebp) # GET_THREAD_INFO_WITH_ESP(%ebp) # GET_THREAD_INFO - movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution domain + movl TI_exec_domain(%ebp), %edx # Get the execution domain call *4(%edx) # Call the lcall7 handler for the domain addl $4, %esp popl %eax @@ -208,7 +325,7 @@ ENTRY(resume_userspace) cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret - movl TI_FLAGS(%ebp), %ecx + movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done on # int/exception return? jne work_pending @@ -216,18 +333,18 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) - cmpl $0,TI_PRE_COUNT(%ebp) # non-zero preempt_count ? + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_all need_resched: - movl TI_FLAGS(%ebp), %ecx # need_resched set ? + movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl jz restore_all testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all - movl $PREEMPT_ACTIVE,TI_PRE_COUNT(%ebp) + movl $PREEMPT_ACTIVE,TI_preempt_count(%ebp) sti call schedule - movl $0,TI_PRE_COUNT(%ebp) + movl $0,TI_preempt_count(%ebp) cli jmp need_resched #endif @@ -246,38 +363,50 @@ sysenter_past_esp: pushl $(__USER_CS) pushl $SYSENTER_RETURN -/* - * Load the potential sixth argument from user stack. - * Careful about security. - */ - cmpl $__PAGE_OFFSET-3,%ebp - jae syscall_fault -1: movl (%ebp),%ebp -.section __ex_table,"a" - .align 4 - .long 1b,syscall_fault -.previous - pushl %eax SAVE_ALL GET_THREAD_INFO(%ebp) cmpl $(nr_syscalls), %eax jae syscall_badsys - testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp) + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) jnz syscall_trace_entry call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) cli - movl TI_FLAGS(%ebp), %ecx + movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work + +#ifdef CONFIG_X86_SWITCH_PAGETABLES + + GET_THREAD_INFO(%ebp) + movl TI_virtual_stack(%ebp), %edx + movl TI_user_pgd(%ebp), %ecx + movl %esp, %ebx + andl $(THREAD_SIZE-1), %ebx + orl %ebx, %edx +sysexit_ret_start_marker: + movl %edx, %esp + movl %ecx, %cr3 +#endif + /* + * only ebx is not restored by the userspace sysenter vsyscall + * code, it assumes it to be callee-saved. + */ + movl EBX(%esp), %ebx + /* if something modifies registers it must also disable sysexit */ + movl EIP(%esp), %edx movl OLDESP(%esp), %ecx + sti sysexit - +#ifdef CONFIG_X86_SWITCH_PAGETABLES +sysexit_ret_end_marker: + nop +#endif # system call handler stub ENTRY(system_call) @@ -287,7 +416,7 @@ ENTRY(system_call) cmpl $(nr_syscalls), %eax jae syscall_badsys # system call tracing in operation - testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp) + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) jnz syscall_trace_entry syscall_call: call *sys_call_table(,%eax,4) @@ -296,10 +425,23 @@ syscall_exit: cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret - movl TI_FLAGS(%ebp), %ecx + movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work restore_all: +#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS + movl EFLAGS(%esp), %eax # mix EFLAGS and CS + movb CS(%esp), %al + testl $(VM_MASK | 3), %eax + jz resume_kernelX # returning to kernel or vm86-space + + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? + jz resume_kernelX + + int $3 + +resume_kernelX: +#endif RESTORE_ALL # perform work that needs to be done immediately before resumption @@ -312,7 +454,7 @@ work_resched: cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret - movl TI_FLAGS(%ebp), %ecx + movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? jz restore_all @@ -327,6 +469,22 @@ work_notifysig: # deal with pending s # vm86-space xorl %edx, %edx call do_notify_resume + +#if CONFIG_X86_HIGH_ENTRY + /* + * Reload db7 if necessary: + */ + movl TI_flags(%ebp), %ecx + testb $_TIF_DB7, %cl + jnz work_db7 + + jmp restore_all + +work_db7: + movl TI_task(%ebp), %edx; + movl task_thread_db7(%edx), %edx; + movl %edx, %db7; +#endif jmp restore_all ALIGN @@ -354,7 +512,7 @@ syscall_trace_entry: # perform syscall exit tracing ALIGN syscall_exit_work: - testb $_TIF_SYSCALL_TRACE, %cl + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT), %cl jz work_pending sti # could let do_syscall_trace() call # schedule() instead @@ -382,7 +540,7 @@ syscall_badsys: */ .data ENTRY(interrupt) -.text +.previous vector=0 ENTRY(irq_entries_start) @@ -392,7 +550,7 @@ ENTRY(irq_entries_start) jmp common_interrupt .data .long 1b -.text +.previous vector=vector+1 .endr @@ -433,12 +591,17 @@ error_code: movl ES(%esp), %edi # get the function address movl %eax, ORIG_EAX(%esp) movl %ecx, ES(%esp) - movl %esp, %edx pushl %esi # push the error code - pushl %edx # push the pt_regs pointer movl $(__USER_DS), %edx movl %edx, %ds movl %edx, %es + +/* clobbers edx, ebx and ebp */ + __SWITCH_KERNELSPACE + + leal 4(%esp), %edx # prepare pt_regs + pushl %edx # push pt_regs + call *%edi addl $8, %esp jmp ret_from_exception @@ -529,7 +692,7 @@ nmi_stack_correct: pushl %edx call do_nmi addl $8, %esp - RESTORE_ALL + jmp restore_all nmi_stack_fixup: FIX_STACK(12,nmi_stack_correct, 1) @@ -606,6 +769,8 @@ ENTRY(spurious_interrupt_bug) pushl $do_spurious_interrupt_bug jmp error_code +.previous + .data ENTRY(sys_call_table) .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ @@ -865,7 +1030,7 @@ ENTRY(sys_call_table) .long sys_epoll_create .long sys_epoll_ctl /* 255 */ .long sys_epoll_wait - .long sys_remap_file_pages + .long old_remap_file_pages .long sys_set_tid_address .long sys_timer_create .long sys_timer_settime /* 260 */ @@ -882,5 +1047,12 @@ ENTRY(sys_call_table) .long sys_utimes .long sys_fadvise64_64 .long sys_ni_syscall /* sys_vserver */ + .long sys_mq_open + .long sys_mq_unlink /* 275 */ + .long sys_mq_timedsend + .long sys_mq_timedreceive + .long sys_mq_notify + .long sys_mq_getsetattr + .long sys_remap_file_pages /* 280 */ syscall_table_size=(.-sys_call_table) --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/arch/i386/kernel/entry_trampoline.c 2004-03-31 00:56:35.256313168 -0800 @@ -0,0 +1,75 @@ +/* + * linux/arch/i386/kernel/entry_trampoline.c + * + * (C) Copyright 2003 Ingo Molnar + * + * This file contains the needed support code for 4GB userspace + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern char __entry_tramp_start, __entry_tramp_end, __start___entry_text; + +void __init init_entry_mappings(void) +{ +#ifdef CONFIG_X86_HIGH_ENTRY + + void *tramp; + int p; + + /* + * We need a high IDT and GDT for the 4G/4G split: + */ + trap_init_virtual_IDT(); + + __set_fixmap(FIX_ENTRY_TRAMPOLINE_0, __pa((unsigned long)&__entry_tramp_start), PAGE_KERNEL); + __set_fixmap(FIX_ENTRY_TRAMPOLINE_1, __pa((unsigned long)&__entry_tramp_start) + PAGE_SIZE, PAGE_KERNEL); + tramp = (void *)fix_to_virt(FIX_ENTRY_TRAMPOLINE_0); + + printk("mapped 4G/4G trampoline to %p.\n", tramp); + BUG_ON((void *)&__start___entry_text != tramp); + /* + * Virtual kernel stack: + */ + BUG_ON(__kmap_atomic_vaddr(KM_VSTACK_TOP) & (THREAD_SIZE-1)); + BUG_ON(sizeof(struct desc_struct)*NR_CPUS*GDT_ENTRIES > 2*PAGE_SIZE); + BUG_ON((unsigned int)&__entry_tramp_end - (unsigned int)&__entry_tramp_start > 2*PAGE_SIZE); + + /* + * set up the initial thread's virtual stack related + * fields: + */ + for (p = 0; p < ARRAY_SIZE(current->thread.stack_page); p++) + current->thread.stack_page[p] = virt_to_page((char *)current->thread_info + (p*PAGE_SIZE)); + + current->thread_info->virtual_stack = (void *)__kmap_atomic_vaddr(KM_VSTACK_TOP); + + for (p = 0; p < ARRAY_SIZE(current->thread.stack_page); p++) { + __kunmap_atomic_type(KM_VSTACK_TOP-p); + __kmap_atomic(current->thread.stack_page[p], KM_VSTACK_TOP-p); + } +#endif + current->thread_info->real_stack = (void *)current->thread_info; + current->thread_info->user_pgd = NULL; + current->thread.esp0 = (unsigned long)current->thread_info->real_stack + THREAD_SIZE; +} + + + +void __init entry_trampoline_setup(void) +{ + /* + * old IRQ entries set up by the boot code will still hang + * around - they are a sign of hw trouble anyway, now they'll + * produce a double fault message. + */ + trap_init_virtual_GDT(); +} --- linux-2.6.5-rc3/arch/i386/kernel/i386_ksyms.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/kernel/i386_ksyms.c 2004-03-31 00:56:34.387445256 -0800 @@ -93,7 +93,6 @@ EXPORT_SYMBOL_NOVERS(__down_failed_inter EXPORT_SYMBOL_NOVERS(__down_failed_trylock); EXPORT_SYMBOL_NOVERS(__up_wakeup); /* Networking helper routines. */ -EXPORT_SYMBOL(csum_partial_copy_generic); /* Delay loops */ EXPORT_SYMBOL(__ndelay); EXPORT_SYMBOL(__udelay); @@ -107,13 +106,17 @@ EXPORT_SYMBOL_NOVERS(__get_user_4); EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strstr); +#if !defined(CONFIG_X86_UACCESS_INDIRECT) EXPORT_SYMBOL(strncpy_from_user); -EXPORT_SYMBOL(__strncpy_from_user); +EXPORT_SYMBOL(__direct_strncpy_from_user); EXPORT_SYMBOL(clear_user); EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(__copy_from_user_ll); EXPORT_SYMBOL(__copy_to_user_ll); EXPORT_SYMBOL(strnlen_user); +#else /* CONFIG_X86_UACCESS_INDIRECT */ +EXPORT_SYMBOL(direct_csum_partial_copy_generic); +#endif EXPORT_SYMBOL(dma_alloc_coherent); EXPORT_SYMBOL(dma_free_coherent); --- linux-2.6.5-rc3/arch/i386/kernel/i387.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/i386/kernel/i387.c 2004-03-31 00:56:34.388445104 -0800 @@ -218,6 +218,7 @@ void set_fpu_mxcsr( struct task_struct * static int convert_fxsr_to_user( struct _fpstate __user *buf, struct i387_fxsave_struct *fxsave ) { + struct _fpreg tmp[8]; /* 80 bytes scratch area */ unsigned long env[7]; struct _fpreg __user *to; struct _fpxreg *from; @@ -234,23 +235,25 @@ static int convert_fxsr_to_user( struct if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) ) return 1; - to = &buf->_st[0]; + to = tmp; from = (struct _fpxreg *) &fxsave->st_space[0]; for ( i = 0 ; i < 8 ; i++, to++, from++ ) { unsigned long *t = (unsigned long *)to; unsigned long *f = (unsigned long *)from; - if (__put_user(*f, t) || - __put_user(*(f + 1), t + 1) || - __put_user(from->exponent, &to->exponent)) - return 1; + *t = *f; + *(t + 1) = *(f+1); + to->exponent = from->exponent; } + if (copy_to_user(buf->_st, tmp, sizeof(struct _fpreg [8]))) + return 1; return 0; } static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave, struct _fpstate __user *buf ) { + struct _fpreg tmp[8]; /* 80 bytes scratch area */ unsigned long env[7]; struct _fpxreg *to; struct _fpreg __user *from; @@ -258,6 +261,8 @@ static int convert_fxsr_from_user( struc if ( __copy_from_user( env, buf, 7 * sizeof(long) ) ) return 1; + if (copy_from_user(tmp, buf->_st, sizeof(struct _fpreg [8]))) + return 1; fxsave->cwd = (unsigned short)(env[0] & 0xffff); fxsave->swd = (unsigned short)(env[1] & 0xffff); @@ -269,15 +274,14 @@ static int convert_fxsr_from_user( struc fxsave->fos = env[6]; to = (struct _fpxreg *) &fxsave->st_space[0]; - from = &buf->_st[0]; + from = tmp; for ( i = 0 ; i < 8 ; i++, to++, from++ ) { unsigned long *t = (unsigned long *)to; unsigned long *f = (unsigned long *)from; - if (__get_user(*t, f) || - __get_user(*(t + 1), f + 1) || - __get_user(to->exponent, &from->exponent)) - return 1; + *t = *f; + *(t + 1) = *(f + 1); + to->exponent = from->exponent; } return 0; } --- linux-2.6.5-rc3/arch/i386/kernel/i8259.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/i386/kernel/i8259.c 2004-03-31 00:56:32.878674624 -0800 @@ -444,4 +444,7 @@ void __init init_IRQ(void) */ if (boot_cpu_data.hard_math && !cpu_has_fpu) setup_irq(FPU_IRQ, &fpu_irq); + + current_thread_info()->cpu = 0; + irq_ctx_init(0); } --- linux-2.6.5-rc3/arch/i386/kernel/init_task.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/kernel/init_task.c 2004-03-31 00:56:34.389444952 -0800 @@ -26,7 +26,7 @@ EXPORT_SYMBOL(init_mm); */ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union) }; /* * Initial task structure. @@ -44,5 +44,5 @@ EXPORT_SYMBOL(init_task); * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -struct tss_struct init_tss[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = INIT_TSS }; +struct tss_struct init_tss[NR_CPUS] __attribute__((__section__(".data.tss"))) = { [0 ... NR_CPUS-1] = INIT_TSS }; --- linux-2.6.5-rc3/arch/i386/kernel/io_apic.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/i386/kernel/io_apic.c 2004-03-31 00:54:25.011113448 -0800 @@ -317,8 +317,7 @@ struct irq_cpu_info { #define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) -#define CPU_TO_PACKAGEINDEX(i) \ - ((physical_balance && i > cpu_sibling_map[i]) ? cpu_sibling_map[i] : i) +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) #define MIN_BALANCED_IRQ_INTERVAL (HZ/2) @@ -401,6 +400,7 @@ static void do_irq_balance(void) unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); unsigned long move_this_load = 0; int max_loaded = 0, min_loaded = 0; + int load; unsigned long useful_load_threshold = balanced_irq_interval + 10; int selected_irq; int tmp_loaded, first_attempt = 1; @@ -452,7 +452,7 @@ static void do_irq_balance(void) for (i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) continue; - if (physical_balance && i > cpu_sibling_map[i]) + if (i != CPU_TO_PACKAGEINDEX(i)) continue; if (min_cpu_irq > CPU_IRQ(i)) { min_cpu_irq = CPU_IRQ(i); @@ -471,7 +471,7 @@ tryanothercpu: for (i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) continue; - if (physical_balance && i > cpu_sibling_map[i]) + if (i != CPU_TO_PACKAGEINDEX(i)) continue; if (max_cpu_irq <= CPU_IRQ(i)) continue; @@ -551,9 +551,14 @@ tryanotherirq: * We seek the least loaded sibling by making the comparison * (A+B)/2 vs B */ - if (physical_balance && (CPU_IRQ(min_loaded) >> 1) > - CPU_IRQ(cpu_sibling_map[min_loaded])) - min_loaded = cpu_sibling_map[min_loaded]; + load = CPU_IRQ(min_loaded) >> 1; + for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { + if (load > CPU_IRQ(j)) { + /* This won't change cpu_sibling_map[min_loaded] */ + load = CPU_IRQ(j); + min_loaded = j; + } + } cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]); target_cpu_mask = cpumask_of_cpu(min_loaded); --- linux-2.6.5-rc3/arch/i386/kernel/irq.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/i386/kernel/irq.c 2004-03-31 00:56:34.974356032 -0800 @@ -75,6 +75,14 @@ irq_desc_t irq_desc[NR_IRQS] __cacheline static void register_irq_proc (unsigned int irq); /* + * per-CPU IRQ handling stacks + */ +#ifdef CONFIG_4KSTACKS +union irq_ctx *hardirq_ctx[NR_CPUS]; +union irq_ctx *softirq_ctx[NR_CPUS]; +#endif + +/* * Special irq handlers. */ @@ -126,11 +134,9 @@ struct hw_interrupt_type no_irq_type = { }; atomic_t irq_err_count; -#ifdef CONFIG_X86_IO_APIC -#ifdef APIC_MISMATCH_DEBUG +#if defined(CONFIG_X86_IO_APIC) && defined(APIC_MISMATCH_DEBUG) atomic_t irq_mis_count; #endif -#endif /* * Generic, controller-independent functions: @@ -186,11 +192,9 @@ skip: seq_putc(p, '\n'); #endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#ifdef CONFIG_X86_IO_APIC -#ifdef APIC_MISMATCH_DEBUG +#if defined(CONFIG_X86_IO_APIC) && defined(APIC_MISMATCH_DEBUG) seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); #endif -#endif } return 0; } @@ -213,7 +217,7 @@ inline void synchronize_irq(unsigned int * waste of time and is not what some drivers would * prefer. */ -int handle_IRQ_event(unsigned int irq, +asmlinkage int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, struct irqaction *action) { int status = 1; /* Force the "do bottom halves" bit */ @@ -436,7 +440,7 @@ asmlinkage unsigned int do_IRQ(struct pt __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE - 1)); - if (unlikely(esp < (sizeof(struct thread_info) + 1024))) { + if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { printk("do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); dump_stack(); @@ -484,11 +488,70 @@ asmlinkage unsigned int do_IRQ(struct pt * useful for irq hardware that does not mask cleanly in an * SMP environment. */ +#ifdef CONFIG_4KSTACKS + for (;;) { irqreturn_t action_ret; + u32 *isp; + union irq_ctx * curctx; + union irq_ctx * irqctx; + + curctx = (union irq_ctx *) current_thread_info(); + irqctx = hardirq_ctx[smp_processor_id()]; spin_unlock(&desc->lock); + + /* + * this is where we switch to the IRQ stack. However, if we are already using + * the IRQ stack (because we interrupted a hardirq handler) we can't do that + * and just have to keep using the current stack (which is the irq stack already + * after all) + */ + + if (curctx == irqctx) + action_ret = handle_IRQ_event(irq, ®s, action); + else { + /* build the stack frame on the IRQ stack */ + isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); + irqctx->tinfo.task = curctx->tinfo.task; + irqctx->tinfo.real_stack = curctx->tinfo.real_stack; + irqctx->tinfo.virtual_stack = curctx->tinfo.virtual_stack; + irqctx->tinfo.previous_esp = current_stack_pointer(); + + *--isp = (u32) action; + *--isp = (u32) ®s; + *--isp = (u32) irq; + + asm volatile( + " xchgl %%ebx,%%esp \n" + " call handle_IRQ_event \n" + " xchgl %%ebx,%%esp \n" + : "=a"(action_ret) + : "b"(isp) + : "memory", "cc", "edx", "ecx" + ); + + + } + spin_lock(&desc->lock); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + if (curctx != irqctx) + irqctx->tinfo.task = NULL; + if (likely(!(desc->status & IRQ_PENDING))) + break; + desc->status &= ~IRQ_PENDING; + } + +#else + + for (;;) { + irqreturn_t action_ret; + + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, ®s, action); + spin_lock(&desc->lock); if (!noirqdebug) note_interrupt(irq, desc, action_ret); @@ -496,6 +559,7 @@ asmlinkage unsigned int do_IRQ(struct pt break; desc->status &= ~IRQ_PENDING; } +#endif desc->status &= ~IRQ_INPROGRESS; out: @@ -508,6 +572,8 @@ out: irq_exit(); + kgdb_process_breakpoint(); + return 1; } @@ -1053,3 +1119,81 @@ void init_irq_proc (void) register_irq_proc(i); } + +#ifdef CONFIG_4KSTACKS +static char softirq_stack[NR_CPUS * THREAD_SIZE] __attribute__((__aligned__(THREAD_SIZE))); +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __attribute__((__aligned__(THREAD_SIZE))); + +/* + * allocate per-cpu stacks for hardirq and for softirq processing + */ +void irq_ctx_init(int cpu) +{ + union irq_ctx *irqctx; + + if (hardirq_ctx[cpu]) + return; + + irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + + hardirq_ctx[cpu] = irqctx; + + irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = SOFTIRQ_OFFSET; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + + softirq_ctx[cpu] = irqctx; + + printk("CPU %u irqstacks, hard=%p soft=%p\n", + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); +} + +extern asmlinkage void __do_softirq(void); + +asmlinkage void do_softirq(void) +{ + unsigned long flags; + struct thread_info *curctx; + union irq_ctx *irqctx; + u32 *isp; + + if (in_interrupt()) + return; + + local_irq_save(flags); + + if (local_softirq_pending()) { + curctx = current_thread_info(); + irqctx = softirq_ctx[smp_processor_id()]; + irqctx->tinfo.task = curctx->task; + irqctx->tinfo.real_stack = curctx->real_stack; + irqctx->tinfo.virtual_stack = curctx->virtual_stack; + irqctx->tinfo.previous_esp = current_stack_pointer(); + + /* build the stack frame on the softirq stack */ + isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); + + + asm volatile( + " xchgl %%ebx,%%esp \n" + " call __do_softirq \n" + " movl %%ebx,%%esp \n" + : "=b"(isp) + : "0"(isp) + : "memory", "cc", "edx", "ecx", "eax" + ); + } + + local_irq_restore(flags); +} + +EXPORT_SYMBOL(do_softirq); +#endif --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/arch/i386/kernel/kgdb_stub.c 2004-03-30 23:39:44.581242320 -0800 @@ -0,0 +1,2458 @@ +/* + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +/* + * Copyright (c) 2000 VERITAS Software Corporation. + * + */ +/**************************************************************************** + * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ + * + * Module name: remcom.c $ + * Revision: 1.34 $ + * Date: 91/03/09 12:29:49 $ + * Contributor: Lake Stevens Instrument Division$ + * + * Description: low level support for gdb debugger. $ + * + * Considerations: only works on target hardware $ + * + * Written by: Glenn Engel $ + * Updated by: David Grothe + * Updated by: Robert Walsh + * Updated by: wangdi + * ModuleState: Experimental $ + * + * NOTES: See Below $ + * + * Modified for 386 by Jim Kingdon, Cygnus Support. + * Compatibility with 2.1.xx kernel by David Grothe + * + * Changes to allow auto initilization. All that is needed is that it + * be linked with the kernel and a break point (int 3) be executed. + * The header file defines BREAKPOINT to allow one to do + * this. It should also be possible, once the interrupt system is up, to + * call putDebugChar("+"). Once this is done, the remote debugger should + * get our attention by sending a ^C in a packet. George Anzinger + * + * Integrated into 2.2.5 kernel by Tigran Aivazian + * Added thread support, support for multiple processors, + * support for ia-32(x86) hardware debugging. + * Amit S. Kale ( akale@veritas.com ) + * + * Modified to support debugging over ethernet by Robert Walsh + * and wangdi , based on + * code by San Mehat. + * + * + * To enable debugger support, two things need to happen. One, a + * call to set_debug_traps() is necessary in order to allow any breakpoints + * or error conditions to be properly intercepted and reported to gdb. + * Two, a breakpoint needs to be generated to begin communication. This + * is most easily accomplished by a call to breakpoint(). Breakpoint() + * simulates a breakpoint by executing an int 3. + * + ************* + * + * The following gdb commands are supported: + * + * command function Return value + * + * g return the value of the CPU registers hex data or ENN + * G set the value of the CPU registers OK or ENN + * + * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN + * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN + * + * c Resume at current address SNN ( signal NN) + * cAA..AA Continue at address AA..AA SNN + * + * s Step one instruction SNN + * sAA..AA Step one instruction from AA..AA SNN + * + * k kill + * + * ? What was the last sigval ? SNN (signal NN) + * + * All commands and responses are sent with a packet which includes a + * checksum. A packet consists of + * + * $#. + * + * where + * :: + * :: < two hex digits computed as modulo 256 sum of > + * + * When a packet is received, it is first acknowledged with either '+' or '-'. + * '+' indicates a successful transfer. '-' indicates a failed transfer. + * + * Example: + * + * Host: Reply: + * $m0,10#2a +$00010203040506070809101112131415#42 + * + ****************************************************************************/ +#define KGDB_VERSION "<20030915.1651.33>" +#include +#include +#include /* for strcpy */ +#include +#include +#include +#include +#include /* for linux pt_regs struct */ +#include +#include +#include +#include +#include +#include +#include +#include + +/************************************************************************ + * + * external low-level support routines + */ +typedef void (*Function) (void); /* pointer to a function */ + +/* Thread reference */ +typedef unsigned char threadref[8]; + +extern int tty_putDebugChar(int); /* write a single character */ +extern int tty_getDebugChar(void); /* read and return a single char */ +extern void tty_flushDebugChar(void); /* flush pending characters */ +extern int eth_putDebugChar(int); /* write a single character */ +extern int eth_getDebugChar(void); /* read and return a single char */ +extern void eth_flushDebugChar(void); /* flush pending characters */ + +/************************************************************************/ +/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ +/* at least NUMREGBYTES*2 are needed for register packets */ +/* Longer buffer is needed to list all threads */ +#define BUFMAX 400 + +char *kgdb_version = KGDB_VERSION; + +/* debug > 0 prints ill-formed commands in valid packets & checksum errors */ +int debug_regs = 0; /* set to non-zero to print registers */ + +/* filled in by an external module */ +char *gdb_module_offsets; + +static const char hexchars[] = "0123456789abcdef"; + +/* Number of bytes of registers. */ +#define NUMREGBYTES 64 +/* + * Note that this register image is in a different order than + * the register image that Linux produces at interrupt time. + * + * Linux's register image is defined by struct pt_regs in ptrace.h. + * Just why GDB uses a different order is a historical mystery. + */ +enum regnames { _EAX, /* 0 */ + _ECX, /* 1 */ + _EDX, /* 2 */ + _EBX, /* 3 */ + _ESP, /* 4 */ + _EBP, /* 5 */ + _ESI, /* 6 */ + _EDI, /* 7 */ + _PC /* 8 also known as eip */ , + _PS /* 9 also known as eflags */ , + _CS, /* 10 */ + _SS, /* 11 */ + _DS, /* 12 */ + _ES, /* 13 */ + _FS, /* 14 */ + _GS /* 15 */ +}; + +/*************************** ASSEMBLY CODE MACROS *************************/ +/* + * Put the error code here just in case the user cares. + * Likewise, the vector number here (since GDB only gets the signal + * number through the usual means, and that's not very specific). + * The called_from is the return address so he can tell how we entered kgdb. + * This will allow him to seperate out the various possible entries. + */ +#define REMOTE_DEBUG 0 /* set != to turn on printing (also available in info) */ + +#define PID_MAX PID_MAX_DEFAULT + +#ifdef CONFIG_SMP +void smp_send_nmi_allbutself(void); +#define IF_SMP(x) x +#undef MAX_NO_CPUS +#ifndef CONFIG_NO_KGDB_CPUS +#define CONFIG_NO_KGDB_CPUS 2 +#endif +#if CONFIG_NO_KGDB_CPUS > NR_CPUS +#define MAX_NO_CPUS NR_CPUS +#else +#define MAX_NO_CPUS CONFIG_NO_KGDB_CPUS +#endif +#define hold_init hold_on_sstep: 1, +#define MAX_CPU_MASK (unsigned long)((1LL << MAX_NO_CPUS) - 1LL) +#define NUM_CPUS num_online_cpus() +#else +#define IF_SMP(x) +#define hold_init +#undef MAX_NO_CPUS +#define MAX_NO_CPUS 1 +#define NUM_CPUS 1 +#endif +#define NOCPU (struct task_struct *)0xbad1fbad +/* *INDENT-OFF* */ +struct kgdb_info { + int used_malloc; + void *called_from; + long long entry_tsc; + int errcode; + int vector; + int print_debug_info; +#ifdef CONFIG_SMP + int hold_on_sstep; + struct { + volatile struct task_struct *task; + int pid; + int hold; + struct pt_regs *regs; + } cpus_waiting[MAX_NO_CPUS]; +#endif +} kgdb_info = {hold_init print_debug_info:REMOTE_DEBUG, vector:-1}; + +/* *INDENT-ON* */ + +#define used_m kgdb_info.used_malloc +/* + * This is little area we set aside to contain the stack we + * need to build to allow gdb to call functions. We use one + * per cpu to avoid locking issues. We will do all this work + * with interrupts off so that should take care of the protection + * issues. + */ +#define LOOKASIDE_SIZE 200 /* should be more than enough */ +#define MALLOC_MAX 200 /* Max malloc size */ +struct { + unsigned int esp; + int array[LOOKASIDE_SIZE]; +} fn_call_lookaside[MAX_NO_CPUS]; + +static int trap_cpu; +static unsigned int OLD_esp; + +#define END_OF_LOOKASIDE &fn_call_lookaside[trap_cpu].array[LOOKASIDE_SIZE] +#define IF_BIT 0x200 +#define TF_BIT 0x100 + +#define MALLOC_ROUND 8-1 + +static char malloc_array[MALLOC_MAX]; +IF_SMP(static void to_gdb(const char *mess)); +void * +malloc(int size) +{ + + if (size <= (MALLOC_MAX - used_m)) { + int old_used = used_m; + used_m += ((size + MALLOC_ROUND) & (~MALLOC_ROUND)); + return &malloc_array[old_used]; + } else { + return NULL; + } +} + +/* + * I/O dispatch functions... + * Based upon kgdboe, either call the ethernet + * handler or the serial one.. + */ +void +putDebugChar(int c) +{ + if (!kgdboe) { + tty_putDebugChar(c); + } else { + eth_putDebugChar(c); + } +} + +int +getDebugChar(void) +{ + if (!kgdboe) { + return tty_getDebugChar(); + } else { + return eth_getDebugChar(); + } +} + +void +flushDebugChar(void) +{ + if (!kgdboe) { + tty_flushDebugChar(); + } else { + eth_flushDebugChar(); + } +} + +/* + * Gdb calls functions by pushing agruments, including a return address + * on the stack and the adjusting EIP to point to the function. The + * whole assumption in GDB is that we are on a different stack than the + * one the "user" i.e. code that hit the break point, is on. This, of + * course is not true in the kernel. Thus various dodges are needed to + * do the call without directly messing with EIP (which we can not change + * as it is just a location and not a register. To adjust it would then + * require that we move every thing below EIP up or down as needed. This + * will not work as we may well have stack relative pointer on the stack + * (such as the pointer to regs, for example). + + * So here is what we do: + * We detect gdb attempting to store into the stack area and instead, store + * into the fn_call_lookaside.array at the same relative location as if it + * were the area ESP pointed at. We also trap ESP modifications + * and uses these to adjust fn_call_lookaside.esp. On entry + * fn_call_lookaside.esp will be set to point at the last entry in + * fn_call_lookaside.array. This allows us to check if it has changed, and + * if so, on exit, we add the registers we will use to do the move and a + * trap/ interrupt return exit sequence. We then adjust the eflags in the + * regs array (remember we now have a copy in the fn_call_lookaside.array) to + * kill the interrupt bit, AND we change EIP to point at our set up stub. + * As part of the register set up we preset the registers to point at the + * begining and end of the fn_call_lookaside.array, so all the stub needs to + * do is move words from the array to the stack until ESP= the desired value + * then do the rti. This will then transfer to the desired function with + * all the correct registers. Nifty huh? + */ +extern asmlinkage void fn_call_stub(void); +extern asmlinkage void fn_rtn_stub(void); +/* *INDENT-OFF* */ +__asm__("fn_rtn_stub:\n\t" + "movl %eax,%esp\n\t" + "fn_call_stub:\n\t" + "1:\n\t" + "addl $-4,%ebx\n\t" + "movl (%ebx), %eax\n\t" + "pushl %eax\n\t" + "cmpl %esp,%ecx\n\t" + "jne 1b\n\t" + "popl %eax\n\t" + "popl %ebx\n\t" + "popl %ecx\n\t" + "iret \n\t"); +/* *INDENT-ON* */ +#define gdb_i386vector kgdb_info.vector +#define gdb_i386errcode kgdb_info.errcode +#define waiting_cpus kgdb_info.cpus_waiting +#define remote_debug kgdb_info.print_debug_info +#define hold_cpu(cpu) kgdb_info.cpus_waiting[cpu].hold +/* gdb locks */ + +#ifdef CONFIG_SMP +static int in_kgdb_called; +static spinlock_t waitlocks[MAX_NO_CPUS] = + {[0 ... MAX_NO_CPUS - 1] = SPIN_LOCK_UNLOCKED }; +/* + * The following array has the thread pointer of each of the "other" + * cpus. We make it global so it can be seen by gdb. + */ +volatile int in_kgdb_entry_log[MAX_NO_CPUS]; +volatile struct pt_regs *in_kgdb_here_log[MAX_NO_CPUS]; +/* +static spinlock_t continuelocks[MAX_NO_CPUS]; +*/ +spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +/* waiters on our spinlock plus us */ +static atomic_t spinlock_waiters = ATOMIC_INIT(1); +static int spinlock_count = 0; +static int spinlock_cpu = 0; +/* + * Note we use nested spin locks to account for the case where a break + * point is encountered when calling a function by user direction from + * kgdb. Also there is the memory exception recursion to account for. + * Well, yes, but this lets other cpus thru too. Lets add a + * cpu id to the lock. + */ +#define KGDB_SPIN_LOCK(x) if( spinlock_count == 0 || \ + spinlock_cpu != smp_processor_id()){\ + atomic_inc(&spinlock_waiters); \ + while (! spin_trylock(x)) {\ + in_kgdb(®s);\ + }\ + atomic_dec(&spinlock_waiters); \ + spinlock_count = 1; \ + spinlock_cpu = smp_processor_id(); \ + }else{ \ + spinlock_count++; \ + } +#define KGDB_SPIN_UNLOCK(x) if( --spinlock_count == 0) spin_unlock(x) +#else +unsigned kgdb_spinlock = 0; +#define KGDB_SPIN_LOCK(x) --*x +#define KGDB_SPIN_UNLOCK(x) ++*x +#endif + +int +hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return (ch - 'a' + 10); + if ((ch >= '0') && (ch <= '9')) + return (ch - '0'); + if ((ch >= 'A') && (ch <= 'F')) + return (ch - 'A' + 10); + return (-1); +} + +/* scan for the sequence $# */ +void +getpacket(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int i; + int count; + char ch; + + do { + /* wait around for the start character, ignore all other characters */ + while ((ch = (getDebugChar() & 0x7f)) != '$') ; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* now, read until a # or end of buffer is found */ + while (count < BUFMAX) { + ch = getDebugChar() & 0x7f; + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(getDebugChar() & 0x7f) << 4; + xmitcsum += hex(getDebugChar() & 0x7f); + if ((remote_debug) && (checksum != xmitcsum)) { + printk + ("bad checksum. My count = 0x%x, sent=0x%x. buf=%s\n", + checksum, xmitcsum, buffer); + } + + if (checksum != xmitcsum) + putDebugChar('-'); /* failed checksum */ + else { + putDebugChar('+'); /* successful transfer */ + /* if a sequence char is present, reply the sequence ID */ + if (buffer[2] == ':') { + putDebugChar(buffer[0]); + putDebugChar(buffer[1]); + /* remove sequence chars from buffer */ + count = strlen(buffer); + for (i = 3; i <= count; i++) + buffer[i - 3] = buffer[i]; + } + } + } + } while (checksum != xmitcsum); + + if (remote_debug) + printk("R:%s\n", buffer); + flushDebugChar(); +} + +/* send the packet in buffer. */ + +void +putpacket(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* $#. */ + + if (!kgdboe) { + do { + if (remote_debug) + printk("T:%s\n", buffer); + putDebugChar('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + putDebugChar(ch); + checksum += ch; + count += 1; + } + + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + flushDebugChar(); + + } while ((getDebugChar() & 0x7f) != '+'); + } else { + /* + * For udp, we can not transfer too much bytes once. + * We only transfer MAX_SEND_COUNT size bytes each time + */ + +#define MAX_SEND_COUNT 30 + + int send_count = 0, i = 0; + char send_buf[MAX_SEND_COUNT]; + + do { + if (remote_debug) + printk("T:%s\n", buffer); + putDebugChar('$'); + checksum = 0; + count = 0; + send_count = 0; + while ((ch = buffer[count])) { + if (send_count >= MAX_SEND_COUNT) { + for(i = 0; i < MAX_SEND_COUNT; i++) { + putDebugChar(send_buf[i]); + } + flushDebugChar(); + send_count = 0; + } else { + send_buf[send_count] = ch; + checksum += ch; + count ++; + send_count++; + } + } + for(i = 0; i < send_count; i++) + putDebugChar(send_buf[i]); + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + flushDebugChar(); + } while ((getDebugChar() & 0x7f) != '+'); + } +} + +static char remcomInBuffer[BUFMAX]; +static char remcomOutBuffer[BUFMAX]; +static short error; + +void +debug_error(char *format, char *parm) +{ + if (remote_debug) + printk(format, parm); +} + +static void +print_regs(struct pt_regs *regs) +{ + printk("EAX=%08lx ", regs->eax); + printk("EBX=%08lx ", regs->ebx); + printk("ECX=%08lx ", regs->ecx); + printk("EDX=%08lx ", regs->edx); + printk("\n"); + printk("ESI=%08lx ", regs->esi); + printk("EDI=%08lx ", regs->edi); + printk("EBP=%08lx ", regs->ebp); + printk("ESP=%08lx ", (long) ®s->esp); + printk("\n"); + printk(" DS=%08x ", regs->xds); + printk(" ES=%08x ", regs->xes); + printk(" SS=%08x ", __KERNEL_DS); + printk(" FL=%08lx ", regs->eflags); + printk("\n"); + printk(" CS=%08x ", regs->xcs); + printk(" IP=%08lx ", regs->eip); +#if 0 + printk(" FS=%08x ", regs->fs); + printk(" GS=%08x ", regs->gs); +#endif + printk("\n"); + +} /* print_regs */ + +#define NEW_esp fn_call_lookaside[trap_cpu].esp + +static void +regs_to_gdb_regs(int *gdb_regs, struct pt_regs *regs) +{ + gdb_regs[_EAX] = regs->eax; + gdb_regs[_EBX] = regs->ebx; + gdb_regs[_ECX] = regs->ecx; + gdb_regs[_EDX] = regs->edx; + gdb_regs[_ESI] = regs->esi; + gdb_regs[_EDI] = regs->edi; + gdb_regs[_EBP] = regs->ebp; + gdb_regs[_DS] = regs->xds; + gdb_regs[_ES] = regs->xes; + gdb_regs[_PS] = regs->eflags; + gdb_regs[_CS] = regs->xcs; + gdb_regs[_PC] = regs->eip; + /* Note, as we are a debugging the kernel, we will always + * trap in kernel code, this means no priviledge change, + * and so the pt_regs structure is not completely valid. In a non + * privilege change trap, only EFLAGS, CS and EIP are put on the stack, + * SS and ESP are not stacked, this means that the last 2 elements of + * pt_regs is not valid (they would normally refer to the user stack) + * also, using regs+1 is no good because you end up will a value that is + * 2 longs (8) too high. This used to cause stepping over functions + * to fail, so my fix is to use the address of regs->esp, which + * should point at the end of the stack frame. Note I have ignored + * completely exceptions that cause an error code to be stacked, such + * as double fault. Stuart Hughes, Zentropix. + * original code: gdb_regs[_ESP] = (int) (regs + 1) ; + + * this is now done on entry and moved to OLD_esp (as well as NEW_esp). + */ + gdb_regs[_ESP] = NEW_esp; + gdb_regs[_SS] = __KERNEL_DS; + gdb_regs[_FS] = 0xFFFF; + gdb_regs[_GS] = 0xFFFF; +} /* regs_to_gdb_regs */ + +static void +gdb_regs_to_regs(int *gdb_regs, struct pt_regs *regs) +{ + regs->eax = gdb_regs[_EAX]; + regs->ebx = gdb_regs[_EBX]; + regs->ecx = gdb_regs[_ECX]; + regs->edx = gdb_regs[_EDX]; + regs->esi = gdb_regs[_ESI]; + regs->edi = gdb_regs[_EDI]; + regs->ebp = gdb_regs[_EBP]; + regs->xds = gdb_regs[_DS]; + regs->xes = gdb_regs[_ES]; + regs->eflags = gdb_regs[_PS]; + regs->xcs = gdb_regs[_CS]; + regs->eip = gdb_regs[_PC]; + NEW_esp = gdb_regs[_ESP]; /* keep the value */ +#if 0 /* can't change these */ + regs->esp = gdb_regs[_ESP]; + regs->xss = gdb_regs[_SS]; + regs->fs = gdb_regs[_FS]; + regs->gs = gdb_regs[_GS]; +#endif + +} /* gdb_regs_to_regs */ +extern void scheduling_functions_start_here(void); +extern void scheduling_functions_end_here(void); +#define first_sched ((unsigned long) scheduling_functions_start_here) +#define last_sched ((unsigned long) scheduling_functions_end_here) + +int thread_list = 0; + +void +get_gdb_regs(struct task_struct *p, struct pt_regs *regs, int *gdb_regs) +{ + unsigned long stack_page; + int count = 0; + IF_SMP(int i); + if (!p || p == current) { + regs_to_gdb_regs(gdb_regs, regs); + return; + } +#ifdef CONFIG_SMP + for (i = 0; i < MAX_NO_CPUS; i++) { + if (p == kgdb_info.cpus_waiting[i].task) { + regs_to_gdb_regs(gdb_regs, + kgdb_info.cpus_waiting[i].regs); + gdb_regs[_ESP] = + (int) &kgdb_info.cpus_waiting[i].regs->esp; + + return; + } + } +#endif + memset(gdb_regs, 0, NUMREGBYTES); + gdb_regs[_ESP] = p->thread.esp; + gdb_regs[_PC] = p->thread.eip; + gdb_regs[_EBP] = *(int *) gdb_regs[_ESP]; + gdb_regs[_EDI] = *(int *) (gdb_regs[_ESP] + 4); + gdb_regs[_ESI] = *(int *) (gdb_regs[_ESP] + 8); + +/* + * This code is to give a more informative notion of where a process + * is waiting. It is used only when the user asks for a thread info + * list. If he then switches to the thread, s/he will find the task + * is in schedule, but a back trace should show the same info we come + * up with. This code was shamelessly purloined from process.c. It was + * then enhanced to provide more registers than simply the program + * counter. + */ + + if (!thread_list) { + return; + } + + if (p->state == TASK_RUNNING) + return; + stack_page = (unsigned long) p->thread_info; + if (gdb_regs[_ESP] < stack_page || gdb_regs[_ESP] > + THREAD_SIZE - sizeof(long) + stack_page) + return; + /* include/asm-i386/system.h:switch_to() pushes ebp last. */ + do { + if (gdb_regs[_EBP] < stack_page || + gdb_regs[_EBP] > THREAD_SIZE - 2*sizeof(long) + stack_page) + return; + gdb_regs[_PC] = *(unsigned long *) (gdb_regs[_EBP] + 4); + gdb_regs[_ESP] = gdb_regs[_EBP] + 8; + gdb_regs[_EBP] = *(unsigned long *) gdb_regs[_EBP]; + if (gdb_regs[_PC] < first_sched || gdb_regs[_PC] >= last_sched) + return; + } while (count++ < 16); + return; +} + +/* Indicate to caller of mem2hex or hex2mem that there has been an + error. */ +static volatile int mem_err = 0; +static volatile int mem_err_expected = 0; +static volatile int mem_err_cnt = 0; +static int garbage_loc = -1; + +int +get_char(char *addr) +{ + return *addr; +} + +void +set_char(char *addr, int val, int may_fault) +{ + /* + * This code traps references to the area mapped to the kernel + * stack as given by the regs and, instead, stores to the + * fn_call_lookaside[cpu].array + */ + if (may_fault && + (unsigned int) addr < OLD_esp && + ((unsigned int) addr > (OLD_esp - (unsigned int) LOOKASIDE_SIZE))) { + addr = (char *) END_OF_LOOKASIDE - ((char *) OLD_esp - addr); + } + *addr = val; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* return a pointer to the last char put in buf (null) */ +/* If MAY_FAULT is non-zero, then we should set mem_err in response to + a fault; if zero treat a fault like any other fault in the stub. */ +char * +mem2hex(char *mem, char *buf, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + /* printk("%lx = ", mem) ; */ + + ch = get_char(mem++); + + /* printk("%02x\n", ch & 0xFF) ; */ + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault fetching from addr %lx\n", + (long) (mem - 1)); + *buf = 0; /* truncate buffer */ + return (buf); + } + *buf++ = hexchars[ch >> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (may_fault) + mem_err_expected = 0; + return (buf); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return a pointer to the character AFTER the last byte written */ +/* NOTE: We use the may fault flag to also indicate if the write is to + * the registers (0) or "other" memory (!=0) + */ +char * +hex2mem(char *buf, char *mem, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + ch = hex(*buf++) << 4; + ch = ch + hex(*buf++); + set_char(mem++, ch, may_fault); + + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault storing to addr %lx\n", + (long) (mem - 1)); + return (mem); + } + } + if (may_fault) + mem_err_expected = 0; + return (mem); +} + +/**********************************************/ +/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ +/* RETURN NUMBER OF CHARS PROCESSED */ +/**********************************************/ +int +hexToInt(char **ptr, int *intValue) +{ + int numChars = 0; + int hexValue; + + *intValue = 0; + + while (**ptr) { + hexValue = hex(**ptr); + if (hexValue >= 0) { + *intValue = (*intValue << 4) | hexValue; + numChars++; + } else + break; + + (*ptr)++; + } + + return (numChars); +} + +#define stubhex(h) hex(h) +#ifdef old_thread_list + +static int +stub_unpack_int(char *buff, int fieldlength) +{ + int nibble; + int retval = 0; + + while (fieldlength) { + nibble = stubhex(*buff++); + retval |= nibble; + fieldlength--; + if (fieldlength) + retval = retval << 4; + } + return retval; +} +#endif +static char * +pack_hex_byte(char *pkt, int byte) +{ + *pkt++ = hexchars[(byte >> 4) & 0xf]; + *pkt++ = hexchars[(byte & 0xf)]; + return pkt; +} + +#define BUF_THREAD_ID_SIZE 16 + +static char * +pack_threadid(char *pkt, threadref * id) +{ + char *limit; + unsigned char *altid; + + altid = (unsigned char *) id; + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *altid++); + return pkt; +} + +#ifdef old_thread_list +static char * +unpack_byte(char *buf, int *value) +{ + *value = stub_unpack_int(buf, 2); + return buf + 2; +} + +static char * +unpack_threadid(char *inbuf, threadref * id) +{ + char *altref; + char *limit = inbuf + BUF_THREAD_ID_SIZE; + int x, y; + + altref = (char *) id; + + while (inbuf < limit) { + x = stubhex(*inbuf++); + y = stubhex(*inbuf++); + *altref++ = (x << 4) | y; + } + return inbuf; +} +#endif +void +int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; + } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} +int +int_to_hex_v(unsigned char * id, int value) +{ + unsigned char *start = id; + int shift; + int ch; + + for (shift = 28; shift >= 0; shift -= 4) { + if ((ch = (value >> shift) & 0xf) || (id != start)) { + *id = hexchars[ch]; + id++; + } + } + if (id == start) + *id++ = '0'; + return id - start; +} +#ifdef old_thread_list + +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + unsigned char *scan; + + scan = (char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} +#endif +static int +cmp_str(char *s1, char *s2, int count) +{ + while (count--) { + if (*s1++ != *s2++) + return 0; + } + return 1; +} + +#if 1 /* this is a hold over from 2.4 where O(1) was "sometimes" */ +extern struct task_struct *kgdb_get_idle(int cpu); +#define idle_task(cpu) kgdb_get_idle(cpu) +#else +#define idle_task(cpu) init_tasks[cpu] +#endif + +extern int kgdb_pid_init_done; + +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + if (pid >= PID_MAX && pid <= (PID_MAX + MAX_NO_CPUS)) { + + return idle_task(pid - PID_MAX); + } else { + /* + * find_task_by_pid is relatively safe all the time + * Other pid functions require lock downs which imply + * that we may be interrupting them (as we get here + * in the middle of most any lock down). + * Still we don't want to call until the table exists! + */ + if (kgdb_pid_init_done){ + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } + } + } + return NULL; +} +/* *INDENT-OFF* */ +struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned addr; +} breakinfo[4] = { {enabled:0}, + {enabled:0}, + {enabled:0}, + {enabled:0}}; +/* *INDENT-ON* */ +unsigned hw_breakpoint_status; +void +correct_hw_break(void) +{ + int breakno; + int correctit; + int breakbit; + unsigned dr7; + + asm volatile ("movl %%db7, %0\n":"=r" (dr7) + :); + /* *INDENT-OFF* */ + do { + unsigned addr0, addr1, addr2, addr3; + asm volatile ("movl %%db0, %0\n" + "movl %%db1, %1\n" + "movl %%db2, %2\n" + "movl %%db3, %3\n" + :"=r" (addr0), "=r"(addr1), + "=r"(addr2), "=r"(addr3) + :); + } while (0); + /* *INDENT-ON* */ + correctit = 0; + for (breakno = 0; breakno < 3; breakno++) { + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + dr7 |= (((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << 16) << + (breakno << 2); + switch (breakno) { + case 0: + asm volatile ("movl %0, %%dr0\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 1: + asm volatile ("movl %0, %%dr1\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 2: + asm volatile ("movl %0, %%dr2\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 3: + asm volatile ("movl %0, %%dr3\n"::"r" + (breakinfo[breakno].addr)); + break; + } + } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + } + } + if (correctit) { + asm volatile ("movl %0, %%db7\n"::"r" (dr7)); + } +} + +int +remove_hw_break(unsigned breakno) +{ + if (!breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 0; + return 0; +} + +int +set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr) +{ + if (breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 1; + breakinfo[breakno].type = type; + breakinfo[breakno].len = len; + breakinfo[breakno].addr = addr; + return 0; +} + +#ifdef CONFIG_SMP +static int in_kgdb_console = 0; + +int +in_kgdb(struct pt_regs *regs) +{ + unsigned flags; + int cpu = smp_processor_id(); + in_kgdb_called = 1; + if (!spin_is_locked(&kgdb_spinlock)) { + if (in_kgdb_here_log[cpu] || /* we are holding this cpu */ + in_kgdb_console) { /* or we are doing slow i/o */ + return 1; + } + return 0; + } + + /* As I see it the only reason not to let all cpus spin on + * the same spin_lock is to allow selected ones to proceed. + * This would be a good thing, so we leave it this way. + * Maybe someday.... Done ! + + * in_kgdb() is called from an NMI so we don't pretend + * to have any resources, like printk() for example. + */ + + kgdb_local_irq_save(flags); /* only local here, to avoid hanging */ + /* + * log arival of this cpu + * The NMI keeps on ticking. Protect against recurring more + * than once, and ignor the cpu that has the kgdb lock + */ + in_kgdb_entry_log[cpu]++; + in_kgdb_here_log[cpu] = regs; + if (cpu == spinlock_cpu || waiting_cpus[cpu].task) + goto exit_in_kgdb; + + /* + * For protection of the initilization of the spin locks by kgdb + * it locks the kgdb spinlock before it gets the wait locks set + * up. We wait here for the wait lock to be taken. If the + * kgdb lock goes away first?? Well, it could be a slow exit + * sequence where the wait lock is removed prior to the kgdb lock + * so if kgdb gets unlocked, we just exit. + */ + + while (spin_is_locked(&kgdb_spinlock) && + !spin_is_locked(waitlocks + cpu)) ; + if (!spin_is_locked(&kgdb_spinlock)) + goto exit_in_kgdb; + + waiting_cpus[cpu].task = current; + waiting_cpus[cpu].pid = (current->pid) ? : (PID_MAX + cpu); + waiting_cpus[cpu].regs = regs; + + spin_unlock_wait(waitlocks + cpu); + + /* + * log departure of this cpu + */ + waiting_cpus[cpu].task = 0; + waiting_cpus[cpu].pid = 0; + waiting_cpus[cpu].regs = 0; + correct_hw_break(); + exit_in_kgdb: + in_kgdb_here_log[cpu] = 0; + kgdb_local_irq_restore(flags); + return 1; + /* + spin_unlock(continuelocks + smp_processor_id()); + */ +} + +void +smp__in_kgdb(struct pt_regs regs) +{ + ack_APIC_irq(); + in_kgdb(®s); +} +#else +int +in_kgdb(struct pt_regs *regs) +{ + return (kgdb_spinlock); +} +#endif + +void +printexceptioninfo(int exceptionNo, int errorcode, char *buffer) +{ + unsigned dr6; + int i; + switch (exceptionNo) { + case 1: /* debug exception */ + break; + case 3: /* breakpoint */ + sprintf(buffer, "Software breakpoint"); + return; + default: + sprintf(buffer, "Details not available"); + return; + } + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (dr6 & 0x4000) { + sprintf(buffer, "Single step"); + return; + } + for (i = 0; i < 4; ++i) { + if (dr6 & (1 << i)) { + sprintf(buffer, "Hardware breakpoint %d", i); + return; + } + } + sprintf(buffer, "Unknown trap"); + return; +} + +/* + * This function does all command procesing for interfacing to gdb. + * + * NOTE: The INT nn instruction leaves the state of the interrupt + * enable flag UNCHANGED. That means that when this routine + * is entered via a breakpoint (INT 3) instruction from code + * that has interrupts enabled, then interrupts will STILL BE + * enabled when this routine is entered. The first thing that + * we do here is disable interrupts so as to prevent recursive + * entries and bothersome serial interrupts while we are + * trying to run the serial port in polled mode. + * + * For kernel version 2.1.xx the kgdb_cli() actually gets a spin lock so + * it is always necessary to do a restore_flags before returning + * so as to let go of that lock. + */ +int +kgdb_handle_exception(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs) +{ + struct task_struct *usethread = NULL; + struct task_struct *thread_list_start = 0, *thread = NULL; + int addr, length; + int breakno, breaktype; + char *ptr; + int newPC; + threadref thref; + int threadid; + int thread_min = PID_MAX + MAX_NO_CPUS; +#ifdef old_thread_list + int maxthreads; +#endif + int nothreads; + unsigned long flags; + int gdb_regs[NUMREGBYTES / 4]; + int dr6; + IF_SMP(int entry_state = 0); /* 0, ok, 1, no nmi, 2 sync failed */ +#define NO_NMI 1 +#define NO_SYNC 2 +#define regs (*linux_regs) +#define NUMREGS NUMREGBYTES/4 + /* + * If the entry is not from the kernel then return to the Linux + * trap handler and let it process the interrupt normally. + */ + if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->xcs)) { + printk("ignoring non-kernel exception\n"); + print_regs(®s); + return (0); + } + /* + * If we're using eth mode, set the 'mode' in the netdevice. + */ + + if (kgdboe) + netpoll_set_trap(1); + + kgdb_local_irq_save(flags); + + /* Get kgdb spinlock */ + + KGDB_SPIN_LOCK(&kgdb_spinlock); + rdtscll(kgdb_info.entry_tsc); + /* + * We depend on this spinlock and the NMI watch dog to control the + * other cpus. They will arrive at "in_kgdb()" as a result of the + * NMI and will wait there for the following spin locks to be + * released. + */ +#ifdef CONFIG_SMP + +#if 0 + if (cpu_callout_map & ~MAX_CPU_MASK) { + printk("kgdb : too many cpus, possibly not mapped" + " in contiguous space, change MAX_NO_CPUS" + " in kgdb_stub and make new kernel.\n" + " cpu_callout_map is %lx\n", cpu_callout_map); + goto exit_just_unlock; + } +#endif + if (spinlock_count == 1) { + int time = 0, end_time, dum = 0; + int i; + int cpu_logged_in[MAX_NO_CPUS] = {[0 ... MAX_NO_CPUS - 1] = (0) + }; + if (remote_debug) { + printk("kgdb : cpu %d entry, syncing others\n", + smp_processor_id()); + } + for (i = 0; i < MAX_NO_CPUS; i++) { + /* + * Use trylock as we may already hold the lock if + * we are holding the cpu. Net result is all + * locked. + */ + spin_trylock(&waitlocks[i]); + } + for (i = 0; i < MAX_NO_CPUS; i++) + cpu_logged_in[i] = 0; + /* + * Wait for their arrival. We know the watch dog is active if + * in_kgdb() has ever been called, as it is always called on a + * watchdog tick. + */ + rdtsc(dum, time); + end_time = time + 2; /* Note: we use the High order bits! */ + i = 1; + if (num_online_cpus() > 1) { + int me_in_kgdb = in_kgdb_entry_log[smp_processor_id()]; + smp_send_nmi_allbutself(); + + while (i < num_online_cpus() && time != end_time) { + int j; + for (j = 0; j < MAX_NO_CPUS; j++) { + if (waiting_cpus[j].task && + waiting_cpus[j].task != NOCPU && + !cpu_logged_in[j]) { + i++; + cpu_logged_in[j] = 1; + if (remote_debug) { + printk + ("kgdb : cpu %d arrived at kgdb\n", + j); + } + break; + } else if (!waiting_cpus[j].task && + !cpu_online(j)) { + waiting_cpus[j].task = NOCPU; + cpu_logged_in[j] = 1; + waiting_cpus[j].hold = 1; + break; + } + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + + int wait = 100000; + while (wait--) ; + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + printk + ("kgdb : cpu %d stall" + " in in_kgdb\n", + j); + i++; + cpu_logged_in[j] = 1; + waiting_cpus[j].task = + (struct task_struct + *) 1; + } + } + } + + if (in_kgdb_entry_log[smp_processor_id()] > + (me_in_kgdb + 10)) { + break; + } + + rdtsc(dum, time); + } + if (i < num_online_cpus()) { + printk + ("kgdb : time out, proceeding without sync\n"); +#if 0 + printk("kgdb : Waiting_cpus: 0 = %d, 1 = %d\n", + waiting_cpus[0].task != 0, + waiting_cpus[1].task != 0); + printk("kgdb : Cpu_logged in: 0 = %d, 1 = %d\n", + cpu_logged_in[0], cpu_logged_in[1]); + printk + ("kgdb : in_kgdb_here_log in: 0 = %d, 1 = %d\n", + in_kgdb_here_log[0] != 0, + in_kgdb_here_log[1] != 0); +#endif + entry_state = NO_SYNC; + } else { +#if 0 + int ent = + in_kgdb_entry_log[smp_processor_id()] - + me_in_kgdb; + printk("kgdb : sync after %d entries\n", ent); +#endif + } + } else { + if (remote_debug) { + printk + ("kgdb : %d cpus, but watchdog not active\n" + "proceeding without locking down other cpus\n", + num_online_cpus()); + entry_state = NO_NMI; + } + } + } +#endif + + if (remote_debug) { + unsigned long *lp = (unsigned long *) &linux_regs; + + printk("handle_exception(exceptionVector=%d, " + "signo=%d, err_code=%d, linux_regs=%p)\n", + exceptionVector, signo, err_code, linux_regs); + if (debug_regs) { + print_regs(®s); + printk("Stk: %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[0], lp[1], lp[2], lp[3], + lp[4], lp[5], lp[6], lp[7]); + printk(" %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[8], lp[9], lp[10], lp[11], + lp[12], lp[13], lp[14], lp[15]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[16], lp[17], lp[18], lp[19], + lp[20], lp[21], lp[22], lp[23]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[24], lp[25], lp[26], lp[27], + lp[28], lp[29], lp[30], lp[31]); + } + } + + /* Disable hardware debugging while we are in kgdb */ + /* Get the debug register status register */ +/* *INDENT-OFF* */ + __asm__("movl %0,%%db7" + : /* no output */ + :"r"(0)); + + asm volatile ("movl %%db6, %0\n" + :"=r" (hw_breakpoint_status) + :); + +/* *INDENT-ON* */ + switch (exceptionVector) { + case 0: /* divide error */ + case 1: /* debug exception */ + case 2: /* NMI */ + case 3: /* breakpoint */ + case 4: /* overflow */ + case 5: /* bounds check */ + case 6: /* invalid opcode */ + case 7: /* device not available */ + case 8: /* double fault (errcode) */ + case 10: /* invalid TSS (errcode) */ + case 12: /* stack fault (errcode) */ + case 16: /* floating point error */ + case 17: /* alignment check (errcode) */ + default: /* any undocumented */ + break; + case 11: /* segment not present (errcode) */ + case 13: /* general protection (errcode) */ + case 14: /* page fault (special errcode) */ + case 19: /* cache flush denied */ + if (mem_err_expected) { + /* + * This fault occured because of the + * get_char or set_char routines. These + * two routines use either eax of edx to + * indirectly reference the location in + * memory that they are working with. + * For a page fault, when we return the + * instruction will be retried, so we + * have to make sure that these + * registers point to valid memory. + */ + mem_err = 1; /* set mem error flag */ + mem_err_expected = 0; + mem_err_cnt++; /* helps in debugging */ + /* make valid address */ + regs.eax = (long) &garbage_loc; + /* make valid address */ + regs.edx = (long) &garbage_loc; + if (remote_debug) + printk("Return after memory error: " + "mem_err_cnt=%d\n", mem_err_cnt); + if (debug_regs) + print_regs(®s); + goto exit_kgdb; + } + break; + } + if (remote_debug) + printk("kgdb : entered kgdb on cpu %d\n", smp_processor_id()); + + gdb_i386vector = exceptionVector; + gdb_i386errcode = err_code; + kgdb_info.called_from = __builtin_return_address(0); +#ifdef CONFIG_SMP + /* + * OK, we can now communicate, lets tell gdb about the sync. + * but only if we had a problem. + */ + switch (entry_state) { + case NO_NMI: + to_gdb("NMI not active, other cpus not stopped\n"); + break; + case NO_SYNC: + to_gdb("Some cpus not stopped, see 'kgdb_info' for details\n"); + default:; + } + +#endif +/* + * Set up the gdb function call area. + */ + trap_cpu = smp_processor_id(); + OLD_esp = NEW_esp = (int) (&linux_regs->esp); + + IF_SMP(once_again:) + /* reply to host that an exception has occurred */ + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + + putpacket(remcomOutBuffer); + + while (1 == 1) { + error = 0; + remcomOutBuffer[0] = 0; + getpacket(remcomInBuffer); + switch (remcomInBuffer[0]) { + case '?': + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + break; + case 'd': + remote_debug = !(remote_debug); /* toggle debug flag */ + printk("Remote debug %s\n", + remote_debug ? "on" : "off"); + break; + case 'g': /* return the value of the CPU registers */ + get_gdb_regs(usethread, ®s, gdb_regs); + mem2hex((char *) gdb_regs, + remcomOutBuffer, NUMREGBYTES, 0); + break; + case 'G': /* set the value of the CPU registers - return OK */ + hex2mem(&remcomInBuffer[1], + (char *) gdb_regs, NUMREGBYTES, 0); + if (!usethread || usethread == current) { + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "E00"); + } + break; + + case 'P':{ /* set the value of a single CPU register - + return OK */ + /* + * For some reason, gdb wants to talk about psudo + * registers (greater than 15). These may have + * meaning for ptrace, but for us it is safe to + * ignor them. We do this by dumping them into + * _GS which we also ignor, but do have memory for. + */ + int regno; + + ptr = &remcomInBuffer[1]; + regs_to_gdb_regs(gdb_regs, ®s); + if ((!usethread || usethread == current) && + hexToInt(&ptr, ®no) && + *ptr++ == '=' && (regno >= 0)) { + regno = + (regno >= NUMREGS ? _GS : regno); + hex2mem(ptr, (char *) &gdb_regs[regno], + 4, 0); + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + break; + } + strcpy(remcomOutBuffer, "E01"); + break; + } + + /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + case 'm': + /* TRY TO READ %x,%x. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr) && + (*(ptr++) == ',') && (hexToInt(&ptr, &length))) { + ptr = 0; + /* + * hex doubles the byte count + */ + if (length > (BUFMAX / 2)) + length = BUFMAX / 2; + mem2hex((char *) addr, + remcomOutBuffer, length, 1); + if (mem_err) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } + } + + if (ptr) { + strcpy(remcomOutBuffer, "E01"); + debug_error + ("malformed read memory command: %s\n", + remcomInBuffer); + } + break; + + /* MAA..AA,LLLL: + Write LLLL bytes at address AA.AA return OK */ + case 'M': + /* TRY TO READ '%x,%x:'. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr) && + (*(ptr++) == ',') && + (hexToInt(&ptr, &length)) && (*(ptr++) == ':')) { + hex2mem(ptr, (char *) addr, length, 1); + + if (mem_err) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } else { + strcpy(remcomOutBuffer, "OK"); + } + + ptr = 0; + } + if (ptr) { + strcpy(remcomOutBuffer, "E02"); + debug_error + ("malformed write memory command: %s\n", + remcomInBuffer); + } + break; + case 'S': + remcomInBuffer[0] = 's'; + case 'C': + /* Csig;AA..AA where ;AA..AA is optional + * continue with signal + * Since signals are meaning less to us, delete that + * part and then fall into the 'c' code. + */ + ptr = &remcomInBuffer[1]; + length = 2; + while (*ptr && *ptr != ';') { + length++; + ptr++; + } + if (*ptr) { + do { + ptr++; + *(ptr - length++) = *ptr; + } while (*ptr); + } else { + remcomInBuffer[1] = 0; + } + + /* cAA..AA Continue at address AA..AA(optional) */ + /* sAA..AA Step one instruction from AA..AA(optional) */ + /* D detach, reply OK and then continue */ + case 'c': + case 's': + case 'D': + + /* try to read optional parameter, + pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) { + if (remote_debug) + printk("Changing EIP to 0x%x\n", addr); + + regs.eip = addr; + } + + newPC = regs.eip; + + /* clear the trace bit */ + regs.eflags &= 0xfffffeff; + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') + regs.eflags |= 0x100; + + /* detach is a friendly version of continue. Note that + debugging is still enabled (e.g hit control C) + */ + if (remcomInBuffer[0] == 'D') { + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + } + + if (remote_debug) { + printk("Resuming execution\n"); + print_regs(®s); + } + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (!(dr6 & 0x4000)) { + for (breakno = 0; breakno < 4; ++breakno) { + if (dr6 & (1 << breakno) && + (breakinfo[breakno].type == 0)) { + /* Set restore flag */ + regs.eflags |= 0x10000; + break; + } + } + } + + if (kgdboe) + netpoll_set_trap(0); + + correct_hw_break(); + asm volatile ("movl %0, %%db6\n"::"r" (0)); + goto exit_kgdb; + + /* kill the program */ + case 'k': /* do nothing */ + break; + + /* query */ + case 'q': + nothreads = 0; + switch (remcomInBuffer[1]) { + case 'f': + threadid = 1; + thread_list = 2; + thread_list_start = (usethread ? : current); + case 's': + if (!cmp_str(&remcomInBuffer[2], + "ThreadInfo", 10)) + break; + + remcomOutBuffer[nothreads++] = 'm'; + for (; threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + nothreads += int_to_hex_v( + &remcomOutBuffer[ + nothreads], + threadid); + if (thread_min > threadid) + thread_min = threadid; + remcomOutBuffer[ + nothreads] = ','; + nothreads++; + if (nothreads > BUFMAX - 10) + break; + } + } + if (remcomOutBuffer[nothreads - 1] == 'm') { + remcomOutBuffer[nothreads - 1] = 'l'; + } else { + nothreads--; + } + remcomOutBuffer[nothreads] = 0; + break; + +#ifdef old_thread_list /* Old thread info request */ + case 'L': + /* List threads */ + thread_list = 2; + thread_list_start = (usethread ? : current); + unpack_byte(remcomInBuffer + 3, &maxthreads); + unpack_threadid(remcomInBuffer + 5, &thref); + do { + int buf_thread_limit = + (BUFMAX - 22) / BUF_THREAD_ID_SIZE; + if (maxthreads > buf_thread_limit) { + maxthreads = buf_thread_limit; + } + } while (0); + remcomOutBuffer[0] = 'q'; + remcomOutBuffer[1] = 'M'; + remcomOutBuffer[4] = '0'; + pack_threadid(remcomOutBuffer + 5, &thref); + + threadid = threadref_to_int(&thref); + for (nothreads = 0; + nothreads < maxthreads && + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + int_to_threadref(&thref, + threadid); + pack_threadid(remcomOutBuffer + + 21 + + nothreads * 16, + &thref); + nothreads++; + if (thread_min > threadid) + thread_min = threadid; + } + } + + if (threadid == PID_MAX + MAX_NO_CPUS) { + remcomOutBuffer[4] = '1'; + } + pack_hex_byte(remcomOutBuffer + 2, nothreads); + remcomOutBuffer[21 + nothreads * 16] = '\0'; + break; +#endif + case 'C': + /* Current thread id */ + remcomOutBuffer[0] = 'Q'; + remcomOutBuffer[1] = 'C'; + threadid = current->pid; + if (!threadid) { + /* + * idle thread + */ + for (threadid = PID_MAX; + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + if (current == + idle_task(threadid - + PID_MAX)) + break; + } + } + int_to_threadref(&thref, threadid); + pack_threadid(remcomOutBuffer + 2, &thref); + remcomOutBuffer[18] = '\0'; + break; + + case 'E': + /* Print exception info */ + printexceptioninfo(exceptionVector, + err_code, remcomOutBuffer); + break; + case 'T':{ + char * nptr; + /* Thread extra info */ + if (!cmp_str(&remcomInBuffer[2], + "hreadExtraInfo,", 15)) { + break; + } + ptr = &remcomInBuffer[17]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + nptr = &thread->comm[0]; + length = 0; + ptr = &remcomOutBuffer[0]; + do { + length++; + ptr = pack_hex_byte(ptr, *nptr++); + } while (*nptr && length < 16); + /* + * would like that 16 to be the size of + * task_struct.comm but don't know the + * syntax.. + */ + *ptr = 0; + } + } + break; + + /* task related */ + case 'H': + switch (remcomInBuffer[1]) { + case 'g': + ptr = &remcomInBuffer[2]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (!thread) { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + break; + } + /* + * Just in case I forget what this is all about, + * the "thread info" command to gdb causes it + * to ask for a thread list. It then switches + * to each thread and asks for the registers. + * For this (and only this) usage, we want to + * fudge the registers of tasks not on the run + * list (i.e. waiting) to show the routine that + * called schedule. Also, gdb, is a minimalist + * in that if the current thread is the last + * it will not re-read the info when done. + * This means that in this case we must show + * the real registers. So here is how we do it: + * Each entry we keep track of the min + * thread in the list (the last that gdb will) + * get info for. We also keep track of the + * starting thread. + * "thread_list" is cleared when switching back + * to the min thread if it is was current, or + * if it was not current, thread_list is set + * to 1. When the switch to current comes, + * if thread_list is 1, clear it, else do + * nothing. + */ + usethread = thread; + if ((thread_list == 1) && + (thread == thread_list_start)) { + thread_list = 0; + } + if (thread_list && (threadid == thread_min)) { + if (thread == thread_list_start) { + thread_list = 0; + } else { + thread_list = 1; + } + } + /* follow through */ + case 'c': + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + break; + } + break; + + /* Query thread status */ + case 'T': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (thread) { + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + if (thread_min > threadid) + thread_min = threadid; + } else { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + } + break; + + case 'Y': /* set up a hardware breakpoint */ + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + ptr++; + hexToInt(&ptr, &breaktype); + ptr++; + hexToInt(&ptr, &length); + ptr++; + hexToInt(&ptr, &addr); + if (set_hw_break(breakno & 0x3, + breaktype & 0x3, + length & 0x3, addr) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + /* Remove hardware breakpoint */ + case 'y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + if (remove_hw_break(breakno & 0x3) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + case 'r': /* reboot */ + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + /*to_gdb("Rebooting\n"); */ + /* triplefault no return from here */ + { + static long no_idt[2]; + __asm__ __volatile__("lidt %0"::"m"(no_idt[0])); + BREAKPOINT; + } + + } /* switch */ + + /* reply to the request */ + putpacket(remcomOutBuffer); + } /* while(1==1) */ + /* + * reached by goto only. + */ + exit_kgdb: + /* + * Here is where we set up to trap a gdb function call. NEW_esp + * will be changed if we are trying to do this. We handle both + * adding and subtracting, thus allowing gdb to put grung on + * the stack which it removes later. + */ + if (NEW_esp != OLD_esp) { + int *ptr = END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) + ptr -= (OLD_esp - NEW_esp) / sizeof (int); + *--ptr = linux_regs->eflags; + *--ptr = linux_regs->xcs; + *--ptr = linux_regs->eip; + *--ptr = linux_regs->ecx; + *--ptr = linux_regs->ebx; + *--ptr = linux_regs->eax; + linux_regs->ecx = NEW_esp - (sizeof (int) * 6); + linux_regs->ebx = (unsigned int) END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) { + linux_regs->eip = (unsigned int) fn_call_stub; + } else { + linux_regs->eip = (unsigned int) fn_rtn_stub; + linux_regs->eax = NEW_esp; + } + linux_regs->eflags &= ~(IF_BIT | TF_BIT); + } +#ifdef CONFIG_SMP + /* + * Release gdb wait locks + * Sanity check time. Must have at least one cpu to run. Also single + * step must not be done if the current cpu is on hold. + */ + if (spinlock_count == 1) { + int ss_hold = (regs.eflags & 0x100) && kgdb_info.hold_on_sstep; + int cpu_avail = 0; + int i; + + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!cpu_online(i)) + break; + if (!hold_cpu(i)) { + cpu_avail = 1; + } + } + /* + * Early in the bring up there will be NO cpus on line... + */ + if (!cpu_avail && !cpus_empty(cpu_online_map)) { + to_gdb("No cpus unblocked, see 'kgdb_info.hold_cpu'\n"); + goto once_again; + } + if (hold_cpu(smp_processor_id()) && (regs.eflags & 0x100)) { + to_gdb + ("Current cpu must be unblocked to single step\n"); + goto once_again; + } + if (!(ss_hold)) { + int i; + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!hold_cpu(i)) { + spin_unlock(&waitlocks[i]); + } + } + } else { + spin_unlock(&waitlocks[smp_processor_id()]); + } + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + /* + * If this cpu is on hold, this is where we + * do it. Note, the NMI will pull us out of here, + * but will return as the above lock is not held. + * We will stay here till another cpu releases the lock for us. + */ + spin_unlock_wait(waitlocks + smp_processor_id()); + kgdb_local_irq_restore(flags); + return (0); + } +#if 0 +exit_just_unlock: +#endif +#endif + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + kgdb_local_irq_restore(flags); + return (0); +} + +/* this function is used to set up exception handlers for tracing and + * breakpoints. + * This function is not needed as the above line does all that is needed. + * We leave it for backward compatitability... + */ +void +set_debug_traps(void) +{ + /* + * linux_debug_hook is defined in traps.c. We store a pointer + * to our own exception handler into it. + + * But really folks, every hear of labeled common, an old Fortran + * concept. Lots of folks can reference it and it is define if + * anyone does. Only one can initialize it at link time. We do + * this with the hook. See the statement above. No need for any + * executable code and it is ready as soon as the kernel is + * loaded. Very desirable in kernel debugging. + + linux_debug_hook = handle_exception ; + */ + + /* In case GDB is started before us, ack any packets (presumably + "$?#xx") sitting there. + putDebugChar ('+'); + + initialized = 1; + */ +} + +/* This function will generate a breakpoint exception. It is used at the + beginning of a program to sync up with a debugger and can be used + otherwise as a quick means to stop program execution and "break" into + the debugger. */ +/* But really, just use the BREAKPOINT macro. We will handle the int stuff + */ + +#ifdef later +/* + * possibly we should not go thru the traps.c code at all? Someday. + */ +void +do_kgdb_int3(struct pt_regs *regs, long error_code) +{ + kgdb_handle_exception(3, 5, error_code, regs); + return; +} +#endif +#undef regs +#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS +asmlinkage void +bad_sys_call_exit(int stuff) +{ + struct pt_regs *regs = (struct pt_regs *) &stuff; + printk("Sys call %d return with %x preempt_count\n", + (int) regs->orig_eax, preempt_count()); +} +#endif +#ifdef CONFIG_STACK_OVERFLOW_TEST +#include +asmlinkage void +stack_overflow(void) +{ +#ifdef BREAKPOINT + BREAKPOINT; +#else + printk("Kernel stack overflow, looping forever\n"); +#endif + while (1) { + } +} +#endif + +#if defined(CONFIG_SMP) || defined(CONFIG_KGDB_CONSOLE) +char gdbconbuf[BUFMAX]; + +static void +kgdb_gdb_message(const char *s, unsigned count) +{ + int i; + int wcount; + char *bufptr; + /* + * This takes care of NMI while spining out chars to gdb + */ + IF_SMP(in_kgdb_console = 1); + gdbconbuf[0] = 'O'; + bufptr = gdbconbuf + 1; + while (count > 0) { + if ((count << 1) > (BUFMAX - 2)) { + wcount = (BUFMAX - 2) >> 1; + } else { + wcount = count; + } + count -= wcount; + for (i = 0; i < wcount; i++) { + bufptr = pack_hex_byte(bufptr, s[i]); + } + *bufptr = '\0'; + s += wcount; + + putpacket(gdbconbuf); + + } + IF_SMP(in_kgdb_console = 0); +} +#endif +#ifdef CONFIG_SMP +static void +to_gdb(const char *s) +{ + int count = 0; + while (s[count] && (count++ < BUFMAX)) ; + kgdb_gdb_message(s, count); +} +#endif +#ifdef CONFIG_KGDB_CONSOLE +#include +#include +#include +#include +#include + +void +kgdb_console_write(struct console *co, const char *s, unsigned count) +{ + + if (gdb_i386vector == -1) { + /* + * We have not yet talked to gdb. What to do... + * lets break, on continue we can do the write. + * But first tell him whats up. Uh, well no can do, + * as this IS the console. Oh well... + * We do need to wait or the messages will be lost. + * Other option would be to tell the above code to + * ignore this breakpoint and do an auto return, + * but that might confuse gdb. Also this happens + * early enough in boot up that we don't have the traps + * set up yet, so... + */ + breakpoint(); + } + kgdb_gdb_message(s, count); +} + +/* + * ------------------------------------------------------------ + * Serial KGDB driver + * ------------------------------------------------------------ + */ + +static struct console kgdbcons = { + name:"kgdb", + write:kgdb_console_write, +#ifdef CONFIG_KGDB_USER_CONSOLE + device:kgdb_console_device, +#endif + flags:CON_PRINTBUFFER | CON_ENABLED, + index:-1, +}; + +/* + * The trick here is that this file gets linked before printk.o + * That means we get to peer at the console info in the command + * line before it does. If we are up, we register, otherwise, + * do nothing. By returning 0, we allow printk to look also. + */ +static int kgdb_console_enabled; + +int __init +kgdb_console_init(char *str) +{ + if ((strncmp(str, "kgdb", 4) == 0) || (strncmp(str, "gdb", 3) == 0)) { + register_console(&kgdbcons); + kgdb_console_enabled = 1; + } + return 0; /* let others look at the string */ +} + +__setup("console=", kgdb_console_init); + +#ifdef CONFIG_KGDB_USER_CONSOLE +static kdev_t kgdb_console_device(struct console *c); +/* This stuff sort of works, but it knocks out telnet devices + * we are leaving it here in case we (or you) find time to figure it out + * better.. + */ + +/* + * We need a real char device as well for when the console is opened for user + * space activities. + */ + +static int +kgdb_consdev_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static ssize_t +kgdb_consdev_write(struct file *file, const char *buf, + size_t count, loff_t * ppos) +{ + int size, ret = 0; + static char kbuf[128]; + static DECLARE_MUTEX(sem); + + /* We are not reentrant... */ + if (down_interruptible(&sem)) + return -ERESTARTSYS; + + while (count > 0) { + /* need to copy the data from user space */ + size = count; + if (size > sizeof (kbuf)) + size = sizeof (kbuf); + if (copy_from_user(kbuf, buf, size)) { + ret = -EFAULT; + break;; + } + kgdb_console_write(&kgdbcons, kbuf, size); + count -= size; + ret += size; + buf += size; + } + + up(&sem); + + return ret; +} + +struct file_operations kgdb_consdev_fops = { + open:kgdb_consdev_open, + write:kgdb_consdev_write +}; +static kdev_t +kgdb_console_device(struct console *c) +{ + return MKDEV(TTYAUX_MAJOR, 1); +} + +/* + * This routine gets called from the serial stub in the i386/lib + * This is so it is done late in bring up (just before the console open). + */ +void +kgdb_console_finit(void) +{ + if (kgdb_console_enabled) { + char *cptr = cdevname(MKDEV(TTYAUX_MAJOR, 1)); + char *cp = cptr; + while (*cptr && *cptr != '(') + cptr++; + *cptr = 0; + unregister_chrdev(TTYAUX_MAJOR, cp); + register_chrdev(TTYAUX_MAJOR, "kgdb", &kgdb_consdev_fops); + } +} +#endif +#endif +#ifdef CONFIG_KGDB_TS +#include /* time stamp code */ +#include /* in_interrupt */ +#ifdef CONFIG_KGDB_TS_64 +#define DATA_POINTS 64 +#endif +#ifdef CONFIG_KGDB_TS_128 +#define DATA_POINTS 128 +#endif +#ifdef CONFIG_KGDB_TS_256 +#define DATA_POINTS 256 +#endif +#ifdef CONFIG_KGDB_TS_512 +#define DATA_POINTS 512 +#endif +#ifdef CONFIG_KGDB_TS_1024 +#define DATA_POINTS 1024 +#endif +#ifndef DATA_POINTS +#define DATA_POINTS 128 /* must be a power of two */ +#endif +#define INDEX_MASK (DATA_POINTS - 1) +#if (INDEX_MASK & DATA_POINTS) +#error "CONFIG_KGDB_TS_COUNT must be a power of 2" +#endif +struct kgdb_and_then_struct { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + int data0; + int data1; +}; +struct kgdb_and_then_struct2 { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + struct task_struct *t1; + struct task_struct *t2; +}; +struct kgdb_and_then_struct kgdb_data[DATA_POINTS]; + +struct kgdb_and_then_struct *kgdb_and_then = &kgdb_data[0]; +int kgdb_and_then_count; + +void +kgdb_tstamp(int line, char *source, int data0, int data1) +{ + static spinlock_t ts_spin = SPIN_LOCK_UNLOCKED; + int flags; + kgdb_local_irq_save(flags); + spin_lock(&ts_spin); + rdtscll(kgdb_and_then->at_time); +#ifdef CONFIG_SMP + kgdb_and_then->on_cpu = smp_processor_id(); +#endif + kgdb_and_then->task = current; + kgdb_and_then->from_ln = line; + kgdb_and_then->in_src = source; + kgdb_and_then->from = __builtin_return_address(0); + kgdb_and_then->with_shpf = (int *) (((flags & IF_BIT) >> 9) | + (preempt_count() << 8)); + kgdb_and_then->data0 = data0; + kgdb_and_then->data1 = data1; + kgdb_and_then = &kgdb_data[++kgdb_and_then_count & INDEX_MASK]; + spin_unlock(&ts_spin); + kgdb_local_irq_restore(flags); +#ifdef CONFIG_PREEMPT + +#endif + return; +} +#endif +typedef int gdb_debug_hook(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs); +gdb_debug_hook *linux_debug_hook = &kgdb_handle_exception; /* histerical reasons... */ + +static int kgdb_need_breakpoint[NR_CPUS]; + +void kgdb_schedule_breakpoint(void) +{ + kgdb_need_breakpoint[smp_processor_id()] = 1; +} + +void kgdb_process_breakpoint(void) +{ + /* + * Handle a breakpoint queued from inside network driver code + * to avoid reentrancy issues + */ + if (kgdb_need_breakpoint[smp_processor_id()]) { + kgdb_need_breakpoint[smp_processor_id()] = 0; + BREAKPOINT; + } +} + --- linux-2.6.5-rc3/arch/i386/kernel/ldt.c 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/kernel/ldt.c 2004-03-31 00:56:34.390444800 -0800 @@ -2,7 +2,7 @@ * linux/kernel/ldt.c * * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar + * Copyright (C) 1999, 2003 Ingo Molnar */ #include @@ -18,6 +18,8 @@ #include #include #include +#include +#include #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ static void flush_ldt(void *null) @@ -29,34 +31,31 @@ static void flush_ldt(void *null) static int alloc_ldt(mm_context_t *pc, int mincount, int reload) { - void *oldldt; - void *newldt; - int oldsize; + int oldsize, newsize, i; if (mincount <= pc->size) return 0; + /* + * LDT got larger - reallocate if necessary. + */ oldsize = pc->size; mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - pc->ldt = newldt; - wmb(); + newsize = mincount*LDT_ENTRY_SIZE; + for (i = 0; i < newsize; i += PAGE_SIZE) { + int nr = i/PAGE_SIZE; + BUG_ON(i >= 64*1024); + if (!pc->ldt_pages[nr]) { + pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER); + if (!pc->ldt_pages[nr]) + return -ENOMEM; + clear_highpage(pc->ldt_pages[nr]); + } + } pc->size = mincount; - wmb(); - if (reload) { #ifdef CONFIG_SMP cpumask_t mask; + preempt_disable(); load_LDT(pc); mask = cpumask_of_cpu(smp_processor_id()); @@ -67,21 +66,20 @@ static int alloc_ldt(mm_context_t *pc, i load_LDT(pc); #endif } - if (oldsize) { - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } return 0; } static inline int copy_ldt(mm_context_t *new, mm_context_t *old) { - int err = alloc_ldt(new, old->size, 0); - if (err < 0) + int i, err, size = old->size, nr_pages = (size*LDT_ENTRY_SIZE + PAGE_SIZE-1)/PAGE_SIZE; + + err = alloc_ldt(new, size, 0); + if (err < 0) { + new->size = 0; return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + } + for (i = 0; i < nr_pages; i++) + copy_user_highpage(new->ldt_pages[i], old->ldt_pages[i], 0); return 0; } @@ -96,6 +94,7 @@ int init_new_context(struct task_struct init_MUTEX(&mm->context.sem); mm->context.size = 0; + memset(mm->context.ldt_pages, 0, sizeof(struct page *) * MAX_LDT_PAGES); old_mm = current->mm; if (old_mm && old_mm->context.size > 0) { down(&old_mm->context.sem); @@ -107,23 +106,21 @@ int init_new_context(struct task_struct /* * No need to lock the MM as we are the last user + * Do not touch the ldt register, we are already + * in the next thread. */ void destroy_context(struct mm_struct *mm) { - if (mm->context.size) { - if (mm == current->active_mm) - clear_LDT(); - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } + int i, nr_pages = (mm->context.size*LDT_ENTRY_SIZE + PAGE_SIZE-1) / PAGE_SIZE; + + for (i = 0; i < nr_pages; i++) + __free_page(mm->context.ldt_pages[i]); + mm->context.size = 0; } static int read_ldt(void __user * ptr, unsigned long bytecount) { - int err; + int err, i; unsigned long size; struct mm_struct * mm = current->mm; @@ -138,8 +135,25 @@ static int read_ldt(void __user * ptr, u size = bytecount; err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; + /* + * This is necessary just in case we got here straight from a + * context-switch where the ptes were set but no tlb flush + * was done yet. We rather avoid doing a TLB flush in the + * context-switch path and do it here instead. + */ + __flush_tlb_global(); + + for (i = 0; i < size; i += PAGE_SIZE) { + int nr = i / PAGE_SIZE, bytes; + char *kaddr = kmap(mm->context.ldt_pages[nr]); + + bytes = size - i; + if (bytes > PAGE_SIZE) + bytes = PAGE_SIZE; + if (copy_to_user(ptr + i, kaddr, size - i)) + err = -EFAULT; + kunmap(mm->context.ldt_pages[nr]); + } up(&mm->context.sem); if (err < 0) return err; @@ -158,7 +172,7 @@ static int read_default_ldt(void __user err = 0; address = &default_ldt[0]; - size = 5*sizeof(struct desc_struct); + size = 5*LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; @@ -200,7 +214,15 @@ static int write_ldt(void __user * ptr, goto out_unlock; } - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); + /* + * No rescheduling allowed from this point to the install. + * + * We do a TLB flush for the same reason as in the read_ldt() path. + */ + preempt_disable(); + __flush_tlb_global(); + lp = (__u32 *) ((ldt_info.entry_number << 3) + + (char *) __kmap_atomic_vaddr(KM_LDT_PAGE0)); /* Allow LDTs to be cleared by the user. */ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { @@ -221,6 +243,7 @@ install: *lp = entry_1; *(lp+1) = entry_2; error = 0; + preempt_enable(); out_unlock: up(&mm->context.sem); @@ -248,3 +271,26 @@ asmlinkage int sys_modify_ldt(int func, } return ret; } + +/* + * load one particular LDT into the current CPU + */ +void load_LDT_nolock(mm_context_t *pc, int cpu) +{ + struct page **pages = pc->ldt_pages; + int count = pc->size; + int nr_pages, i; + + if (likely(!count)) { + pages = &default_ldt_page; + count = 5; + } + nr_pages = (count*LDT_ENTRY_SIZE + PAGE_SIZE-1) / PAGE_SIZE; + + for (i = 0; i < nr_pages; i++) { + __kunmap_atomic_type(KM_LDT_PAGE0 - i); + __kmap_atomic(pages[i], KM_LDT_PAGE0 - i); + } + set_ldt_desc(cpu, (void *)__kmap_atomic_vaddr(KM_LDT_PAGE0), count); + load_LDT_desc(); +} --- linux-2.6.5-rc3/arch/i386/kernel/Makefile 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/kernel/Makefile 2004-03-31 00:56:34.390444800 -0800 @@ -7,13 +7,14 @@ extra-y := head.o init_task.o vmlinux.ld obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ - doublefault.o + doublefault.o entry_trampoline.o obj-y += cpu/ obj-y += timers/ obj-$(CONFIG_ACPI_BOOT) += acpi/ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca.o +obj-$(CONFIG_KGDB) += kgdb_stub.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o --- linux-2.6.5-rc3/arch/i386/kernel/mpparse.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/kernel/mpparse.c 2004-03-31 00:56:34.391444648 -0800 @@ -675,7 +675,7 @@ void __init get_smp_config (void) * Read the physical hardware table. Anything here will * override the defaults. */ - if (!smp_read_mpc((void *)mpf->mpf_physptr)) { + if (!smp_read_mpc((void *)phys_to_virt(mpf->mpf_physptr))) { smp_found_config = 0; printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); --- linux-2.6.5-rc3/arch/i386/kernel/nmi.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/i386/kernel/nmi.c 2004-03-30 20:43:36.996759352 -0800 @@ -31,7 +31,16 @@ #include #include +#ifdef CONFIG_KGDB +#include +#ifdef CONFIG_SMP +unsigned int nmi_watchdog = NMI_IO_APIC; +#else +unsigned int nmi_watchdog = NMI_LOCAL_APIC; +#endif +#else unsigned int nmi_watchdog = NMI_NONE; +#endif static unsigned int nmi_hz = HZ; unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ extern void show_registers(struct pt_regs *regs); @@ -408,6 +417,9 @@ void touch_nmi_watchdog (void) for (i = 0; i < NR_CPUS; i++) alert_counter[i] = 0; } +#ifdef CONFIG_KGDB +int tune_watchdog = 5*HZ; +#endif void nmi_watchdog_tick (struct pt_regs * regs) { @@ -421,12 +433,24 @@ void nmi_watchdog_tick (struct pt_regs * sum = irq_stat[cpu].apic_timer_irqs; +#ifdef CONFIG_KGDB + if (! in_kgdb(regs) && last_irq_sums[cpu] == sum ) { + +#else if (last_irq_sums[cpu] == sum) { +#endif /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ alert_counter[cpu]++; +#ifdef CONFIG_KGDB + if (alert_counter[cpu] == tune_watchdog) { + kgdb_handle_exception(2, SIGPWR, 0, regs); + last_irq_sums[cpu] = sum; + alert_counter[cpu] = 0; + } +#endif if (alert_counter[cpu] == 5*nmi_hz) { spin_lock(&nmi_print_lock); /* --- linux-2.6.5-rc3/arch/i386/kernel/process.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/kernel/process.c 2004-03-31 00:56:35.257313016 -0800 @@ -46,6 +46,7 @@ #include #include #include +#include #ifdef CONFIG_MATH_EMULATION #include #endif @@ -303,6 +304,9 @@ void flush_thread(void) struct task_struct *tsk = current; memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); +#ifdef CONFIG_X86_HIGH_ENTRY + clear_thread_flag(TIF_DB7); +#endif memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. @@ -316,9 +320,8 @@ void release_thread(struct task_struct * if (dead_task->mm) { // temporary debugging check if (dead_task->mm->context.size) { - printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", + printk("WARNING: dead process %8s still has LDT? <%d>\n", dead_task->comm, - dead_task->mm->context.ldt, dead_task->mm->context.size); BUG(); } @@ -342,7 +345,7 @@ int copy_thread(int nr, unsigned long cl { struct pt_regs * childregs; struct task_struct *tsk; - int err; + int err, i; childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; struct_cpy(childregs, regs); @@ -353,7 +356,18 @@ int copy_thread(int nr, unsigned long cl p->thread.esp = (unsigned long) childregs; p->thread.esp0 = (unsigned long) (childregs+1); + /* + * get the two stack pages, for the virtual stack. + * + * IMPORTANT: this code relies on the fact that the task + * structure is an THREAD_SIZE aligned piece of physical memory. + */ + for (i = 0; i < ARRAY_SIZE(p->thread.stack_page); i++) + p->thread.stack_page[i] = + virt_to_page((unsigned long)p->thread_info + (i*PAGE_SIZE)); + p->thread.eip = (unsigned long) ret_from_fork; + p->thread_info->real_stack = p->thread_info; savesegment(fs,p->thread.fs); savesegment(gs,p->thread.gs); @@ -505,10 +519,42 @@ struct task_struct fastcall * __switch_t __unlazy_fpu(prev_p); +#ifdef CONFIG_X86_HIGH_ENTRY +{ + int i; + /* + * Set the ptes of the virtual stack. (NOTE: a one-page TLB flush is + * needed because otherwise NMIs could interrupt the + * user-return code with a virtual stack and stale TLBs.) + */ + for (i = 0; i < ARRAY_SIZE(next->stack_page); i++) { + __kunmap_atomic_type(KM_VSTACK_TOP-i); + __kmap_atomic(next->stack_page[i], KM_VSTACK_TOP-i); + } + /* + * NOTE: here we rely on the task being the stack as well + */ + next_p->thread_info->virtual_stack = + (void *)__kmap_atomic_vaddr(KM_VSTACK_TOP); +} +#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) + /* + * If next was preempted on entry from userspace to kernel, + * and now it's on a different cpu, we need to adjust %esp. + * This assumes that entry.S does not copy %esp while on the + * virtual stack (with interrupts enabled): which is so, + * except within __SWITCH_KERNELSPACE itself. + */ + if (unlikely(next->esp >= TASK_SIZE)) { + next->esp &= THREAD_SIZE - 1; + next->esp |= (unsigned long) next_p->thread_info->virtual_stack; + } +#endif +#endif /* * Reload esp0, LDT and the page table pointer: */ - load_esp0(tss, next); + load_virtual_esp0(tss, next_p); /* * Load the per-thread Thread-Local Storage descriptor. --- linux-2.6.5-rc3/arch/i386/kernel/ptrace.c 2003-06-14 12:18:05.000000000 -0700 +++ 25/arch/i386/kernel/ptrace.c 2004-03-31 00:54:31.294158280 -0800 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -524,6 +525,15 @@ out: __attribute__((regparm(3))) void do_syscall_trace(struct pt_regs *regs, int entryexit) { + if (unlikely(current->audit_context)) { + if (!entryexit) + audit_syscall_entry(current, regs->orig_eax, + regs->ebx, regs->ecx, + regs->edx, regs->esi); + else + audit_syscall_exit(current, regs->eax); + } + if (!test_thread_flag(TIF_SYSCALL_TRACE)) return; if (!(current->ptrace & PT_PTRACED)) --- linux-2.6.5-rc3/arch/i386/kernel/reboot.c 2004-01-09 00:04:30.000000000 -0800 +++ 25/arch/i386/kernel/reboot.c 2004-03-31 00:56:34.393444344 -0800 @@ -155,12 +155,11 @@ void machine_real_restart(unsigned char CMOS_WRITE(0x00, 0x8f); spin_unlock_irqrestore(&rtc_lock, flags); - /* Remap the kernel at virtual address zero, as well as offset zero - from the kernel segment. This assumes the kernel segment starts at - virtual address PAGE_OFFSET. */ - - memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS); + /* + * Remap the first 16 MB of RAM (which includes the kernel image) + * at virtual address zero: + */ + setup_identity_mappings(swapper_pg_dir, 0, LOW_MAPPINGS_SIZE); /* * Use `swapper_pg_dir' as our page directory. --- linux-2.6.5-rc3/arch/i386/kernel/setup.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/kernel/setup.c 2004-03-31 00:56:35.428287024 -0800 @@ -47,8 +47,8 @@ #include #include #include +#include #include "setup_arch_pre.h" -#include "mach_resources.h" /* This value is set up by the early boot code to point to the value immediately after the boot time page tables. It contains a *physical* @@ -127,27 +127,11 @@ unsigned long saved_videomode; #define RAMDISK_PROMPT_FLAG 0x8000 #define RAMDISK_LOAD_FLAG 0x4000 -static char command_line[COMMAND_LINE_SIZE]; - char saved_command_line[COMMAND_LINE_SIZE]; - unsigned char __initdata boot_params[PARAM_SIZE]; static struct resource code_resource = { "Kernel code", 0x100000, 0 }; static struct resource data_resource = { "Kernel data", 0, 0 }; -static void __init probe_roms(void) -{ - int roms = 1; - - request_resource(&iomem_resource, rom_resources+0); - - /* Video ROM is standard at C000:0000 - C7FF:0000, check signature */ - probe_video_rom(roms); - - /* Extension roms */ - probe_extension_roms(roms); -} - static void __init limit_regions(unsigned long long size) { unsigned long long current_addr = 0; @@ -484,165 +468,177 @@ static void __init setup_memory_region(v print_memory_map(who); } /* setup_memory_region */ +/* + * "mem=nopentium" disables the 4MB page tables. + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM + * to , overriding the bios size. + * + * HPA tells me bootloaders need to parse mem=, so no new + * option should be mem= [also see Documentation/i386/boot.txt] + */ +static int __init early_mem(char *from) +{ + if (!memcmp(from, "nopentium", 9)) { + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + disable_pse = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. + */ + unsigned long long mem_size; + + mem_size = memparse(from, &from); + limit_regions(mem_size); + printk(KERN_INFO "user-defined physical RAM map:\n"); + print_memory_map("user"); + } -static void __init parse_cmdline_early (char ** cmdline_p) + return 0; +} +__early_param("mem=", early_mem); + +/* + * "memmap=XXX[KkmM][@#$]XXX[KkmM]" defines a memory region from + * to +, overriding the bios size. + */ +static int __init early_memmap(char *from) { - char c = ' ', *to = command_line, *from = saved_command_line; - int len = 0; int userdef = 0; - /* Save unparsed command line copy for /proc/cmdline */ - saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; - - for (;;) { - /* - * "mem=nopentium" disables the 4MB page tables. - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM - * to , overriding the bios size. - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from - * to +, overriding the bios size. - * - * HPA tells me bootloaders need to parse mem=, so no new - * option should be mem= [also see Documentation/i386/boot.txt] + if (!memcmp(from, "exactmap", 8)) { + e820.nr_map = 0; + userdef = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. */ - if (c == ' ' && !memcmp(from, "mem=", 4)) { - if (to != command_line) - to--; - if (!memcmp(from+4, "nopentium", 9)) { - from += 9+4; - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); - disable_pse = 1; - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. - */ - unsigned long long mem_size; - - mem_size = memparse(from+4, &from); - limit_regions(mem_size); - userdef=1; - } + unsigned long long start_at, mem_size; + mem_size = memparse(from, &from); + if (*from == '@') { + start_at = memparse(from + 1, &from); + add_memory_region(start_at, mem_size, E820_RAM); + } else if (*from == '#') { + start_at = memparse(from + 1, &from); + add_memory_region(start_at, mem_size, E820_ACPI); + } else if (*from == '$') { + start_at = memparse(from + 1, &from); + add_memory_region(start_at, mem_size, E820_RESERVED); + } else { + limit_regions(mem_size); + userdef = 1; } + } - if (c == ' ' && !memcmp(from, "memmap=", 7)) { - if (to != command_line) - to--; - if (!memcmp(from+7, "exactmap", 8)) { - from += 8+7; - e820.nr_map = 0; - userdef = 1; - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. - */ - unsigned long long start_at, mem_size; - - mem_size = memparse(from+7, &from); - if (*from == '@') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_RAM); - } else if (*from == '#') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_ACPI); - } else if (*from == '$') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_RESERVED); - } else { - limit_regions(mem_size); - userdef=1; - } - } - } + if (userdef) { + printk(KERN_INFO "user-defined physical RAM map:\n"); + print_memory_map("user"); + } + + return 0; +} +__early_param("memmap=", early_memmap); #ifdef CONFIG_SMP - /* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. - */ - else if (!memcmp(from, "maxcpus=", 8)) { - extern unsigned int maxcpus; +/* + * If the BIOS enumerates physical processors before logical, + * maxcpus=N at enumeration-time can be used to disable HT. + */ +static int __init early_maxcpus(char *from) +{ + extern unsigned int maxcpus; - maxcpus = simple_strtoul(from + 8, NULL, 0); - } + maxcpus = simple_strtoul(from, NULL, 0); + + return 0; +} +__early_param("maxcpus=", early_maxcpus); #endif + #ifdef CONFIG_ACPI_BOOT - /* "acpi=off" disables both ACPI table parsing and interpreter */ - else if (!memcmp(from, "acpi=off", 8)) { +static int __init early_acpi(char *from) +{ + /* "off" disables both ACPI table parsing and interpreter */ + if (!memcmp(from, "off", 3)) + disable_acpi(); + + /* "force" to over-ride black-list */ + else if (!memcmp(from, "force", 5)) { + acpi_force = 1; + acpi_ht = 1; + acpi_disabled = 0; + } + + /* "strict" disables out-of-spec workarounds */ + else if (!memcmp(from, "strict", 6)) + acpi_strict = 1; + + /* Limit ACPI just to boot-time to enable HT */ + else if (!memcmp(from, "ht", 2)) { + if (!acpi_force) disable_acpi(); - } - - /* acpi=force to over-ride black-list */ - else if (!memcmp(from, "acpi=force", 10)) { - acpi_force = 1; - acpi_ht = 1; - acpi_disabled = 0; - } + acpi_ht = 1; + } - /* acpi=strict disables out-of-spec workarounds */ - else if (!memcmp(from, "acpi=strict", 11)) { - acpi_strict = 1; - } + return 0; +} +__early_param("acpi=", early_acpi); - /* Limit ACPI just to boot-time to enable HT */ - else if (!memcmp(from, "acpi=ht", 7)) { - if (!acpi_force) - disable_acpi(); - acpi_ht = 1; - } +static int __init early_pci_noacpi(char *ign) +{ + acpi_noirq_set(); - /* "pci=noacpi" disables ACPI interrupt routing */ - else if (!memcmp(from, "pci=noacpi", 10)) { - acpi_noirq_set(); - } + return 0; +} +__early_param("pci=noacpi", early_pci_noacpi); - else if (!memcmp(from, "acpi_sci=edge", 13)) - acpi_sci_flags.trigger = 1; +static int __init early_acpi_sci(char *from) +{ + if (!memcmp(from, "edge", 4)) + acpi_sci_flags.trigger = 1; - else if (!memcmp(from, "acpi_sci=level", 14)) - acpi_sci_flags.trigger = 3; + else if (!memcmp(from, "level", 5)) + acpi_sci_flags.trigger = 3; - else if (!memcmp(from, "acpi_sci=high", 13)) - acpi_sci_flags.polarity = 1; + else if (!memcmp(from, "high", 4)) + acpi_sci_flags.polarity = 1; - else if (!memcmp(from, "acpi_sci=low", 12)) - acpi_sci_flags.polarity = 3; + else if (!memcmp(from, "low", 3)) + acpi_sci_flags.polarity = 3; + return 0; +} +__early_param("acpi_sci=", early_acpi_sci); #ifdef CONFIG_X86_LOCAL_APIC - /* disable IO-APIC */ - else if (!memcmp(from, "noapic", 6)) - disable_ioapic_setup(); +/* Disable IO-APIC. */ +static int __init early_disable_ioapic(char *from) +{ + disable_ioapic_setup(); + + return 0; +} +__early_param("noapic", early_disable_ioapic); #endif /* CONFIG_X86_LOCAL_APIC */ #endif /* CONFIG_ACPI_BOOT */ - /* - * highmem=size forces highmem to be exactly 'size' bytes. - * This works even on boxes that have no highmem otherwise. - * This also works to reduce highmem size on bigger boxes. - */ - if (c == ' ' && !memcmp(from, "highmem=", 8)) - highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT; - - c = *(from++); - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *(to++) = c; - } - *to = '\0'; - *cmdline_p = command_line; - if (userdef) { - printk(KERN_INFO "user-defined physical RAM map:\n"); - print_memory_map("user"); - } +/* + * highmem=size forces highmem to be exactly 'size' bytes. + * This works even on boxes that have no highmem otherwise. + * This also works to reduce highmem size on bigger boxes. + */ +static int __init early_highmem_size(char *from) +{ + highmem_pages = memparse(from, &from) >> PAGE_SHIFT; + + return 0; } +__early_param("highmem=", early_highmem_size); /* * Callback for efi_memory_walk. @@ -951,19 +947,17 @@ legacy_init_iomem_resources(struct resou static void __init register_memory(unsigned long max_low_pfn) { unsigned long low_mem_size; - int i; if (efi_enabled) efi_initialize_iomem_resources(&code_resource, &data_resource); else legacy_init_iomem_resources(&code_resource, &data_resource); - /* EFI systems may still have VGA */ + /* EFI systems may still have VGA */ request_graphics_resource(); /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < STANDARD_IO_RESOURCES; i++) - request_resource(&ioport_resource, standard_io_resources+i); + request_standard_io_resources(); /* Tell the PCI layer not to allocate too close to the RAM area.. */ low_mem_size = ((max_low_pfn << PAGE_SHIFT) + 0xfffff) & ~0xfffff; @@ -1087,6 +1081,7 @@ __setup("noreplacement", noreplacement_s */ void __init setup_arch(char **cmdline_p) { + extern char saved_command_line[]; unsigned long max_low_pfn; memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); @@ -1145,10 +1140,12 @@ void __init setup_arch(char **cmdline_p) data_resource.start = virt_to_phys(_etext); data_resource.end = virt_to_phys(_edata)-1; - parse_cmdline_early(cmdline_p); - + /* We have the original cmdline stored here already. */ + *cmdline_p = saved_command_line; + parse_early_options(cmdline_p); max_low_pfn = setup_memory(); + early_identify_cpu(&boot_cpu_data); /* * NOTE: before this point _nobody_ is allowed to allocate * any memory using the bootmem allocator. @@ -1159,24 +1156,8 @@ void __init setup_arch(char **cmdline_p) #endif paging_init(); -#ifdef CONFIG_EARLY_PRINTK - { - char *s = strstr(*cmdline_p, "earlyprintk="); - if (s) { - extern void setup_early_printk(char *); - - setup_early_printk(s); - printk("early console enabled\n"); - } - } -#endif - - dmi_scan_machine(); -#ifdef CONFIG_X86_GENERICARCH - generic_apic_probe(*cmdline_p); -#endif if (efi_enabled) efi_map_memmap(); --- linux-2.6.5-rc3/arch/i386/kernel/signal.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/i386/kernel/signal.c 2004-03-31 00:56:34.395444040 -0800 @@ -128,28 +128,29 @@ sys_sigaltstack(const stack_t __user *us */ static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax) +restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *__sc, int *peax) { - unsigned int err = 0; + struct sigcontext scratch; /* 88 bytes of scratch area */ /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; -#define COPY(x) err |= __get_user(regs->x, &sc->x) + if (copy_from_user(&scratch, __sc, sizeof(scratch))) + return -EFAULT; + +#define COPY(x) regs->x = scratch.x #define COPY_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ + { unsigned short tmp = scratch.seg; \ regs->x##seg = tmp; } #define COPY_SEG_STRICT(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ + { unsigned short tmp = scratch.seg; \ regs->x##seg = tmp|3; } #define GET_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ + { unsigned short tmp = scratch.seg; \ loadsegment(seg,tmp); } GET_SEG(gs); @@ -168,27 +169,23 @@ restore_sigcontext(struct pt_regs *regs, COPY_SEG_STRICT(ss); { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); + unsigned int tmpflags = scratch.eflags; regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); regs->orig_eax = -1; /* disable syscall checks */ } { - struct _fpstate __user * buf; - err |= __get_user(buf, &sc->fpstate); + struct _fpstate * buf = scratch.fpstate; if (buf) { if (verify_area(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387(buf); + return -EFAULT; + if (restore_i387(buf)) + return -EFAULT; } } - err |= __get_user(*peax, &sc->eax); - return err; - -badframe: - return 1; + *peax = scratch.eax; + return 0; } asmlinkage int sys_sigreturn(unsigned long __unused) @@ -266,53 +263,54 @@ badframe: */ static int -setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, +setup_sigcontext(struct sigcontext __user *__sc, struct _fpstate __user *fpstate, struct pt_regs *regs, unsigned long mask) { - int tmp, err = 0; + struct sigcontext sc; /* 88 bytes of scratch area */ + int tmp; tmp = 0; __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); - err |= __put_user(tmp, (unsigned int *)&sc->gs); + *(unsigned int *)&sc.gs = tmp; __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); - err |= __put_user(tmp, (unsigned int *)&sc->fs); - - err |= __put_user(regs->xes, (unsigned int *)&sc->es); - err |= __put_user(regs->xds, (unsigned int *)&sc->ds); - err |= __put_user(regs->edi, &sc->edi); - err |= __put_user(regs->esi, &sc->esi); - err |= __put_user(regs->ebp, &sc->ebp); - err |= __put_user(regs->esp, &sc->esp); - err |= __put_user(regs->ebx, &sc->ebx); - err |= __put_user(regs->edx, &sc->edx); - err |= __put_user(regs->ecx, &sc->ecx); - err |= __put_user(regs->eax, &sc->eax); - err |= __put_user(current->thread.trap_no, &sc->trapno); - err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user(regs->eip, &sc->eip); - err |= __put_user(regs->xcs, (unsigned int *)&sc->cs); - err |= __put_user(regs->eflags, &sc->eflags); - err |= __put_user(regs->esp, &sc->esp_at_signal); - err |= __put_user(regs->xss, (unsigned int *)&sc->ss); + *(unsigned int *)&sc.fs = tmp; + *(unsigned int *)&sc.es = regs->xes; + *(unsigned int *)&sc.ds = regs->xds; + sc.edi = regs->edi; + sc.esi = regs->esi; + sc.ebp = regs->ebp; + sc.esp = regs->esp; + sc.ebx = regs->ebx; + sc.edx = regs->edx; + sc.ecx = regs->ecx; + sc.eax = regs->eax; + sc.trapno = current->thread.trap_no; + sc.err = current->thread.error_code; + sc.eip = regs->eip; + *(unsigned int *)&sc.cs = regs->xcs; + sc.eflags = regs->eflags; + sc.esp_at_signal = regs->esp; + *(unsigned int *)&sc.ss = regs->xss; tmp = save_i387(fpstate); if (tmp < 0) - err = 1; - else - err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate); + return 1; + sc.fpstate = tmp ? fpstate : NULL; /* non-iBCS2 extensions.. */ - err |= __put_user(mask, &sc->oldmask); - err |= __put_user(current->thread.cr2, &sc->cr2); + sc.oldmask = mask; + sc.cr2 = current->thread.cr2; - return err; + if (copy_to_user(__sc, &sc, sizeof(sc))) + return 1; + return 0; } /* * Determine which stack to use.. */ static inline void __user * -get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) +get_sigframe(struct k_sigaction *ka_copy, struct pt_regs * regs, size_t frame_size) { unsigned long esp; @@ -320,16 +318,16 @@ get_sigframe(struct k_sigaction *ka, str esp = regs->esp; /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { + if (ka_copy->sa.sa_flags & SA_ONSTACK) { if (sas_ss_flags(esp) == 0) esp = current->sas_ss_sp + current->sas_ss_size; } /* This is the legacy signal stack switching. */ else if ((regs->xss & 0xffff) != __USER_DS && - !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) { - esp = (unsigned long) ka->sa.sa_restorer; + !(ka_copy->sa.sa_flags & SA_RESTORER) && + ka_copy->sa.sa_restorer) { + esp = (unsigned long) ka_copy->sa.sa_restorer; } return (void __user *)((esp - frame_size) & -8ul); @@ -339,14 +337,14 @@ get_sigframe(struct k_sigaction *ka, str See vsyscall-sigreturn.S. */ extern void __kernel_sigreturn, __kernel_rt_sigreturn; -static void setup_frame(int sig, struct k_sigaction *ka, +static void setup_frame(int sig, struct k_sigaction *ka_copy, sigset_t *set, struct pt_regs * regs) { void *restorer; struct sigframe __user *frame; int err = 0; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka_copy, regs, sizeof(*frame)); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; @@ -372,8 +370,8 @@ static void setup_frame(int sig, struct goto give_sigsegv; restorer = &__kernel_sigreturn; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; + if (ka_copy->sa.sa_flags & SA_RESTORER) + restorer = ka_copy->sa.sa_restorer; /* Set up to return from userspace. */ err |= __put_user(restorer, &frame->pretcode); @@ -394,7 +392,7 @@ static void setup_frame(int sig, struct /* Set up registers for signal handler */ regs->esp = (unsigned long) frame; - regs->eip = (unsigned long) ka->sa.sa_handler; + regs->eip = (unsigned long) ka_copy->sa.sa_handler; set_fs(USER_DS); regs->xds = __USER_DS; @@ -412,18 +410,18 @@ static void setup_frame(int sig, struct give_sigsegv: if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; + current->sighand->action[sig-1].sa.sa_handler = SIG_DFL; force_sig(SIGSEGV, current); } -static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +static void setup_rt_frame(int sig, struct k_sigaction *ka_copy, siginfo_t *info, sigset_t *set, struct pt_regs * regs) { void *restorer; struct rt_sigframe __user *frame; int err = 0; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka_copy, regs, sizeof(*frame)); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; @@ -443,7 +441,7 @@ static void setup_rt_frame(int sig, stru /* Create the ucontext. */ err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + err |= __put_user(current->sas_ss_sp, (unsigned long *)&frame->uc.uc_stack.ss_sp); err |= __put_user(sas_ss_flags(regs->esp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); @@ -455,8 +453,8 @@ static void setup_rt_frame(int sig, stru /* Set up to return from userspace. */ restorer = &__kernel_rt_sigreturn; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; + if (ka_copy->sa.sa_flags & SA_RESTORER) + restorer = ka_copy->sa.sa_restorer; err |= __put_user(restorer, &frame->pretcode); /* @@ -475,7 +473,7 @@ static void setup_rt_frame(int sig, stru /* Set up registers for signal handler */ regs->esp = (unsigned long) frame; - regs->eip = (unsigned long) ka->sa.sa_handler; + regs->eip = (unsigned long) ka_copy->sa.sa_handler; set_fs(USER_DS); regs->xds = __USER_DS; @@ -493,7 +491,7 @@ static void setup_rt_frame(int sig, stru give_sigsegv: if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; + current->sighand->action[sig-1].sa.sa_handler = SIG_DFL; force_sig(SIGSEGV, current); } @@ -502,11 +500,9 @@ give_sigsegv: */ static void -handle_signal(unsigned long sig, siginfo_t *info, sigset_t *oldset, - struct pt_regs * regs) +handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka_copy, + sigset_t *oldset, struct pt_regs * regs) { - struct k_sigaction *ka = ¤t->sighand->action[sig-1]; - /* Are we from a system call? */ if (regs->orig_eax >= 0) { /* If so, check system call restarting.. */ @@ -517,7 +513,7 @@ handle_signal(unsigned long sig, siginfo break; case -ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { + if (!(ka_copy->sa.sa_flags & SA_RESTART)) { regs->eax = -EINTR; break; } @@ -529,17 +525,14 @@ handle_signal(unsigned long sig, siginfo } /* Set up the stack frame */ - if (ka->sa.sa_flags & SA_SIGINFO) - setup_rt_frame(sig, ka, info, oldset, regs); + if (ka_copy->sa.sa_flags & SA_SIGINFO) + setup_rt_frame(sig, ka_copy, info, oldset, regs); else - setup_frame(sig, ka, oldset, regs); - - if (ka->sa.sa_flags & SA_ONESHOT) - ka->sa.sa_handler = SIG_DFL; + setup_frame(sig, ka_copy, oldset, regs); - if (!(ka->sa.sa_flags & SA_NODEFER)) { + if (!(ka_copy->sa.sa_flags & SA_NODEFER)) { spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); + sigorsets(¤t->blocked,¤t->blocked,&ka_copy->sa.sa_mask); sigaddset(¤t->blocked,sig); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); @@ -555,6 +548,7 @@ int fastcall do_signal(struct pt_regs *r { siginfo_t info; int signr; + struct k_sigaction ka_copy; /* * We want the common case to go fast, which @@ -573,7 +567,7 @@ int fastcall do_signal(struct pt_regs *r if (!oldset) oldset = ¤t->blocked; - signr = get_signal_to_deliver(&info, regs, NULL); + signr = get_signal_to_deliver(&info, &ka_copy, regs, NULL); if (signr > 0) { /* Reenable any watchpoints before delivering the * signal to user space. The processor register will @@ -583,7 +577,7 @@ int fastcall do_signal(struct pt_regs *r __asm__("movl %0,%%db7" : : "r" (current->thread.debugreg[7])); /* Whee! Actually deliver the signal. */ - handle_signal(signr, &info, oldset, regs); + handle_signal(signr, &info, &ka_copy, oldset, regs); return 1; } --- linux-2.6.5-rc3/arch/i386/kernel/smpboot.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/i386/kernel/smpboot.c 2004-03-31 00:56:32.881674168 -0800 @@ -39,6 +39,7 @@ #include #include +#include #include #include #include @@ -522,7 +523,7 @@ static inline void unmap_cpu_to_node(int printk("Unmapping cpu %d from all nodes\n", cpu); for (node = 0; node < MAX_NUMNODES; node ++) cpu_clear(cpu, node_2_cpu_mask[node]); - cpu_2_node[cpu] = -1; + cpu_2_node[cpu] = 0; } #else /* !CONFIG_NUMA */ @@ -815,6 +816,8 @@ static int __init do_boot_cpu(int apicid /* Stack for startup_32 can be just as for start_secondary onwards */ stack_start.esp = (void *) idle->thread.esp; + irq_ctx_init(cpu); + /* * This grunge runs the startup process for * the targeted processor. @@ -934,7 +937,7 @@ static int boot_cpu_logical_apicid; /* Where the IO area was mapped on multiquad, always 0 otherwise */ void *xquad_portio; -int cpu_sibling_map[NR_CPUS] __cacheline_aligned; +cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; static void __init smp_boot_cpus(unsigned int max_cpus) { @@ -953,6 +956,8 @@ static void __init smp_boot_cpus(unsigne current_thread_info()->cpu = 0; smp_tune_scheduling(); + cpus_clear(cpu_sibling_map[0]); + cpu_set(0, cpu_sibling_map[0]); /* * If we couldn't find an SMP configuration at boot time, @@ -1079,32 +1084,34 @@ static void __init smp_boot_cpus(unsigne Dprintk("Boot done.\n"); /* - * If Hyper-Threading is avaialble, construct cpu_sibling_map[], so - * that we can tell the sibling CPU efficiently. + * construct cpu_sibling_map[], so that we can tell sibling CPUs + * efficiently. */ - if (cpu_has_ht && smp_num_siblings > 1) { - for (cpu = 0; cpu < NR_CPUS; cpu++) - cpu_sibling_map[cpu] = NO_PROC_ID; - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - int i; - if (!cpu_isset(cpu, cpu_callout_map)) - continue; + for (cpu = 0; cpu < NR_CPUS; cpu++) + cpus_clear(cpu_sibling_map[cpu]); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + int siblings = 0; + int i; + if (!cpu_isset(cpu, cpu_callout_map)) + continue; + if (smp_num_siblings > 1) { for (i = 0; i < NR_CPUS; i++) { - if (i == cpu || !cpu_isset(i, cpu_callout_map)) + if (!cpu_isset(i, cpu_callout_map)) continue; if (phys_proc_id[cpu] == phys_proc_id[i]) { - cpu_sibling_map[cpu] = i; - printk("cpu_sibling_map[%d] = %d\n", cpu, cpu_sibling_map[cpu]); - break; + siblings++; + cpu_set(i, cpu_sibling_map[cpu]); } } - if (cpu_sibling_map[cpu] == NO_PROC_ID) { - smp_num_siblings = 1; - printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu); - } + } else { + siblings++; + cpu_set(cpu, cpu_sibling_map[cpu]); } + + if (siblings != smp_num_siblings) + printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings); } smpboot_setup_io_apic(); @@ -1118,6 +1125,256 @@ static void __init smp_boot_cpus(unsigne synchronize_tsc_bp(); } +#ifdef CONFIG_SCHED_SMT +#ifdef CONFIG_NUMA +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static DEFINE_PER_CPU(struct sched_domain, node_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first = NULL, *last = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + struct sched_domain *node_domain = &per_cpu(node_domains, i); + int node = cpu_to_node(i); + cpumask_t nodemask = node_to_cpumask(node); + + *cpu_domain = SD_SIBLING_INIT; + cpu_domain->span = cpu_sibling_map[i]; + cpu_domain->cache_hot_time = cacheflush_time / 2; + cpu_domain->parent = phys_domain; + cpu_domain->groups = &sched_group_cpus[i]; + + *phys_domain = SD_CPU_INIT; + phys_domain->span = nodemask; + phys_domain->cache_hot_time = cacheflush_time / 2; + phys_domain->parent = node_domain; + phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; + + *node_domain = SD_NODE_INIT; + node_domain->span = cpu_possible_map; + node_domain->cache_hot_time = cacheflush_time; + node_domain->groups = &sched_group_nodes[cpu_to_node(i)]; + } + + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + int j; + first = last = NULL; + + if (i != first_cpu(cpu_domain->span)) + continue; + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpu->cpumask = CPU_MASK_NONE; + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; + } + last->next = first; + } + + for (i = 0; i < MAX_NUMNODES; i++) { + int j; + cpumask_t nodemask; + struct sched_group *node = &sched_group_nodes[i]; + cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + first = last = NULL; + /* Set up physical groups */ + for_each_cpu_mask(j, nodemask) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, j); + struct sched_group *cpu = &sched_group_phys[j]; + + if (j != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* + * Make each extra sibling increase power by 10% of + * the basic CPU. This is very arbitrary. + */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + node->cpu_power += cpu->cpu_power; + + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; + } + last->next = first; + } + + /* Set up nodes */ + first = last = NULL; + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *cpu = &sched_group_nodes[i]; + cpumask_t nodemask; + cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + cpu->cpumask = nodemask; + /* ->cpu_power already setup */ + + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; + } + last->next = first; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_domain, i); + } +} +#else /* !CONFIG_NUMA */ +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first = NULL, *last = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + + *cpu_domain = SD_SIBLING_INIT; + cpu_domain->span = cpu_sibling_map[i]; + cpu_domain->cache_hot_time = cacheflush_time / 2; + cpu_domain->parent = phys_domain; + cpu_domain->groups = &sched_group_cpus[i]; + + *phys_domain = SD_CPU_INIT; + phys_domain->span = cpu_possible_map; + phys_domain->cache_hot_time = cacheflush_time / 2; + phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; + } + + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + int j; + first = last = NULL; + + if (i != first_cpu(cpu_domain->span)) + continue; + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; + } + last->next = first; + } + + first = last = NULL; + /* Set up physical groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + struct sched_group *cpu = &sched_group_phys[i]; + + if (i != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* See SMT+NUMA setup for comment */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; + } + last->next = first; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_domain, i); + } +} +#endif /* CONFIG_NUMA */ +#else /* !CONFIG_SCHED_SMT */ + +static struct sched_group sched_group_cpus[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); + +void __init arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + + *cpu_sd = SD_CPU_INIT; + cpu_sd->span = cpu_possible_map; + cpu_sd->cache_hot_time = cacheflush_time / 2; + cpu_sd->groups = &sched_group_cpus[i]; + } + + /* Set up CPU groups */ + for_each_cpu_mask(i, cpu_possible_map) { + struct sched_group *cpu = &sched_group_cpus[i]; + + cpus_clear(cpu->cpumask); + cpu_set(i, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); /* domains were modified outside the lock */ + for_each_cpu(i) { + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_sd, i); + } +} +#endif + /* These are wrappers to interface to the new boot process. Someone who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ void __init smp_prepare_cpus(unsigned int max_cpus) --- linux-2.6.5-rc3/arch/i386/kernel/smp.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/i386/kernel/smp.c 2004-03-31 00:56:34.395444040 -0800 @@ -327,10 +327,12 @@ asmlinkage void smp_invalidate_interrupt if (flush_mm == cpu_tlbstate[cpu].active_mm) { if (cpu_tlbstate[cpu].state == TLBSTATE_OK) { +#ifndef CONFIG_X86_SWITCH_PAGETABLES if (flush_va == FLUSH_ALL) local_flush_tlb(); else __flush_tlb_one(flush_va); +#endif } else leave_mm(cpu); } @@ -396,21 +398,6 @@ static void flush_tlb_others(cpumask_t c spin_unlock(&tlbstate_lock); } -void flush_tlb_current_task(void) -{ - struct mm_struct *mm = current->mm; - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - local_flush_tlb(); - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); - preempt_enable(); -} - void flush_tlb_mm (struct mm_struct * mm) { cpumask_t cpu_mask; @@ -442,7 +429,10 @@ void flush_tlb_page(struct vm_area_struc if (current->active_mm == mm) { if(current->mm) - __flush_tlb_one(va); +#ifndef CONFIG_X86_SWITCH_PAGETABLES + __flush_tlb_one(va) +#endif + ; else leave_mm(smp_processor_id()); } @@ -466,7 +456,17 @@ void flush_tlb_all(void) { on_each_cpu(do_flush_tlb_all, 0, 1, 1); } - +#ifdef CONFIG_KGDB +/* + * By using the NMI code instead of a vector we just sneak thru the + * word generator coming out with just what we want. AND it does + * not matter if clustered_apic_mode is set or not. + */ +void smp_send_nmi_allbutself(void) +{ + send_IPI_allbutself(APIC_DM_NMI); +} +#endif /* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing --- linux-2.6.5-rc3/arch/i386/kernel/sysenter.c 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/kernel/sysenter.c 2004-03-31 00:56:34.396443888 -0800 @@ -18,13 +18,18 @@ #include #include #include +#include extern asmlinkage void sysenter_entry(void); void enable_sep_cpu(void *info) { int cpu = get_cpu(); +#ifdef CONFIG_X86_HIGH_ENTRY + struct tss_struct *tss = (struct tss_struct *) __fix_to_virt(FIX_TSS_0) + cpu; +#else struct tss_struct *tss = init_tss + cpu; +#endif tss->ss1 = __KERNEL_CS; tss->esp1 = sizeof(struct tss_struct) + (unsigned long) tss; --- linux-2.6.5-rc3/arch/i386/kernel/traps.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/kernel/traps.c 2004-03-31 00:56:34.397443736 -0800 @@ -55,12 +55,8 @@ #include "mach_traps.h" -asmlinkage int system_call(void); -asmlinkage void lcall7(void); -asmlinkage void lcall27(void); - -struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 } }; +struct desc_struct default_ldt[] __attribute__((__section__(".data.default_ldt"))) = { { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } }; +struct page *default_ldt_page; /* Do we ignore FPU interrupts ? */ char ignore_fpu_irq = 0; @@ -92,6 +88,41 @@ asmlinkage void alignment_check(void); asmlinkage void spurious_interrupt_bug(void); asmlinkage void machine_check(void); +#ifdef CONFIG_KGDB +extern void sysenter_entry(void); +#include +#include +void set_intr_gate(unsigned int n, void *addr); +static void set_intr_usr_gate(unsigned int n, void *addr); +/* + * Should be able to call this breakpoint() very early in + * bring up. Just hard code the call where needed. + * The breakpoint() code is here because set_?_gate() functions + * are local (static) to trap.c. They need be done only once, + * but it does not hurt to do them over. + */ +void breakpoint(void) +{ + init_entry_mappings(); + set_intr_usr_gate(3,&int3); /* disable ints on trap */ + set_intr_gate(1,&debug); + set_intr_gate(14,&page_fault); + + BREAKPOINT; +} +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ + { \ + if (!user_mode(regs) ) \ + { \ + kgdb_handle_exception(trapnr, signr, error_code, regs); \ + after; \ + } else if ((trapnr == 3) && (regs->eflags &0x200)) local_irq_enable(); \ + } +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + + static int kstack_depth_to_print = 24; void show_trace(struct task_struct *task, unsigned long * stack) @@ -105,12 +136,20 @@ void show_trace(struct task_struct *task #ifdef CONFIG_KALLSYMS printk("\n"); #endif - while (!kstack_end(stack)) { - addr = *stack++; - if (kernel_text_address(addr)) { - printk(" [<%08lx>] ", addr); - print_symbol("%s\n", addr); + while (1) { + struct thread_info *context; + context = (struct thread_info*) ((unsigned long)stack & (~(THREAD_SIZE - 1))); + while (!kstack_end(stack)) { + addr = *stack++; + if (kernel_text_address(addr)) { + printk(" [<%08lx>] ", addr); + print_symbol("%s\n", addr); + } } + stack = (unsigned long*)context->previous_esp; + if (!stack) + break; + printk(" =======================\n"); } printk("\n"); } @@ -176,7 +215,7 @@ void show_registers(struct pt_regs *regs ss = regs->xss & 0xffff; } print_modules(); - printk("CPU: %d\nEIP: %04x:[<%08lx>] %s\nEFLAGS: %08lx" + printk("CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\nEFLAGS: %08lx" " (%s) \n", smp_processor_id(), 0xffff & regs->xcs, regs->eip, print_tainted(), regs->eflags, UTS_RELEASE); @@ -194,23 +233,27 @@ void show_registers(struct pt_regs *regs * time of the fault.. */ if (in_kernel) { + u8 *eip; printk("\nStack: "); show_stack(NULL, (unsigned long*)esp); printk("Code: "); - if(regs->eip < PAGE_OFFSET) - goto bad; - for(i=0;i<20;i++) - { - unsigned char c; - if(__get_user(c, &((unsigned char*)regs->eip)[i])) { -bad: + eip = (u8 *)regs->eip - 43; + for (i = 0; i < 64; i++, eip++) { + unsigned char c = 0xff; + + if ((user_mode(regs) && get_user(c, eip)) || + (!user_mode(regs) && __direct_get_user(c, eip))) { + printk(" Bad EIP value."); break; } - printk("%02x ", c); + if (eip == (u8 *)regs->eip) + printk("<%02x> ", c); + else + printk("%02x ", c); } } printk("\n"); @@ -229,16 +272,14 @@ static void handle_BUG(struct pt_regs *r eip = regs->eip; - if (eip < PAGE_OFFSET) - goto no_bug; - if (__get_user(ud2, (unsigned short *)eip)) + if (__direct_get_user(ud2, (unsigned short *)eip)) goto no_bug; if (ud2 != 0x0b0f) goto no_bug; - if (__get_user(line, (unsigned short *)(eip + 2))) + if (__direct_get_user(line, (unsigned short *)(eip + 2))) goto bug; - if (__get_user(file, (char **)(eip + 4)) || - (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) + if (__direct_get_user(file, (char **)(eip + 4)) || + __direct_get_user(c, file)) file = ""; printk("------------[ cut here ]------------\n"); @@ -278,6 +319,15 @@ void die(const char * str, struct pt_reg #endif if (nl) printk("\n"); +#ifdef CONFIG_KGDB + /* This is about the only place we want to go to kgdb even if in + * user mode. But we must go in via a trap so within kgdb we will + * always be in kernel mode. + */ + if (user_mode(regs)) + BREAKPOINT; +#endif + CHK_REMOTE_DEBUG(0,SIGTRAP,err,regs,) show_registers(regs); bust_spinlocks(0); spin_unlock_irq(&die_lock); @@ -347,6 +397,7 @@ static inline void do_trap(int trapnr, i #define DO_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,)\ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -364,7 +415,9 @@ asmlinkage void do_##name(struct pt_regs #define DO_VM86_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr, signr, error_code,regs, return)\ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ + return; \ } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ @@ -411,8 +464,10 @@ gp_in_vm86: return; gp_in_kernel: - if (!fixup_exception(regs)) + if (!fixup_exception(regs)){ + CHK_REMOTE_DEBUG(13,SIGSEGV,error_code,regs,) die("general protection fault", regs, error_code); + } } static void mem_parity_error(unsigned char reason, struct pt_regs * regs) @@ -551,10 +606,18 @@ asmlinkage void do_debug(struct pt_regs if (regs->eflags & X86_EFLAGS_IF) local_irq_enable(); - /* Mask out spurious debug traps due to lazy DR7 setting */ + /* + * Mask out spurious debug traps due to lazy DR7 setting or + * due to 4G/4G kernel mode: + */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { if (!tsk->thread.debugreg[7]) goto clear_dr7; + if (!user_mode(regs)) { + // restore upon return-to-userspace: + set_thread_flag(TIF_DB7); + goto clear_dr7; + } } if (regs->eflags & VM_MASK) @@ -574,8 +637,18 @@ asmlinkage void do_debug(struct pt_regs * allowing programs to debug themselves without the ptrace() * interface. */ +#ifdef CONFIG_KGDB + /* + * I think this is the only "real" case of a TF in the kernel + * that really belongs to user space. Others are + * "Ours all ours!" + */ + if (((regs->xcs & 3) == 0) && ((void *)regs->eip == sysenter_entry)) + goto clear_TF_reenable; +#else if ((regs->xcs & 3) == 0) goto clear_TF_reenable; +#endif if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) goto clear_TF; } @@ -587,6 +660,17 @@ asmlinkage void do_debug(struct pt_regs info.si_errno = 0; info.si_code = TRAP_BRKPT; +#ifdef CONFIG_KGDB + /* + * If this is a kernel mode trap, we need to reset db7 to allow us + * to continue sanely ALSO skip the signal delivery + */ + if ((regs->xcs & 3) == 0) + goto clear_dr7; + + /* if not kernel, allow ints but only if they were on */ + if ( regs->eflags & 0x200) local_irq_enable(); +#endif /* If this is a kernel mode trap, save the user PC on entry to * the kernel, that's what the debugger can make sense of. */ @@ -601,6 +685,7 @@ clear_dr7: __asm__("movl %0,%%db7" : /* no output */ : "r" (0)); + CHK_REMOTE_DEBUG(1,SIGTRAP,error_code,regs,) return; debug_vm86: @@ -796,19 +881,53 @@ asmlinkage void math_emulate(long arg) #endif /* CONFIG_MATH_EMULATION */ -#ifdef CONFIG_X86_F00F_BUG -void __init trap_init_f00f_bug(void) +void __init trap_init_virtual_IDT(void) { - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); - /* - * Update the IDT descriptor and reload the IDT so that - * it uses the read-only mapped virtual address. + * "idt" is magic - it overlaps the idt_descr + * variable so that updating idt will automatically + * update the idt descriptor.. */ - idt_descr.address = fix_to_virt(FIX_F00F_IDT); + __set_fixmap(FIX_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + idt_descr.address = __fix_to_virt(FIX_IDT); + __asm__ __volatile__("lidt %0" : : "m" (idt_descr)); } -#endif + +void __init trap_init_virtual_GDT(void) +{ + int cpu = smp_processor_id(); + struct Xgt_desc_struct *gdt_desc = cpu_gdt_descr + cpu; + struct Xgt_desc_struct tmp_desc = {0, 0}; + struct tss_struct * t; + + __asm__ __volatile__("sgdt %0": "=m" (tmp_desc): :"memory"); + +#ifdef CONFIG_X86_HIGH_ENTRY + if (!cpu) { + __set_fixmap(FIX_GDT_0, __pa(cpu_gdt_table), PAGE_KERNEL); + __set_fixmap(FIX_GDT_1, __pa(cpu_gdt_table) + PAGE_SIZE, PAGE_KERNEL); + __set_fixmap(FIX_TSS_0, __pa(init_tss), PAGE_KERNEL); + __set_fixmap(FIX_TSS_1, __pa(init_tss) + 1*PAGE_SIZE, PAGE_KERNEL); + __set_fixmap(FIX_TSS_2, __pa(init_tss) + 2*PAGE_SIZE, PAGE_KERNEL); + __set_fixmap(FIX_TSS_3, __pa(init_tss) + 3*PAGE_SIZE, PAGE_KERNEL); + } + + gdt_desc->address = __fix_to_virt(FIX_GDT_0) + sizeof(cpu_gdt_table[0]) * cpu; +#else + gdt_desc->address = (unsigned long)cpu_gdt_table[cpu]; +#endif + __asm__ __volatile__("lgdt %0": "=m" (*gdt_desc)); + +#ifdef CONFIG_X86_HIGH_ENTRY + t = (struct tss_struct *) __fix_to_virt(FIX_TSS_0) + cpu; +#else + t = init_tss + cpu; +#endif + set_tss_desc(cpu, t); + cpu_gdt_table[cpu][GDT_ENTRY_TSS].b &= 0xfffffdff; + load_TR_desc(); +} #define _set_gate(gate_addr,type,dpl,addr,seg) \ do { \ @@ -835,20 +954,26 @@ void set_intr_gate(unsigned int n, void _set_gate(idt_table+n,14,0,addr,__KERNEL_CS); } -static void __init set_trap_gate(unsigned int n, void *addr) +void __init set_trap_gate(unsigned int n, void *addr) { _set_gate(idt_table+n,15,0,addr,__KERNEL_CS); } -static void __init set_system_gate(unsigned int n, void *addr) +void __init set_system_gate(unsigned int n, void *addr) { _set_gate(idt_table+n,15,3,addr,__KERNEL_CS); } -static void __init set_call_gate(void *a, void *addr) +void __init set_call_gate(void *a, void *addr) { _set_gate(a,12,3,addr,__KERNEL_CS); } +#ifdef CONFIG_KGDB +void set_intr_usr_gate(unsigned int n, void *addr) +{ + _set_gate(idt_table+n,14,3,addr,__KERNEL_CS); +} +#endif static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) { @@ -867,11 +992,16 @@ void __init trap_init(void) #ifdef CONFIG_X86_LOCAL_APIC init_apic_mappings(); #endif + init_entry_mappings(); set_trap_gate(0,÷_error); set_intr_gate(1,&debug); set_intr_gate(2,&nmi); +#ifndef CONFIG_KGDB set_system_gate(3,&int3); /* int3-5 can be called from all */ +#else + set_intr_usr_gate(3,&int3); /* int3-5 can be called from all */ +#endif set_system_gate(4,&overflow); set_system_gate(5,&bounds); set_trap_gate(6,&invalid_op); --- linux-2.6.5-rc3/arch/i386/kernel/vm86.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/i386/kernel/vm86.c 2004-03-31 00:56:34.398443584 -0800 @@ -125,7 +125,7 @@ struct pt_regs * fastcall save_v86_state tss = init_tss + get_cpu(); current->thread.esp0 = current->thread.saved_esp0; current->thread.sysenter_cs = __KERNEL_CS; - load_esp0(tss, ¤t->thread); + load_virtual_esp0(tss, current); current->thread.saved_esp0 = 0; put_cpu(); @@ -305,7 +305,7 @@ static void do_sys_vm86(struct kernel_vm tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; - load_esp0(tss, &tsk->thread); + load_virtual_esp0(tss, tsk); put_cpu(); tsk->thread.screen_bitmap = info->screen_bitmap; --- linux-2.6.5-rc3/arch/i386/kernel/vmlinux.lds.S 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/kernel/vmlinux.lds.S 2004-03-31 00:56:34.399443432 -0800 @@ -5,13 +5,17 @@ #include #include +#include +#include +#include + OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) ENTRY(startup_32) jiffies = jiffies_64; SECTIONS { - . = 0xC0000000 + 0x100000; + . = __PAGE_OFFSET + 0x100000; /* read-only */ _text = .; /* Text and read-only data */ .text : { @@ -20,6 +24,19 @@ SECTIONS *(.gnu.warning) } = 0x9090 +#ifdef CONFIG_X86_4G + . = ALIGN(PAGE_SIZE_asm); + __entry_tramp_start = .; + . = FIX_ENTRY_TRAMPOLINE_0_addr; + __start___entry_text = .; + .entry.text : AT (__entry_tramp_start) { *(.entry.text) } + __entry_tramp_end = __entry_tramp_start + SIZEOF(.entry.text); + . = __entry_tramp_end; + . = ALIGN(PAGE_SIZE_asm); +#else + .entry.text : { *(.entry.text) } +#endif + _etext = .; /* End of text section */ . = ALIGN(16); /* Exception table */ @@ -35,15 +52,12 @@ SECTIONS CONSTRUCTORS } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE_asm); __nosave_begin = .; .data_nosave : { *(.data.nosave) } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE_asm); __nosave_end = .; - . = ALIGN(4096); - .data.page_aligned : { *(.data.idt) } - . = ALIGN(32); .data.cacheline_aligned : { *(.data.cacheline_aligned) } @@ -53,7 +67,7 @@ SECTIONS .data.init_task : { *(.data.init_task) } /* will be freed after init */ - . = ALIGN(4096); /* Init code and data */ + . = ALIGN(PAGE_SIZE_asm); /* Init code and data */ __init_begin = .; .init.text : { _sinittext = .; @@ -65,6 +79,9 @@ SECTIONS __setup_start = .; .init.setup : { *(.init.setup) } __setup_end = .; + __early_begin = .; + __early_param : { *(__early_param) } + __early_end = .; __start___param = .; __param : { *(__param) } __stop___param = .; @@ -92,7 +109,7 @@ SECTIONS from .altinstructions and .eh_frame */ .exit.text : { *(.exit.text) } .exit.data : { *(.exit.data) } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE_asm); __initramfs_start = .; .init.ramfs : { *(.init.ramfs) } __initramfs_end = .; @@ -100,10 +117,22 @@ SECTIONS __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE_asm); __init_end = .; /* freed after init ends here */ - + + . = ALIGN(PAGE_SIZE_asm); + .data.page_aligned_tss : { *(.data.tss) } + + . = ALIGN(PAGE_SIZE_asm); + .data.page_aligned_default_ldt : { *(.data.default_ldt) } + + . = ALIGN(PAGE_SIZE_asm); + .data.page_aligned_idt : { *(.data.idt) } + + . = ALIGN(PAGE_SIZE_asm); + .data.page_aligned_gdt : { *(.data.gdt) } + __bss_start = .; /* BSS */ .bss : { *(.bss) } . = ALIGN(4); @@ -128,4 +157,6 @@ SECTIONS .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } + + } --- linux-2.6.5-rc3/arch/i386/kernel/vsyscall.lds 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/kernel/vsyscall.lds 2004-03-31 00:56:34.399443432 -0800 @@ -5,7 +5,7 @@ */ /* This must match . */ -VSYSCALL_BASE = 0xffffe000; +VSYSCALL_BASE = 0xffffd000; SECTIONS { --- linux-2.6.5-rc3/arch/i386/kernel/vsyscall-sysenter.S 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/kernel/vsyscall-sysenter.S 2004-03-31 00:56:34.400443280 -0800 @@ -12,6 +12,11 @@ .type __kernel_vsyscall,@function __kernel_vsyscall: .LSTART_vsyscall: + cmpl $192, %eax + jne 1f + int $0x80 + ret +1: push %ecx .Lpush_ecx: push %edx --- linux-2.6.5-rc3/arch/i386/lib/checksum.S 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/lib/checksum.S 2004-03-31 00:56:34.400443280 -0800 @@ -280,14 +280,14 @@ unsigned int csum_partial_copy_generic ( .previous .align 4 -.globl csum_partial_copy_generic +.globl direct_csum_partial_copy_generic #ifndef CONFIG_X86_USE_PPRO_CHECKSUM #define ARGBASE 16 #define FP 12 -csum_partial_copy_generic: +direct_csum_partial_copy_generic: subl $4,%esp pushl %edi pushl %esi @@ -422,7 +422,7 @@ DST( movb %cl, (%edi) ) #define ARGBASE 12 -csum_partial_copy_generic: +direct_csum_partial_copy_generic: pushl %ebx pushl %edi pushl %esi --- linux-2.6.5-rc3/arch/i386/lib/dec_and_lock.c 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/lib/dec_and_lock.c 2004-03-31 00:56:32.607715816 -0800 @@ -10,6 +10,7 @@ #include #include +#ifndef ATOMIC_DEC_AND_LOCK int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { int counter; @@ -38,3 +39,5 @@ slow_path: spin_unlock(lock); return 0; } +#endif + --- linux-2.6.5-rc3/arch/i386/lib/getuser.S 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/lib/getuser.S 2004-03-31 00:56:34.401443128 -0800 @@ -9,6 +9,7 @@ * return value. */ #include +#include /* @@ -28,7 +29,7 @@ .globl __get_user_1 __get_user_1: GET_THREAD_INFO(%edx) - cmpl TI_ADDR_LIMIT(%edx),%eax + cmpl TI_addr_limit(%edx),%eax jae bad_get_user 1: movzbl (%eax),%edx xorl %eax,%eax @@ -40,7 +41,7 @@ __get_user_2: addl $1,%eax jc bad_get_user GET_THREAD_INFO(%edx) - cmpl TI_ADDR_LIMIT(%edx),%eax + cmpl TI_addr_limit(%edx),%eax jae bad_get_user 2: movzwl -1(%eax),%edx xorl %eax,%eax @@ -52,7 +53,7 @@ __get_user_4: addl $3,%eax jc bad_get_user GET_THREAD_INFO(%edx) - cmpl TI_ADDR_LIMIT(%edx),%eax + cmpl TI_addr_limit(%edx),%eax jae bad_get_user 3: movl -3(%eax),%edx xorl %eax,%eax --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/arch/i386/lib/kgdb_serial.c 2004-03-30 23:39:43.579394624 -0800 @@ -0,0 +1,499 @@ +/* + * Serial interface GDB stub + * + * Written (hacked together) by David Grothe (dave@gcom.com) + * Modified to allow invokation early in boot see also + * kgdb.h for instructions by George Anzinger(george@mvista.com) + * Modified to handle debugging over ethernet by Robert Walsh + * and wangdi , based on + * code by San Mehat. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_KGDB_USER_CONSOLE +extern void kgdb_console_finit(void); +#endif +#define PRNT_off +#define TEST_EXISTANCE +#ifdef PRNT +#define dbprintk(s) printk s +#else +#define dbprintk(s) +#endif +#define TEST_INTERRUPT_off +#ifdef TEST_INTERRUPT +#define intprintk(s) printk s +#else +#define intprintk(s) +#endif + +#define IRQ_T(info) ((info->flags & ASYNC_SHARE_IRQ) ? SA_SHIRQ : SA_INTERRUPT) + +#define GDB_BUF_SIZE 512 /* power of 2, please */ + +static char gdb_buf[GDB_BUF_SIZE]; +static int gdb_buf_in_inx; +static atomic_t gdb_buf_in_cnt; +static int gdb_buf_out_inx; + +struct async_struct *gdb_async_info; +static int gdb_async_irq; + +#define outb_px(a,b) outb_p(b,a) + +static void program_uart(struct async_struct *info); +static void write_char(struct async_struct *info, int chr); +/* + * Get a byte from the hardware data buffer and return it + */ +static int +read_data_bfr(struct async_struct *info) +{ + char it = inb_p(info->port + UART_LSR); + + if (it & UART_LSR_DR) + return (inb_p(info->port + UART_RX)); + /* + * If we have a framing error assume somebody messed with + * our uart. Reprogram it and send '-' both ways... + */ + if (it & 0xc) { + program_uart(info); + write_char(info, '-'); + return ('-'); + } + return (-1); + +} /* read_data_bfr */ + +/* + * Get a char if available, return -1 if nothing available. + * Empty the receive buffer first, then look at the interface hardware. + + * Locking here is a bit of a problem. We MUST not lock out communication + * if we are trying to talk to gdb about a kgdb entry. ON the other hand + * we can loose chars in the console pass thru if we don't lock. It is also + * possible that we could hold the lock or be waiting for it when kgdb + * NEEDS to talk. Since kgdb locks down the world, it does not need locks. + * We do, of course have possible issues with interrupting a uart operation, + * but we will just depend on the uart status to help keep that straight. + + */ +static spinlock_t uart_interrupt_lock = SPIN_LOCK_UNLOCKED; +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +#endif + +static int +read_char(struct async_struct *info) +{ + int chr; + unsigned long flags; + local_irq_save(flags); +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_lock(&uart_interrupt_lock); + } +#endif + if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ + chr = gdb_buf[gdb_buf_out_inx++]; + gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); + atomic_dec(&gdb_buf_in_cnt); + } else { + chr = read_data_bfr(info); + } +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_unlock(&uart_interrupt_lock); + } +#endif + local_irq_restore(flags); + return (chr); +} + +/* + * Wait until the interface can accept a char, then write it. + */ +static void +write_char(struct async_struct *info, int chr) +{ + while (!(inb_p(info->port + UART_LSR) & UART_LSR_THRE)) ; + + outb_p(chr, info->port + UART_TX); + +} /* write_char */ + +/* + * Mostly we don't need a spinlock, but since the console goes + * thru here with interrutps on, well, we need to catch those + * chars. + */ +/* + * This is the receiver interrupt routine for the GDB stub. + * It will receive a limited number of characters of input + * from the gdb host machine and save them up in a buffer. + * + * When the gdb stub routine tty_getDebugChar() is called it + * draws characters out of the buffer until it is empty and + * then reads directly from the serial port. + * + * We do not attempt to write chars from the interrupt routine + * since the stubs do all of that via tty_putDebugChar() which + * writes one byte after waiting for the interface to become + * ready. + * + * The debug stubs like to run with interrupts disabled since, + * after all, they run as a consequence of a breakpoint in + * the kernel. + * + * Perhaps someone who knows more about the tty driver than I + * care to learn can make this work for any low level serial + * driver. + */ +static irqreturn_t +gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct async_struct *info; + unsigned long flags; + + info = gdb_async_info; + if (!info || !info->tty || irq != gdb_async_irq) + return IRQ_NONE; + + local_irq_save(flags); + spin_lock(&uart_interrupt_lock); + do { + int chr = read_data_bfr(info); + intprintk(("Debug char on int: %x hex\n", chr)); + if (chr < 0) + continue; + + if (chr == 3) { /* Ctrl-C means remote interrupt */ + BREAKPOINT; + continue; + } + + if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { + /* buffer overflow tosses early char */ + read_char(info); + } + gdb_buf[gdb_buf_in_inx++] = chr; + gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); + } while (inb_p(info->port + UART_IIR) & UART_IIR_RDI); + spin_unlock(&uart_interrupt_lock); + local_irq_restore(flags); + return IRQ_HANDLED; +} /* gdb_interrupt */ + +/* + * Just a NULL routine for testing. + */ +void +gdb_null(void) +{ +} /* gdb_null */ + +/* These structure are filled in with values defined in asm/kgdb_local.h + */ +static struct serial_state state = SB_STATE; +static struct async_struct local_info = SB_INFO; +static int ok_to_enable_ints = 0; +static void kgdb_enable_ints_now(void); + +extern char *kgdb_version; +/* + * Hook an IRQ for KGDB. + * + * This routine is called from tty_putDebugChar, below. + */ +static int ints_disabled = 1; +int +gdb_hook_interrupt(struct async_struct *info, int verb) +{ + struct serial_state *state = info->state; + unsigned long flags; + int port; +#ifdef TEST_EXISTANCE + int scratch, scratch2; +#endif + + /* The above fails if memory managment is not set up yet. + * Rather than fail the set up, just keep track of the fact + * and pick up the interrupt thing later. + */ + gdb_async_info = info; + port = gdb_async_info->port; + gdb_async_irq = state->irq; + if (verb) { + printk("kgdb %s : port =%x, IRQ=%d, divisor =%d\n", + kgdb_version, + port, + gdb_async_irq, gdb_async_info->state->custom_divisor); + } + local_irq_save(flags); +#ifdef TEST_EXISTANCE + /* Existance test */ + /* Should not need all this, but just in case.... */ + + scratch = inb_p(port + UART_IER); + outb_px(port + UART_IER, 0); + outb_px(0xff, 0x080); + scratch2 = inb_p(port + UART_IER); + outb_px(port + UART_IER, scratch); + if (scratch2) { + printk + ("gdb_hook_interrupt: Could not clear IER, not a UART!\n"); + local_irq_restore(flags); + return 1; /* We failed; there's nothing here */ + } + scratch2 = inb_p(port + UART_LCR); + outb_px(port + UART_LCR, 0xBF); /* set up for StarTech test */ + outb_px(port + UART_EFR, 0); /* EFR is the same as FCR */ + outb_px(port + UART_LCR, 0); + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO); + scratch = inb_p(port + UART_IIR) >> 6; + if (scratch == 1) { + printk("gdb_hook_interrupt: Undefined UART type!" + " Not a UART! \n"); + local_irq_restore(flags); + return 1; + } else { + dbprintk(("gdb_hook_interrupt: UART type " + "is %d where 0=16450, 2=16550 3=16550A\n", scratch)); + } + scratch = inb_p(port + UART_MCR); + outb_px(port + UART_MCR, UART_MCR_LOOP | scratch); + outb_px(port + UART_MCR, UART_MCR_LOOP | 0x0A); + scratch2 = inb_p(port + UART_MSR) & 0xF0; + outb_px(port + UART_MCR, scratch); + if (scratch2 != 0x90) { + printk("gdb_hook_interrupt: " + "Loop back test failed! Not a UART!\n"); + local_irq_restore(flags); + return scratch2 + 1000; /* force 0 to fail */ + } +#endif /* test existance */ + program_uart(info); + local_irq_restore(flags); + + return (0); + +} /* gdb_hook_interrupt */ + +static void +program_uart(struct async_struct *info) +{ + int port = info->port; + + (void) inb_p(port + UART_RX); + outb_px(port + UART_IER, 0); + + (void) inb_p(port + UART_RX); /* serial driver comments say */ + (void) inb_p(port + UART_IIR); /* this clears the interrupt regs */ + (void) inb_p(port + UART_MSR); + outb_px(port + UART_LCR, UART_LCR_WLEN8 | UART_LCR_DLAB); + outb_px(port + UART_DLL, info->state->custom_divisor & 0xff); /* LS */ + outb_px(port + UART_DLM, info->state->custom_divisor >> 8); /* MS */ + outb_px(port + UART_MCR, info->MCR); + + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1 | UART_FCR_CLEAR_XMIT | UART_FCR_CLEAR_RCVR); /* set fcr */ + outb_px(port + UART_LCR, UART_LCR_WLEN8); /* reset DLAB */ + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1); /* set fcr */ + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + return; +} + +/* + * tty_getDebugChar + * + * This is a GDB stub routine. It waits for a character from the + * serial interface and then returns it. If there is no serial + * interface connection then it returns a bogus value which will + * almost certainly cause the system to hang. In the + */ +int kgdb_in_isr = 0; +int kgdb_in_lsr = 0; +extern spinlock_t kgdb_spinlock; + +/* Caller takes needed protections */ + +int +tty_getDebugChar(void) +{ + volatile int chr, dum, time, end_time; + + dbprintk(("tty_getDebugChar(port %x): ", gdb_async_info->port)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + /* + * This trick says if we wait a very long time and get + * no char, return the -1 and let the upper level deal + * with it. + */ + rdtsc(dum, time); + end_time = time + 2; + while (((chr = read_char(gdb_async_info)) == -1) && + (end_time - time) > 0) { + rdtsc(dum, time); + }; + /* + * This covers our butts if some other code messes with + * our uart, hay, it happens :o) + */ + if (chr == -1) + program_uart(gdb_async_info); + + dbprintk(("%c\n", chr > ' ' && chr < 0x7F ? chr : ' ')); + return (chr); + +} /* tty_getDebugChar */ + +static int count = 3; +static spinlock_t one_at_atime = SPIN_LOCK_UNLOCKED; + +static int __init +kgdb_enable_ints(void) +{ + if (kgdboe) { + return 0; + } + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 1); + } + ok_to_enable_ints = 1; + kgdb_enable_ints_now(); +#ifdef CONFIG_KGDB_USER_CONSOLE + kgdb_console_finit(); +#endif + return 0; +} + +#ifdef CONFIG_SERIAL_8250 +void shutdown_for_kgdb(struct async_struct *gdb_async_info); +#endif + +#ifdef CONFIG_DISCONTIGMEM +static inline int kgdb_mem_init_done(void) +{ + return highmem_start_page != NULL; +} +#else +static inline int kgdb_mem_init_done(void) +{ + return max_mapnr != 0; +} +#endif + +static void +kgdb_enable_ints_now(void) +{ + if (!spin_trylock(&one_at_atime)) + return; + if (!ints_disabled) + goto exit; + if (kgdb_mem_init_done() && + ints_disabled) { /* don't try till mem init */ +#ifdef CONFIG_SERIAL_8250 + /* + * The ifdef here allows the system to be configured + * without the serial driver. + * Don't make it a module, however, it will steal the port + */ + shutdown_for_kgdb(gdb_async_info); +#endif + ints_disabled = request_irq(gdb_async_info->state->irq, + gdb_interrupt, + IRQ_T(gdb_async_info), + "KGDB-stub", NULL); + intprintk(("KGDB: request_irq returned %d\n", ints_disabled)); + } + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + exit: + spin_unlock(&one_at_atime); +} + +/* + * tty_putDebugChar + * + * This is a GDB stub routine. It waits until the interface is ready + * to transmit a char and then sends it. If there is no serial + * interface connection then it simply returns to its caller, having + * pretended to send the char. Caller takes needed protections. + */ +void +tty_putDebugChar(int chr) +{ + dbprintk(("tty_putDebugChar(port %x): chr=%02x '%c', ints_on=%d\n", + gdb_async_info->port, + chr, + chr > ' ' && chr < 0x7F ? chr : ' ', ints_disabled ? 0 : 1)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + + write_char(gdb_async_info, chr); /* this routine will wait */ + count = (chr == '#') ? 0 : count + 1; + if ((count == 2)) { /* try to enable after */ + if (ints_disabled & ok_to_enable_ints) + kgdb_enable_ints_now(); /* try to enable after */ + + /* We do this a lot because, well we really want to get these + * interrupts. The serial driver will clear these bits when it + * initializes the chip. Every thing else it does is ok, + * but this. + */ + if (!ints_disabled) { + outb_px(gdb_async_info->port + UART_IER, + gdb_async_info->IER); + } + } + +} /* tty_putDebugChar */ + +/* + * This does nothing for the serial port, since it doesn't buffer. + */ + +void tty_flushDebugChar(void) +{ +} + +module_init(kgdb_enable_ints); --- linux-2.6.5-rc3/arch/i386/lib/Makefile 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/lib/Makefile 2004-03-30 20:43:37.004758136 -0800 @@ -9,3 +9,4 @@ lib-y = checksum.o delay.o \ lib-$(CONFIG_X86_USE_3DNOW) += mmx.o lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o +lib-$(CONFIG_KGDB) += kgdb_serial.o --- linux-2.6.5-rc3/arch/i386/lib/usercopy.c 2004-01-09 00:04:30.000000000 -0800 +++ 25/arch/i386/lib/usercopy.c 2004-03-31 00:56:34.402442976 -0800 @@ -76,7 +76,7 @@ do { \ * and returns @count. */ long -__strncpy_from_user(char *dst, const char __user *src, long count) +__direct_strncpy_from_user(char *dst, const char __user *src, long count) { long res; __do_strncpy_from_user(dst, src, count, res); @@ -102,7 +102,7 @@ __strncpy_from_user(char *dst, const cha * and returns @count. */ long -strncpy_from_user(char *dst, const char __user *src, long count) +direct_strncpy_from_user(char *dst, const char __user *src, long count) { long res = -EFAULT; if (access_ok(VERIFY_READ, src, 1)) @@ -147,7 +147,7 @@ do { \ * On success, this will be zero. */ unsigned long -clear_user(void __user *to, unsigned long n) +direct_clear_user(void __user *to, unsigned long n) { might_sleep(); if (access_ok(VERIFY_WRITE, to, n)) @@ -167,7 +167,7 @@ clear_user(void __user *to, unsigned lon * On success, this will be zero. */ unsigned long -__clear_user(void __user *to, unsigned long n) +__direct_clear_user(void __user *to, unsigned long n) { __do_clear_user(to, n); return n; @@ -184,7 +184,7 @@ __clear_user(void __user *to, unsigned l * On exception, returns 0. * If the string is too long, returns a value greater than @n. */ -long strnlen_user(const char __user *s, long n) +long direct_strnlen_user(const char __user *s, long n) { unsigned long mask = -__addr_ok(s); unsigned long res, tmp; @@ -575,3 +575,4 @@ unsigned long __copy_from_user_ll(void * n = __copy_user_zeroing_intel(to, (const void *) from, n); return n; } + --- linux-2.6.5-rc3/arch/i386/mach-default/Makefile 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/mach-default/Makefile 2004-03-31 00:56:21.000480384 -0800 @@ -2,4 +2,4 @@ # Makefile for the linux kernel. # -obj-y := setup.o topology.o +obj-y := setup.o topology.o std_resources.o --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/arch/i386/mach-default/std_resources.c 2004-03-31 00:56:21.175453784 -0800 @@ -0,0 +1,204 @@ +/* + * Machine specific resource allocation for generic. + */ + +#include +#include +#include + +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +#define ADAPTER_ROM_RESOURCES \ + (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource vram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource standard_io_resources[] = { { + .name = "dma1", + .start = 0x0000, + .end = 0x001f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic1", + .start = 0x0020, + .end = 0x0021, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer", + .start = 0x0040, + .end = 0x005f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "keyboard", + .start = 0x0060, + .end = 0x006f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma page reg", + .start = 0x0080, + .end = 0x008f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic2", + .start = 0x00a0, + .end = 0x00a1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma2", + .start = 0x00c0, + .end = 0x00df, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "fpu", + .start = 0x00f0, + .end = 0x00ff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +} }; + +#define STANDARD_IO_RESOURCES \ + (sizeof standard_io_resources / sizeof standard_io_resources[0]) + +static int __init checksum(unsigned char *rom, unsigned long length) +{ + unsigned char *p, sum = 0; + + for (p = rom; p < rom + length; p++) + sum += *p; + return sum == 0; +} + +void __init probe_roms(void) +{ + unsigned long start, length, upper; + unsigned char *rom; + int i; + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* if checksum okay, trust length byte */ + if (length && checksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) */ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (checksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !checksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + +void __init request_graphics_resource(void) +{ + request_resource(&iomem_resource, &vram_resource); +} + +void __init request_standard_io_resources(void) +{ + int i; + + for (i = 0; i < STANDARD_IO_RESOURCES; i++) + request_resource(&ioport_resource, &standard_io_resources[i]); +} --- linux-2.6.5-rc3/arch/i386/mach-generic/probe.c 2003-08-22 19:23:40.000000000 -0700 +++ 25/arch/i386/mach-generic/probe.c 2004-03-31 00:55:02.079478200 -0800 @@ -28,41 +28,42 @@ struct genapic *apic_probe[] __initdata NULL, }; -void __init generic_apic_probe(char *command_line) -{ - char *s; +static int __init generic_apic_probe(char *command_line) +{ + char *s = command_line; int i; int changed = 0; - s = strstr(command_line, "apic="); - if (s && (s == command_line || isspace(s[-1]))) { - char *p = strchr(s, ' '), old; - if (!p) - p = strchr(s, '\0'); - old = *p; - *p = 0; - for (i = 0; !changed && apic_probe[i]; i++) { - if (!strcmp(apic_probe[i]->name, s+5)) { - changed = 1; - genapic = apic_probe[i]; - } + char *p = strchr(s, ' '), old; + if (!p) + p = strchr(s, '\0'); + old = *p; + *p = 0; + for (i = 0; !changed && apic_probe[i]; i++) { + if (!strcmp(apic_probe[i]->name, s+5)) { + changed = 1; + genapic = apic_probe[i]; } - if (!changed) - printk(KERN_ERR "Unknown genapic `%s' specified.\n", s); - *p = old; - } - for (i = 0; !changed && apic_probe[i]; i++) { + } + if (!changed) + printk(KERN_ERR "Unknown genapic `%s' specified.\n", s); + *p = old; + + for (i = 0; !changed && apic_probe[i]; i++) { if (apic_probe[i]->probe()) { changed = 1; - genapic = apic_probe[i]; - } + genapic = apic_probe[i]; + } } - /* Not visible without early console */ - if (!changed) - panic("Didn't find an APIC driver"); + /* Not visible without early console */ + if (!changed) + panic("Didn't find an APIC driver"); printk(KERN_INFO "Using APIC driver %s\n", genapic->name); -} + + return 0; +} +__early_param("apic=", generic_apic_probe); /* These functions can switch the APIC even after the initial ->probe() */ --- linux-2.6.5-rc3/arch/i386/mach-pc9800/Makefile 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/mach-pc9800/Makefile 2004-03-31 00:56:21.001480232 -0800 @@ -2,4 +2,4 @@ # Makefile for the linux kernel. # -obj-y := setup.o topology.o +obj-y := setup.o topology.o std_resources.o --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/arch/i386/mach-pc9800/std_resources.c 2004-03-31 00:56:21.002480080 -0800 @@ -0,0 +1,195 @@ +/* + * Machine specific resource allocation for PC-9800. + * Written by Osamu Tomita + */ + +#include +#include +#include + +static char str_pic1[] = "pic1"; +static char str_dma[] = "dma"; +static char str_pic2[] = "pic2"; +static char str_calender_clock[] = "calender clock"; +static char str_system[] = "system"; +static char str_nmi_control[] = "nmi control"; +static char str_kanji_rom[] = "kanji rom"; +static char str_keyboard[] = "keyboard"; +static char str_text_gdc[] = "text gdc"; +static char str_crtc[] = "crtc"; +static char str_timer[] = "timer"; +static char str_graphic_gdc[] = "graphic gdc"; +static char str_dma_ex_bank[] = "dma ex. bank"; +static char str_beep_freq[] = "beep freq."; +static char str_mouse_pio[] = "mouse pio"; +struct resource standard_io_resources[] = { + { str_pic1, 0x00, 0x00, IORESOURCE_BUSY }, + { str_dma, 0x01, 0x01, IORESOURCE_BUSY }, + { str_pic1, 0x02, 0x02, IORESOURCE_BUSY }, + { str_dma, 0x03, 0x03, IORESOURCE_BUSY }, + { str_dma, 0x05, 0x05, IORESOURCE_BUSY }, + { str_dma, 0x07, 0x07, IORESOURCE_BUSY }, + { str_pic2, 0x08, 0x08, IORESOURCE_BUSY }, + { str_dma, 0x09, 0x09, IORESOURCE_BUSY }, + { str_pic2, 0x0a, 0x0a, IORESOURCE_BUSY }, + { str_dma, 0x0b, 0x0b, IORESOURCE_BUSY }, + { str_dma, 0x0d, 0x0d, IORESOURCE_BUSY }, + { str_dma, 0x0f, 0x0f, IORESOURCE_BUSY }, + { str_dma, 0x11, 0x11, IORESOURCE_BUSY }, + { str_dma, 0x13, 0x13, IORESOURCE_BUSY }, + { str_dma, 0x15, 0x15, IORESOURCE_BUSY }, + { str_dma, 0x17, 0x17, IORESOURCE_BUSY }, + { str_dma, 0x19, 0x19, IORESOURCE_BUSY }, + { str_dma, 0x1b, 0x1b, IORESOURCE_BUSY }, + { str_dma, 0x1d, 0x1d, IORESOURCE_BUSY }, + { str_dma, 0x1f, 0x1f, IORESOURCE_BUSY }, + { str_calender_clock, 0x20, 0x20, 0 }, + { str_dma, 0x21, 0x21, IORESOURCE_BUSY }, + { str_calender_clock, 0x22, 0x22, 0 }, + { str_dma, 0x23, 0x23, IORESOURCE_BUSY }, + { str_dma, 0x25, 0x25, IORESOURCE_BUSY }, + { str_dma, 0x27, 0x27, IORESOURCE_BUSY }, + { str_dma, 0x29, 0x29, IORESOURCE_BUSY }, + { str_dma, 0x2b, 0x2b, IORESOURCE_BUSY }, + { str_dma, 0x2d, 0x2d, IORESOURCE_BUSY }, + { str_system, 0x31, 0x31, IORESOURCE_BUSY }, + { str_system, 0x33, 0x33, IORESOURCE_BUSY }, + { str_system, 0x35, 0x35, IORESOURCE_BUSY }, + { str_system, 0x37, 0x37, IORESOURCE_BUSY }, + { str_nmi_control, 0x50, 0x50, IORESOURCE_BUSY }, + { str_nmi_control, 0x52, 0x52, IORESOURCE_BUSY }, + { "time stamp", 0x5c, 0x5f, IORESOURCE_BUSY }, + { str_kanji_rom, 0xa1, 0xa1, IORESOURCE_BUSY }, + { str_kanji_rom, 0xa3, 0xa3, IORESOURCE_BUSY }, + { str_kanji_rom, 0xa5, 0xa5, IORESOURCE_BUSY }, + { str_kanji_rom, 0xa7, 0xa7, IORESOURCE_BUSY }, + { str_kanji_rom, 0xa9, 0xa9, IORESOURCE_BUSY }, + { str_keyboard, 0x41, 0x41, IORESOURCE_BUSY }, + { str_keyboard, 0x43, 0x43, IORESOURCE_BUSY }, + { str_text_gdc, 0x60, 0x60, IORESOURCE_BUSY }, + { str_text_gdc, 0x62, 0x62, IORESOURCE_BUSY }, + { str_text_gdc, 0x64, 0x64, IORESOURCE_BUSY }, + { str_text_gdc, 0x66, 0x66, IORESOURCE_BUSY }, + { str_text_gdc, 0x68, 0x68, IORESOURCE_BUSY }, + { str_text_gdc, 0x6a, 0x6a, IORESOURCE_BUSY }, + { str_text_gdc, 0x6c, 0x6c, IORESOURCE_BUSY }, + { str_text_gdc, 0x6e, 0x6e, IORESOURCE_BUSY }, + { str_crtc, 0x70, 0x70, IORESOURCE_BUSY }, + { str_crtc, 0x72, 0x72, IORESOURCE_BUSY }, + { str_crtc, 0x74, 0x74, IORESOURCE_BUSY }, + { str_crtc, 0x74, 0x74, IORESOURCE_BUSY }, + { str_crtc, 0x76, 0x76, IORESOURCE_BUSY }, + { str_crtc, 0x78, 0x78, IORESOURCE_BUSY }, + { str_crtc, 0x7a, 0x7a, IORESOURCE_BUSY }, + { str_timer, 0x71, 0x71, IORESOURCE_BUSY }, + { str_timer, 0x73, 0x73, IORESOURCE_BUSY }, + { str_timer, 0x75, 0x75, IORESOURCE_BUSY }, + { str_timer, 0x77, 0x77, IORESOURCE_BUSY }, + { str_graphic_gdc, 0xa0, 0xa0, IORESOURCE_BUSY }, + { str_graphic_gdc, 0xa2, 0xa2, IORESOURCE_BUSY }, + { str_graphic_gdc, 0xa4, 0xa4, IORESOURCE_BUSY }, + { str_graphic_gdc, 0xa6, 0xa6, IORESOURCE_BUSY }, + { "cpu", 0xf0, 0xf7, IORESOURCE_BUSY }, + { "fpu", 0xf8, 0xff, IORESOURCE_BUSY }, + { str_dma_ex_bank, 0x0e05, 0x0e05, 0 }, + { str_dma_ex_bank, 0x0e07, 0x0e07, 0 }, + { str_dma_ex_bank, 0x0e09, 0x0e09, 0 }, + { str_dma_ex_bank, 0x0e0b, 0x0e0b, 0 }, + { str_beep_freq, 0x3fd9, 0x3fd9, IORESOURCE_BUSY }, + { str_beep_freq, 0x3fdb, 0x3fdb, IORESOURCE_BUSY }, + { str_beep_freq, 0x3fdd, 0x3fdd, IORESOURCE_BUSY }, + { str_beep_freq, 0x3fdf, 0x3fdf, IORESOURCE_BUSY }, + /* All PC-9800 have (exactly) one mouse interface. */ + { str_mouse_pio, 0x7fd9, 0x7fd9, 0 }, + { str_mouse_pio, 0x7fdb, 0x7fdb, 0 }, + { str_mouse_pio, 0x7fdd, 0x7fdd, 0 }, + { str_mouse_pio, 0x7fdf, 0x7fdf, 0 }, + { "mouse timer", 0xbfdb, 0xbfdb, 0 }, + { "mouse irq", 0x98d7, 0x98d7, 0 }, +}; + +#define STANDARD_IO_RESOURCES (sizeof(standard_io_resources)/sizeof(struct resource)) + +static struct resource tvram_resource = { "Text VRAM/CG window", 0xa0000, 0xa4fff, IORESOURCE_BUSY }; +static struct resource gvram_brg_resource = { "Graphic VRAM (B/R/G)", 0xa8000, 0xbffff, IORESOURCE_BUSY }; +static struct resource gvram_e_resource = { "Graphic VRAM (E)", 0xe0000, 0xe7fff, IORESOURCE_BUSY }; + +/* System ROM resources */ +#define MAXROMS 6 +static struct resource rom_resources[MAXROMS] = { + { "System ROM", 0xe8000, 0xfffff, IORESOURCE_BUSY } +}; + +void __init probe_roms(void) +{ + int i; + __u8 *xrom_id; + int roms = 1; + + request_resource(&iomem_resource, rom_resources+0); + + xrom_id = (__u8 *) isa_bus_to_virt(PC9800SCA_XROM_ID + 0x10); + + for (i = 0; i < 16; i++) { + if (xrom_id[i] & 0x80) { + int j; + + for (j = i + 1; j < 16 && (xrom_id[j] & 0x80); j++) + ; + rom_resources[roms].start = 0x0d0000 + i * 0x001000; + rom_resources[roms].end = 0x0d0000 + j * 0x001000 - 1; + rom_resources[roms].name = "Extension ROM"; + rom_resources[roms].flags = IORESOURCE_BUSY; + + request_resource(&iomem_resource, + rom_resources + roms); + if (++roms >= MAXROMS) + return; + } + } +} + +void __init request_graphics_resource(void) +{ + int i; + + if (PC9800_HIGHRESO_P()) { + tvram_resource.start = 0xe0000; + tvram_resource.end = 0xe4fff; + gvram_brg_resource.name = "Graphic VRAM"; + gvram_brg_resource.start = 0xc0000; + gvram_brg_resource.end = 0xdffff; + } + + request_resource(&iomem_resource, &tvram_resource); + request_resource(&iomem_resource, &gvram_brg_resource); + if (!PC9800_HIGHRESO_P()) + request_resource(&iomem_resource, &gvram_e_resource); + + if (PC9800_HIGHRESO_P() || PC9800_9821_P()) { + static char graphics[] = "graphics"; + static struct resource graphics_resources[] = { + { graphics, 0x9a0, 0x9a0, 0 }, + { graphics, 0x9a2, 0x9a2, 0 }, + { graphics, 0x9a4, 0x9a4, 0 }, + { graphics, 0x9a6, 0x9a6, 0 }, + { graphics, 0x9a8, 0x9a8, 0 }, + { graphics, 0x9aa, 0x9aa, 0 }, + { graphics, 0x9ac, 0x9ac, 0 }, + { graphics, 0x9ae, 0x9ae, 0 }, + }; + +#define GRAPHICS_RESOURCES (sizeof(graphics_resources)/sizeof(struct resource)) + + for (i = 0; i < GRAPHICS_RESOURCES; i++) + request_resource(&ioport_resource, graphics_resources + i); + } +} + +void __init request_standard_io_resources(void) +{ + int i; + + for (i = 0; i < STANDARD_IO_RESOURCES; i++) + request_resource(&ioport_resource, standard_io_resources+i); +} --- linux-2.6.5-rc3/arch/i386/Makefile 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/i386/Makefile 2004-03-31 00:56:32.883673864 -0800 @@ -19,7 +19,7 @@ LDFLAGS := -m elf_i386 OBJCOPYFLAGS := -O binary -R .note -R .comment -S LDFLAGS_vmlinux := -CFLAGS += -pipe +CFLAGS += -pipe -msoft-float # prevent gcc from keeping the stack 16 byte aligned CFLAGS += $(call check_gcc,-mpreferred-stack-boundary=2,) @@ -56,9 +56,9 @@ cflags-$(CONFIG_X86_ELAN) += -march=i486 GCC_VERSION := $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC)) cflags-$(CONFIG_REGPARM) += $(shell if [ $(GCC_VERSION) -ge 0300 ] ; then echo "-mregparm=3"; fi ;) -# Enable unit-at-a-time mode when possible. It shrinks the -# kernel considerably. -CFLAGS += $(call check_gcc,-funit-at-a-time,) +# Disable unit-at-a-time mode, it makes gcc use a lot more stack +# due to the lack of sharing of stacklots. +CFLAGS += $(call check_gcc,-fno-unit-at-a-time,) CFLAGS += $(cflags-y) @@ -97,6 +97,9 @@ mcore-$(CONFIG_X86_ES7000) := mach-es700 # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default +mflags-$(CONFIG_KGDB) += -gdwarf-2 +mflags-$(CONFIG_KGDB_MORE) += $(shell echo $(CONFIG_KGDB_OPTIONS) | sed -e 's/"//g') + head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o libs-y += arch/i386/lib/ --- linux-2.6.5-rc3/arch/i386/math-emu/fpu_system.h 2003-11-09 16:45:04.000000000 -0800 +++ 25/arch/i386/math-emu/fpu_system.h 2004-03-31 00:56:34.402442976 -0800 @@ -15,6 +15,7 @@ #include #include #include +#include /* This sets the pointer FPU_info to point to the argument part of the stack frame of math_emulate() */ @@ -22,7 +23,7 @@ /* s is always from a cpu register, and the cpu does bounds checking * during register load --> no further bounds checks needed */ -#define LDT_DESCRIPTOR(s) (((struct desc_struct *)current->mm->context.ldt)[(s) >> 3]) +#define LDT_DESCRIPTOR(s) (((struct desc_struct *)__kmap_atomic_vaddr(KM_LDT_PAGE0))[(s) >> 3]) #define SEG_D_SIZE(x) ((x).b & (3 << 21)) #define SEG_G_BIT(x) ((x).b & (1 << 23)) #define SEG_GRANULARITY(x) (((x).b & (1 << 23)) ? 4096 : 1) --- linux-2.6.5-rc3/arch/i386/mm/fault.c 2003-12-17 21:20:01.000000000 -0800 +++ 25/arch/i386/mm/fault.c 2004-03-31 00:56:34.403442824 -0800 @@ -27,6 +27,7 @@ #include #include #include +#include extern void die(const char *,struct pt_regs *,long); @@ -104,8 +105,17 @@ static inline unsigned long get_segment_ if (seg & (1<<2)) { /* Must lock the LDT while reading it. */ down(¤t->mm->context.sem); +#if 1 + /* horrible hack for 4/4 disabled kernels. + I'm not quite sure what the TLB flush is good for, + it's mindlessly copied from the read_ldt code */ + __flush_tlb_global(); + desc = kmap(current->mm->context.ldt_pages[(seg&~7)/PAGE_SIZE]); + desc = (void *)desc + ((seg & ~7) % PAGE_SIZE); +#else desc = current->mm->context.ldt; desc = (void *)desc + (seg & ~7); +#endif } else { /* Must disable preemption while reading the GDT. */ desc = (u32 *)&cpu_gdt_table[get_cpu()]; @@ -118,6 +128,9 @@ static inline unsigned long get_segment_ (desc[1] & 0xff000000); if (seg & (1<<2)) { +#if 1 + kunmap((void *)((unsigned long)desc & PAGE_MASK)); +#endif up(¤t->mm->context.sem); } else put_cpu(); @@ -243,6 +256,19 @@ asmlinkage void do_page_fault(struct pt_ * (error_code & 4) == 0, and that the fault was not a * protection error (error_code & 1) == 0. */ +#ifdef CONFIG_X86_4G + /* + * On 4/4 all kernels faults are either bugs, vmalloc or prefetch + */ + if (unlikely((regs->xcs & 3) == 0)) { + if (error_code & 3) + goto bad_area_nosemaphore; + + /* If it's vm86 fall through */ + if (!(regs->eflags & VM_MASK)) + goto vmalloc_fault; + } +#else if (unlikely(address >= TASK_SIZE)) { if (!(error_code & 5)) goto vmalloc_fault; @@ -252,6 +278,7 @@ asmlinkage void do_page_fault(struct pt_ */ goto bad_area_nosemaphore; } +#endif mm = tsk->mm; @@ -326,6 +353,8 @@ good_area: goto do_sigbus; case VM_FAULT_OOM: goto out_of_memory; + case VM_FAULT_SIGSEGV: + goto bad_area; default: BUG(); } @@ -403,6 +432,12 @@ no_context: * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ +#ifdef CONFIG_KGDB + if (!user_mode(regs)){ + kgdb_handle_exception(14,SIGBUS, error_code, regs); + return; + } +#endif bust_spinlocks(1); --- linux-2.6.5-rc3/arch/i386/mm/hugetlbpage.c 2004-01-09 00:04:30.000000000 -0800 +++ 25/arch/i386/mm/hugetlbpage.c 2004-03-31 00:56:27.576480680 -0800 @@ -29,7 +29,7 @@ static spinlock_t htlbpage_lock = SPIN_L static void enqueue_huge_page(struct page *page) { - list_add(&page->list, + list_add(&page->lru, &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]); } @@ -44,8 +44,8 @@ static struct page *dequeue_huge_page(vo break; } if (nid >= 0 && nid < MAX_NUMNODES && !list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, struct page, list); - list_del(&page->list); + page = list_entry(hugepage_freelists[nid].next, struct page, lru); + list_del(&page->lru); } return page; } @@ -61,6 +61,27 @@ static struct page *alloc_fresh_huge_pag static void free_huge_page(struct page *page); +#ifdef CONFIG_NUMA + +static inline void huge_inc_rss(struct mm_struct *mm, struct page *page) +{ + mm->rss += (HPAGE_SIZE / PAGE_SIZE); + mm->pernode_rss[page_nodenum(page)] += (HPAGE_SIZE / PAGE_SIZE); +} + +static inline void huge_dec_rss(struct mm_struct *mm, struct page *page) +{ + mm->rss -= (HPAGE_SIZE / PAGE_SIZE); + mm->pernode_rss[page_nodenum(page)] -= (HPAGE_SIZE / PAGE_SIZE); +} + +#else /* !CONFIG_NUMA */ + +#define huge_inc_rss(mm, page) ((mm)->rss += (HPAGE_SIZE / PAGE_SIZE)) +#define huge_dec_rss(mm, page) ((mm)->rss -= (HPAGE_SIZE / PAGE_SIZE)) + +#endif /* CONFIG_NUMA */ + static struct page *alloc_hugetlb_page(void) { int i; @@ -105,7 +126,7 @@ static void set_huge_pte(struct mm_struc { pte_t entry; - mm->rss += (HPAGE_SIZE / PAGE_SIZE); + huge_inc_rss(mm, page); if (write_access) { entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); @@ -145,7 +166,7 @@ int copy_hugetlb_page_range(struct mm_st ptepage = pte_page(entry); get_page(ptepage); set_pte(dst_pte, entry); - dst->rss += (HPAGE_SIZE / PAGE_SIZE); + huge_inc_rss(dst, ptepage); addr += HPAGE_SIZE; } return 0; @@ -278,9 +299,8 @@ follow_huge_pmd(struct mm_struct *mm, un static void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); - INIT_LIST_HEAD(&page->list); + INIT_LIST_HEAD(&page->lru); spin_lock(&htlbpage_lock); enqueue_huge_page(page); @@ -314,8 +334,8 @@ void unmap_hugepage_range(struct vm_area page = pte_page(*pte); huge_page_release(page); pte_clear(pte); + huge_dec_rss(mm, page); } - mm->rss -= (end - start) >> PAGE_SHIFT; flush_tlb_range(vma, start, end); } @@ -409,19 +429,19 @@ static int try_to_free_low(int count) /* all lowmem is on node 0 */ list_for_each(p, &hugepage_freelists[0]) { if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; map = NULL; if (++count == 0) break; } - page = list_entry(p, struct page, list); + page = list_entry(p, struct page, lru); if (!PageHighMem(page)) map = page; } if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; count++; @@ -527,6 +547,12 @@ int is_hugepage_mem_enough(size_t size) return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem; } +/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ +unsigned long hugetlb_total_pages(void) +{ + return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE); +} + /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the --- linux-2.6.5-rc3/arch/i386/mm/init.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/i386/mm/init.c 2004-03-31 00:56:34.406442368 -0800 @@ -40,125 +40,13 @@ #include #include #include +#include DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; static int do_test_wp_bit(void); -/* - * Creates a middle page table and puts a pointer to it in the - * given global directory entry. This only returns the gd entry - * in non-PAE compilation mode, since the middle layer is folded. - */ -static pmd_t * __init one_md_table_init(pgd_t *pgd) -{ - pmd_t *pmd_table; - -#ifdef CONFIG_X86_PAE - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - if (pmd_table != pmd_offset(pgd, 0)) - BUG(); -#else - pmd_table = pmd_offset(pgd, 0); -#endif - - return pmd_table; -} - -/* - * Create a page table and place a pointer to it in a middle page - * directory entry. - */ -static pte_t * __init one_page_table_init(pmd_t *pmd) -{ - if (pmd_none(*pmd)) { - pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); - if (page_table != pte_offset_kernel(pmd, 0)) - BUG(); - - return page_table; - } - - return pte_offset_kernel(pmd, 0); -} - -/* - * This function initializes a certain range of kernel virtual memory - * with new bootmem page tables, everywhere page tables are missing in - * the given range. - */ - -/* - * NOTE: The pagetables are allocated contiguous on the physical space - * so we can cache the place of the first one and move around without - * checking the pgd every time. - */ -static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) -{ - pgd_t *pgd; - pmd_t *pmd; - int pgd_idx, pmd_idx; - unsigned long vaddr; - - vaddr = start; - pgd_idx = pgd_index(vaddr); - pmd_idx = pmd_index(vaddr); - pgd = pgd_base + pgd_idx; - - for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { - if (pgd_none(*pgd)) - one_md_table_init(pgd); - - pmd = pmd_offset(pgd, vaddr); - for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { - if (pmd_none(*pmd)) - one_page_table_init(pmd); - - vaddr += PMD_SIZE; - } - pmd_idx = 0; - } -} - -/* - * This maps the physical memory to kernel virtual address space, a total - * of max_low_pfn pages, by creating page tables starting from address - * PAGE_OFFSET. - */ -static void __init kernel_physical_mapping_init(pgd_t *pgd_base) -{ - unsigned long pfn; - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - int pgd_idx, pmd_idx, pte_ofs; - - pgd_idx = pgd_index(PAGE_OFFSET); - pgd = pgd_base + pgd_idx; - pfn = 0; - - for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { - pmd = one_md_table_init(pgd); - if (pfn >= max_low_pfn) - continue; - for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { - /* Map with big pages if possible, otherwise create normal page tables. */ - if (cpu_has_pse) { - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); - pfn += PTRS_PER_PTE; - } else { - pte = one_page_table_init(pmd); - - for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); - } - } - } -} - static inline int page_kills_ppro(unsigned long pagenr) { if (pagenr >= 0x70000 && pagenr <= 0x7003F) @@ -206,11 +94,8 @@ static inline int page_is_ram(unsigned l return 0; } -#ifdef CONFIG_HIGHMEM pte_t *kmap_pte; -pgprot_t kmap_prot; -EXPORT_SYMBOL(kmap_prot); EXPORT_SYMBOL(kmap_pte); #define kmap_get_fixmap_pte(vaddr) \ @@ -218,29 +103,7 @@ EXPORT_SYMBOL(kmap_pte); void __init kmap_init(void) { - unsigned long kmap_vstart; - - /* cache the first kmap pte */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; -} - -void __init permanent_kmaps_init(pgd_t *pgd_base) -{ - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - unsigned long vaddr; - - vaddr = PKMAP_BASE; - page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); - - pgd = swapper_pg_dir + pgd_index(vaddr); - pmd = pmd_offset(pgd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; + kmap_pte = kmap_get_fixmap_pte(__fix_to_virt(FIX_KMAP_BEGIN)); } void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) @@ -255,6 +118,8 @@ void __init one_highpage_init(struct pag SetPageReserved(page); } +#ifdef CONFIG_HIGHMEM + #ifndef CONFIG_DISCONTIGMEM void __init set_highmem_pages_init(int bad_ppro) { @@ -266,12 +131,9 @@ void __init set_highmem_pages_init(int b #else extern void set_highmem_pages_init(int); #endif /* !CONFIG_DISCONTIGMEM */ - #else -#define kmap_init() do { } while (0) -#define permanent_kmaps_init(pgd_base) do { } while (0) -#define set_highmem_pages_init(bad_ppro) do { } while (0) -#endif /* CONFIG_HIGHMEM */ +# define set_highmem_pages_init(bad_ppro) do { } while (0) +#endif unsigned long __PAGE_KERNEL = _PAGE_KERNEL; @@ -281,30 +143,125 @@ unsigned long __PAGE_KERNEL = _PAGE_KERN extern void __init remap_numa_kva(void); #endif -static void __init pagetable_init (void) +static __init void prepare_pagetables(pgd_t *pgd_base, unsigned long address) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_base + pgd_index(address); + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) { + pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); + } +} + +static void __init fixrange_init (unsigned long start, unsigned long end, pgd_t *pgd_base) { unsigned long vaddr; - pgd_t *pgd_base = swapper_pg_dir; + for (vaddr = start; vaddr != end; vaddr += PAGE_SIZE) + prepare_pagetables(pgd_base, vaddr); +} + +void setup_identity_mappings(pgd_t *pgd_base, unsigned long start, unsigned long end) +{ + unsigned long vaddr; + pgd_t *pgd; + int i, j, k; + pmd_t *pmd; + pte_t *pte, *pte_base; + + pgd = pgd_base; + + for (i = 0; i < PTRS_PER_PGD; pgd++, i++) { + vaddr = i*PGDIR_SIZE; + if (end && (vaddr >= end)) + break; + pmd = pmd_offset(pgd, 0); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE; + if (end && (vaddr >= end)) + break; + if (vaddr < start) + continue; + if (cpu_has_pse) { + unsigned long __pe; + + set_in_cr4(X86_CR4_PSE); + boot_cpu_data.wp_works_ok = 1; + __pe = _KERNPG_TABLE + _PAGE_PSE + vaddr - start; + /* Make it "global" too if supported */ + if (cpu_has_pge) { + set_in_cr4(X86_CR4_PGE); +#if !defined(CONFIG_X86_SWITCH_PAGETABLES) + __pe += _PAGE_GLOBAL; + __PAGE_KERNEL |= _PAGE_GLOBAL; +#endif + } + set_pmd(pmd, __pmd(__pe)); + continue; + } + if (!pmd_present(*pmd)) + pte_base = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + else + pte_base = (pte_t *) page_address(pmd_page(*pmd)); + pte = pte_base; + for (k = 0; k < PTRS_PER_PTE; pte++, k++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE + k*PAGE_SIZE; + if (end && (vaddr >= end)) + break; + if (vaddr < start) + continue; + *pte = mk_pte_phys(vaddr-start, PAGE_KERNEL); + } + set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base))); + } + } +} + +static void __init pagetable_init (void) +{ + unsigned long vaddr, end; + pgd_t *pgd_base; #ifdef CONFIG_X86_PAE int i; - /* Init entries of the first-level page table to the zero page */ - for (i = 0; i < PTRS_PER_PGD; i++) - set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); #endif - /* Enable PSE if available */ - if (cpu_has_pse) { - set_in_cr4(X86_CR4_PSE); - } + /* + * This can be zero as well - no problem, in that case we exit + * the loops anyway due to the PTRS_PER_* conditions. + */ + end = (unsigned long)__va(max_low_pfn*PAGE_SIZE); - /* Enable PGE if available */ - if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); - __PAGE_KERNEL |= _PAGE_GLOBAL; + pgd_base = swapper_pg_dir; +#ifdef CONFIG_X86_PAE + /* + * It causes too many problems if there's no proper pmd set up + * for all 4 entries of the PGD - so we allocate all of them. + * PAE systems will not miss this extra 4-8K anyway ... + */ + for (i = 0; i < PTRS_PER_PGD; i++) { + pmd_t *pmd = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + set_pgd(pgd_base + i, __pgd(__pa(pmd) + 0x1)); } +#endif + /* + * Set up lowmem-sized identity mappings at PAGE_OFFSET: + */ + setup_identity_mappings(pgd_base, PAGE_OFFSET, end); - kernel_physical_mapping_init(pgd_base); + /* + * Add flat-mode identity-mappings - SMP needs it when + * starting up on an AP from real-mode. (In the non-PAE + * case we already have these mappings through head.S.) + * All user-space mappings are explicitly cleared after + * SMP startup. + */ +#if defined(CONFIG_SMP) && defined(CONFIG_X86_PAE) + setup_identity_mappings(pgd_base, 0, 16*1024*1024); +#endif remap_numa_kva(); /* @@ -312,38 +269,64 @@ static void __init pagetable_init (void) * created - mappings will be set by set_fixmap(): */ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - page_table_range_init(vaddr, 0, pgd_base); + fixrange_init(vaddr, 0, pgd_base); - permanent_kmaps_init(pgd_base); +#ifdef CONFIG_HIGHMEM + { + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; -#ifdef CONFIG_X86_PAE - /* - * Add low memory identity-mappings - SMP needs it when - * starting up on an AP from real-mode. In the non-PAE - * case we already have these mappings through head.S. - * All user-space mappings are explicitly cleared after - * SMP startup. - */ - pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; + /* + * Permanent kmaps: + */ + vaddr = PKMAP_BASE; + fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); + + pgd = swapper_pg_dir + pgd_index(vaddr); + pmd = pmd_offset(pgd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + pkmap_page_table = pte; + } #endif } -void zap_low_mappings (void) +/* + * Clear kernel pagetables in a PMD_SIZE-aligned range. + */ +static void clear_mappings(pgd_t *pgd_base, unsigned long start, unsigned long end) { - int i; + unsigned long vaddr; + pgd_t *pgd; + pmd_t *pmd; + int i, j; + + pgd = pgd_base; + + for (i = 0; i < PTRS_PER_PGD; pgd++, i++) { + vaddr = i*PGDIR_SIZE; + if (end && (vaddr >= end)) + break; + pmd = pmd_offset(pgd, 0); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE; + if (end && (vaddr >= end)) + break; + if (vaddr < start) + continue; + pmd_clear(pmd); + } + } + flush_tlb_all(); +} + +void zap_low_mappings(void) +{ + printk("zapping low mappings.\n"); /* * Zap initial low-memory mappings. - * - * Note that "pgd_clear()" doesn't do it for - * us, because pgd_clear() is a no-op on i386. */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) -#ifdef CONFIG_X86_PAE - set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); -#else - set_pgd(swapper_pg_dir+i, __pgd(0)); -#endif - flush_tlb_all(); + clear_mappings(swapper_pg_dir, 0, 16*1024*1024); } #ifndef CONFIG_DISCONTIGMEM @@ -393,7 +376,15 @@ void __init paging_init(void) set_in_cr4(X86_CR4_PAE); #endif __flush_tlb_all(); - + /* + * Subtle. SMP is doing it's boot stuff late (because it has to + * fork idle threads) - but it also needs low mappings for the + * protected-mode entry to work. We zap these entries only after + * the WP-bit has been tested. + */ +#ifndef CONFIG_SMP + zap_low_mappings(); +#endif kmap_init(); zone_sizes_init(); } @@ -512,38 +503,57 @@ void __init mem_init(void) if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); - /* - * Subtle. SMP is doing it's boot stuff late (because it has to - * fork idle threads) - but it also needs low mappings for the - * protected-mode entry to work. We zap these entries only after - * the WP-bit has been tested. - */ -#ifndef CONFIG_SMP - zap_low_mappings(); -#endif + entry_trampoline_setup(); + default_ldt_page = virt_to_page(default_ldt); + load_LDT(&init_mm.context); } -kmem_cache_t *pgd_cache; -kmem_cache_t *pmd_cache; +kmem_cache_t *pgd_cache, *pmd_cache, *kpmd_cache; void __init pgtable_cache_init(void) { + void (*ctor)(void *, kmem_cache_t *, unsigned long); + void (*dtor)(void *, kmem_cache_t *, unsigned long); + if (PTRS_PER_PMD > 1) { pmd_cache = kmem_cache_create("pmd", PTRS_PER_PMD*sizeof(pmd_t), + PTRS_PER_PMD*sizeof(pmd_t), 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, pmd_ctor, NULL); if (!pmd_cache) panic("pgtable_cache_init(): cannot create pmd cache"); + + if (TASK_SIZE > PAGE_OFFSET) { + kpmd_cache = kmem_cache_create("kpmd", + PTRS_PER_PMD*sizeof(pmd_t), + PTRS_PER_PMD*sizeof(pmd_t), + 0, + kpmd_ctor, + NULL); + if (!kpmd_cache) + panic("pgtable_cache_init(): " + "cannot create kpmd cache"); + } } + + if (PTRS_PER_PMD == 1 || TASK_SIZE <= PAGE_OFFSET) + ctor = pgd_ctor; + else + ctor = NULL; + + if (PTRS_PER_PMD == 1 && TASK_SIZE <= PAGE_OFFSET) + dtor = pgd_dtor; + else + dtor = NULL; + pgd_cache = kmem_cache_create("pgd", PTRS_PER_PGD*sizeof(pgd_t), + PTRS_PER_PGD*sizeof(pgd_t), 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, - pgd_ctor, - PTRS_PER_PMD == 1 ? pgd_dtor : NULL); + ctor, + dtor); if (!pgd_cache) panic("pgtable_cache_init(): Cannot create pgd cache"); } --- linux-2.6.5-rc3/arch/i386/mm/pageattr.c 2003-08-08 22:55:10.000000000 -0700 +++ 25/arch/i386/mm/pageattr.c 2004-03-31 00:56:34.406442368 -0800 @@ -71,11 +71,11 @@ static void set_pmd_pte(pte_t *kpte, uns unsigned long flags; set_pte_atomic(kpte, pte); /* change init_mm */ - if (PTRS_PER_PMD > 1) + if (PTRS_PER_PMD > 1 || TASK_SIZE > PAGE_OFFSET) return; spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { + for (page = pgd_list; page; page = (struct page *)page->index) { pgd_t *pgd; pmd_t *pmd; pgd = (pgd_t *)page_address(page) + pgd_index(address); @@ -135,7 +135,7 @@ __change_page_attr(struct page *page, pg } if (cpu_has_pse && (atomic_read(&kpte_page->count) == 1)) { - list_add(&kpte_page->list, &df_list); + list_add(&kpte_page->lru, &df_list); revert_page(kpte_page, address); } return 0; @@ -188,7 +188,7 @@ void global_flush_tlb(void) flush_map(); n = l.next; while (n != &l) { - struct page *pg = list_entry(n, struct page, list); + struct page *pg = list_entry(n, struct page, lru); n = n->next; __free_page(pg); } --- linux-2.6.5-rc3/arch/i386/mm/pgtable.c 2003-11-09 16:45:05.000000000 -0800 +++ 25/arch/i386/mm/pgtable.c 2004-03-31 00:56:34.407442216 -0800 @@ -21,6 +21,7 @@ #include #include #include +#include void show_mem(void) { @@ -157,11 +158,20 @@ void pmd_ctor(void *pmd, kmem_cache_t *c memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); } +void kpmd_ctor(void *__pmd, kmem_cache_t *cache, unsigned long flags) +{ + pmd_t *kpmd, *pmd; + kpmd = pmd_offset(&swapper_pg_dir[PTRS_PER_PGD-1], + (PTRS_PER_PMD - NR_SHARED_PMDS)*PMD_SIZE); + pmd = (pmd_t *)__pmd + (PTRS_PER_PMD - NR_SHARED_PMDS); + + memset(__pmd, 0, (PTRS_PER_PMD - NR_SHARED_PMDS)*sizeof(pmd_t)); + memcpy(pmd, kpmd, NR_SHARED_PMDS*sizeof(pmd_t)); +} + /* - * List of all pgd's needed for non-PAE so it can invalidate entries - * in both cached and uncached pgd's; not needed for PAE since the - * kernel pmd is shared. If PAE were not to share the pmd a similar - * tactic would be needed. This is essentially codepath-based locking + * List of all pgd's needed so it can invalidate entries in both cached + * and uncached pgd's. This is essentially codepath-based locking * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. @@ -169,37 +179,75 @@ void pmd_ctor(void *pmd, kmem_cache_t *c * checks at dup_mmap(), exec(), and other mmlist addition points * could be used. The locking scheme was chosen on the basis of * manfred's recommendations and having no core impact whatsoever. + * + * Lexicon for #ifdefless conditions to config options: + * (a) PTRS_PER_PMD == 1 means non-PAE. + * (b) PTRS_PER_PMD > 1 means PAE. + * (c) TASK_SIZE > PAGE_OFFSET means 4:4. + * (d) TASK_SIZE <= PAGE_OFFSET means non-4:4. * -- wli */ spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED; -LIST_HEAD(pgd_list); +struct page *pgd_list; + +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + page->index = (unsigned long)pgd_list; + if (pgd_list) + pgd_list->private = (unsigned long)&page->index; + pgd_list = page; + page->private = (unsigned long)&pgd_list; +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *next, **pprev, *page = virt_to_page(pgd); + next = (struct page *)page->index; + pprev = (struct page **)page->private; + *pprev = next; + if (next) + next->private = (unsigned long)pprev; +} -void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) +void pgd_ctor(void *__pgd, kmem_cache_t *cache, unsigned long unused) { + pgd_t *pgd = __pgd; unsigned long flags; - if (PTRS_PER_PMD == 1) - spin_lock_irqsave(&pgd_lock, flags); + if (PTRS_PER_PMD == 1) { + if (TASK_SIZE <= PAGE_OFFSET) + spin_lock_irqsave(&pgd_lock, flags); + else + memcpy(&pgd[PTRS_PER_PGD - NR_SHARED_PMDS], + &swapper_pg_dir[PTRS_PER_PGD - NR_SHARED_PMDS], + NR_SHARED_PMDS*sizeof(pgd_t)); + } - memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + if (TASK_SIZE <= PAGE_OFFSET) + memcpy(&pgd[USER_PTRS_PER_PGD], + &swapper_pg_dir[USER_PTRS_PER_PGD], + (PTRS_PER_PGD - USER_PTRS_PER_PGD)*sizeof(pgd_t)); if (PTRS_PER_PMD > 1) return; - list_add(&virt_to_page(pgd)->lru, &pgd_list); - spin_unlock_irqrestore(&pgd_lock, flags); - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + if (TASK_SIZE > PAGE_OFFSET) + memset(pgd, 0, (PTRS_PER_PGD - NR_SHARED_PMDS)*sizeof(pgd_t)); + else { + pgd_list_add(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + } } -/* never called when PTRS_PER_PMD > 1 */ +/* Never called when PTRS_PER_PMD > 1 || TASK_SIZE > PAGE_OFFSET */ void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) { unsigned long flags; /* can be called from interrupt context */ spin_lock_irqsave(&pgd_lock, flags); - list_del(&virt_to_page(pgd)->lru); + pgd_list_del(pgd); spin_unlock_irqrestore(&pgd_lock, flags); } @@ -211,15 +259,31 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (PTRS_PER_PMD == 1 || !pgd) return pgd; + /* + * In the 4G userspace case alias the top 16 MB virtual + * memory range into the user mappings as well (these + * include the trampoline and CPU data structures). + */ for (i = 0; i < USER_PTRS_PER_PGD; ++i) { - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + pmd_t *pmd; + + if (TASK_SIZE > PAGE_OFFSET && i == USER_PTRS_PER_PGD - 1) + pmd = kmem_cache_alloc(kpmd_cache, GFP_KERNEL); + else + pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + if (!pmd) goto out_oom; set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd)))); } - return pgd; + return pgd; out_oom: + /* + * we don't have to handle the kpmd_cache here, since it's the + * last allocation, and has either nothing to free or when it + * succeeds the whole operation succeeds. + */ for (i--; i >= 0; i--) kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); kmem_cache_free(pgd_cache, pgd); @@ -230,10 +294,26 @@ void pgd_free(pgd_t *pgd) { int i; - /* in the PAE case user pgd entries are overwritten before usage */ - if (PTRS_PER_PMD > 1) - for (i = 0; i < USER_PTRS_PER_PGD; ++i) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); /* in the non-PAE case, clear_page_tables() clears user pgd entries */ + if (PTRS_PER_PMD == 1) + goto out_free; + + /* in the PAE case user pgd entries are overwritten before usage */ + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + pmd_t *pmd = __va(pgd_val(pgd[i]) - 1); + + /* + * only userspace pmd's are cleared for us + * by mm/memory.c; it's a slab cache invariant + * that we must separate the kernel pmd slab + * all times, else we'll have bad pmd's. + */ + if (TASK_SIZE > PAGE_OFFSET && i == USER_PTRS_PER_PGD - 1) + kmem_cache_free(kpmd_cache, pmd); + else + kmem_cache_free(pmd_cache, pmd); + } +out_free: kmem_cache_free(pgd_cache, pgd); } + --- linux-2.6.5-rc3/arch/i386/oprofile/op_model_p4.c 2003-08-22 19:23:40.000000000 -0700 +++ 25/arch/i386/oprofile/op_model_p4.c 2004-03-31 00:54:25.013113144 -0800 @@ -382,11 +382,8 @@ static struct p4_event_binding p4_events static unsigned int get_stagger(void) { #ifdef CONFIG_SMP - int cpu; - if (smp_num_siblings > 1) { - cpu = smp_processor_id(); - return (cpu_sibling_map[cpu] > cpu) ? 0 : 1; - } + int cpu = smp_processor_id(); + return (cpu != first_cpu(cpu_sibling_map[cpu])); #endif return 0; } --- linux-2.6.5-rc3/arch/ia64/configs/generic_defconfig 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ia64/configs/generic_defconfig 2004-03-30 20:21:49.000000000 -0800 @@ -976,7 +976,6 @@ CONFIG_USB_HIDINPUT=y # CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set # CONFIG_USB_LEGOTOWER is not set -# CONFIG_USB_BRLVGER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_TEST is not set # CONFIG_USB_GADGET is not set --- linux-2.6.5-rc3/arch/ia64/configs/zx1_defconfig 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ia64/configs/zx1_defconfig 2004-03-30 20:21:49.000000000 -0800 @@ -1028,7 +1028,6 @@ CONFIG_USB_HIDDEV=y # CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set # CONFIG_USB_LEGOTOWER is not set -# CONFIG_USB_BRLVGER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_LED is not set --- linux-2.6.5-rc3/arch/ia64/defconfig 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ia64/defconfig 2004-03-30 20:21:49.000000000 -0800 @@ -906,7 +906,6 @@ CONFIG_USB_HIDDEV=y # CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set # CONFIG_USB_LEGOTOWER is not set -# CONFIG_USB_BRLVGER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_LED is not set # CONFIG_USB_TEST is not set --- linux-2.6.5-rc3/arch/ia64/ia32/binfmt_elf32.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/ia64/ia32/binfmt_elf32.c 2004-03-31 00:54:56.458332744 -0800 @@ -35,7 +35,7 @@ extern void ia64_elf32_init (struct pt_r static void elf32_set_personality (void); -#define setup_arg_pages(bprm) ia32_setup_arg_pages(bprm) +#define setup_arg_pages(bprm,exec) ia32_setup_arg_pages(bprm,exec) #define elf_map elf32_map #undef SET_PERSONALITY @@ -149,7 +149,7 @@ ia64_elf32_init (struct pt_regs *regs) } int -ia32_setup_arg_pages (struct linux_binprm *bprm) +ia32_setup_arg_pages (struct linux_binprm *bprm, int executable_stack) { unsigned long stack_base; struct vm_area_struct *mpnt; @@ -178,8 +178,14 @@ ia32_setup_arg_pages (struct linux_binpr mpnt->vm_mm = current->mm; mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; mpnt->vm_end = IA32_STACK_TOP; - mpnt->vm_page_prot = PAGE_COPY; - mpnt->vm_flags = VM_STACK_FLAGS; + if (executable_stack == EXSTACK_ENABLE_X) + mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC; + else if (executable_stack == EXSTACK_DISABLE_X) + mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC; + else + mpnt->vm_flags = VM_STACK_FLAGS; + mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC)? + PAGE_COPY_EXEC: PAGE_COPY; mpnt->vm_ops = NULL; mpnt->vm_pgoff = 0; mpnt->vm_file = NULL; @@ -192,7 +198,7 @@ ia32_setup_arg_pages (struct linux_binpr struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current, page, stack_base, PAGE_COPY); + put_dirty_page(current, page, stack_base, mpnt->vm_page_prot); } stack_base += PAGE_SIZE; } --- linux-2.6.5-rc3/arch/ia64/ia32/ia32_entry.S 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ia64/ia32/ia32_entry.S 2004-03-31 00:56:31.546877088 -0800 @@ -465,7 +465,7 @@ ia32_syscall_table: data8 sys_epoll_create data8 sys32_epoll_ctl /* 255 */ data8 sys32_epoll_wait - data8 sys_remap_file_pages + data8 old_remap_file_pages data8 sys_set_tid_address data8 sys32_timer_create data8 compat_timer_settime /* 260 */ @@ -484,5 +484,12 @@ ia32_syscall_table: data8 sys_ni_syscall data8 sys_ni_syscall + data8 sys_ni_syscall /* 275 */ + data8 sys_ni_syscall + data8 sys_ni_syscall + data8 sys_ni_syscall + data8 sys_ni_syscall + data8 sys_remap_file_pages /* 280 */ + // guard against failures to increase IA32_NR_syscalls .org ia32_syscall_table + 8*IA32_NR_syscalls --- linux-2.6.5-rc3/arch/ia64/ia32/ia32priv.h 2004-01-09 00:04:30.000000000 -0800 +++ 25/arch/ia64/ia32/ia32priv.h 2004-03-31 00:54:56.459332592 -0800 @@ -494,7 +494,7 @@ struct ia32_user_desc { struct linux_binprm; extern void ia32_init_addr_space (struct pt_regs *regs); -extern int ia32_setup_arg_pages (struct linux_binprm *bprm); +extern int ia32_setup_arg_pages (struct linux_binprm *bprm, int exec_stack); extern unsigned long ia32_do_mmap (struct file *, unsigned long, unsigned long, int, int, loff_t); extern void ia32_load_segment_descriptors (struct task_struct *task); --- linux-2.6.5-rc3/arch/ia64/ia32/ia32_signal.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/ia64/ia32/ia32_signal.c 2004-03-31 00:54:54.874573512 -0800 @@ -1,7 +1,7 @@ /* * IA32 Architecture-specific signal handling support. * - * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co + * Copyright (C) 1999, 2001-2002, 2004 Hewlett-Packard Co * David Mosberger-Tang * Copyright (C) 1999 Arun Sharma * Copyright (C) 2000 VA Linux Co @@ -815,7 +815,7 @@ restore_sigcontext_ia32 (struct pt_regs * Determine which stack to use.. */ static inline void * -get_sigframe (struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) +get_sigframe (struct k_sigaction *ka_copy, struct pt_regs * regs, size_t frame_size) { unsigned long esp; @@ -823,7 +823,7 @@ get_sigframe (struct k_sigaction *ka, st esp = (unsigned int) regs->r12; /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { + if (ka_copy->sa.sa_flags & SA_ONSTACK) { if (!on_sig_stack(esp)) esp = current->sas_ss_sp + current->sas_ss_size; } @@ -832,17 +832,40 @@ get_sigframe (struct k_sigaction *ka, st return (void *)((esp - frame_size) & -8ul); } +static long +force_sigsegv (int sig) +{ + unsigned long flags; + + if (sig == SIGSEGV) { + /* + * Acquiring siglock around the sa_handler-update is almost + * certainly overkill, but this isn't a + * performance-critical path and I'd rather play it safe + * here than having to debug a nasty race if and when + * something changes in kernel/signal.c that would make it + * no longer safe to modify sa_handler without holding the + * lock. + */ + spin_lock_irqsave(¤t->sighand->siglock, flags); + current->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + } + force_sig(SIGSEGV, current); + return 0; +} + static int -setup_frame_ia32 (int sig, struct k_sigaction *ka, sigset_t *set, struct pt_regs * regs) +setup_frame_ia32 (int sig, struct k_sigaction *ka_copy, sigset_t *set, struct pt_regs * regs) { struct exec_domain *ed = current_thread_info()->exec_domain; struct sigframe_ia32 *frame; int err = 0; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka_copy, regs, sizeof(*frame)); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return force_sigsegv(sig); err |= __put_user((ed && ed->signal_invmap && sig < 32 ? (int)(ed->signal_invmap[sig]) : sig), &frame->sig); @@ -855,8 +878,8 @@ setup_frame_ia32 (int sig, struct k_siga /* Set up to return from userspace. If provided, use a stub already in userspace. */ - if (ka->sa.sa_flags & SA_RESTORER) { - unsigned int restorer = IA32_SA_RESTORER(ka); + if (ka_copy->sa.sa_flags & SA_RESTORER) { + unsigned int restorer = IA32_SA_RESTORER(ka_copy); err |= __put_user(restorer, &frame->pretcode); } else { err |= __put_user((long)frame->retcode, &frame->pretcode); @@ -868,11 +891,11 @@ setup_frame_ia32 (int sig, struct k_siga } if (err) - goto give_sigsegv; + return force_sigsegv(sig); /* Set up registers for signal handler */ regs->r12 = (unsigned long) frame; - regs->cr_iip = IA32_SA_HANDLER(ka); + regs->cr_iip = IA32_SA_HANDLER(ka_copy); set_fs(USER_DS); @@ -880,32 +903,26 @@ setup_frame_ia32 (int sig, struct k_siga regs->eflags &= ~TF_MASK; #endif -#if 0 +#if DEBUG_SIG printk("SIG deliver (%s:%d): sig=%d sp=%p pc=%lx ra=%x\n", current->comm, current->pid, sig, (void *) frame, regs->cr_iip, frame->pretcode); #endif return 1; - - give_sigsegv: - if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; - force_sig(SIGSEGV, current); - return 0; } static int -setup_rt_frame_ia32 (int sig, struct k_sigaction *ka, siginfo_t *info, +setup_rt_frame_ia32 (int sig, struct k_sigaction *ka_copy, siginfo_t *info, sigset_t *set, struct pt_regs * regs) { struct exec_domain *ed = current_thread_info()->exec_domain; struct rt_sigframe_ia32 *frame; int err = 0; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka_copy, regs, sizeof(*frame)); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return force_sigsegv(sig); err |= __put_user((ed && ed->signal_invmap && sig < 32 ? ed->signal_invmap[sig] : sig), &frame->sig); @@ -922,12 +939,12 @@ setup_rt_frame_ia32 (int sig, struct k_s err |= setup_sigcontext_ia32(&frame->uc.uc_mcontext, &frame->fpstate, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) - goto give_sigsegv; + return force_sigsegv(sig); /* Set up to return from userspace. If provided, use a stub already in userspace. */ - if (ka->sa.sa_flags & SA_RESTORER) { - unsigned int restorer = IA32_SA_RESTORER(ka); + if (ka_copy->sa.sa_flags & SA_RESTORER) { + unsigned int restorer = IA32_SA_RESTORER(ka_copy); err |= __put_user(restorer, &frame->pretcode); } else { err |= __put_user((long)frame->retcode, &frame->pretcode); @@ -938,11 +955,11 @@ setup_rt_frame_ia32 (int sig, struct k_s } if (err) - goto give_sigsegv; + return force_sigsegv(sig); /* Set up registers for signal handler */ regs->r12 = (unsigned long) frame; - regs->cr_iip = IA32_SA_HANDLER(ka); + regs->cr_iip = IA32_SA_HANDLER(ka_copy); set_fs(USER_DS); @@ -950,29 +967,23 @@ setup_rt_frame_ia32 (int sig, struct k_s regs->eflags &= ~TF_MASK; #endif -#if 0 +#if DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%x\n", current->comm, current->pid, (void *) frame, regs->cr_iip, frame->pretcode); #endif return 1; - -give_sigsegv: - if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; - force_sig(SIGSEGV, current); - return 0; } int -ia32_setup_frame1 (int sig, struct k_sigaction *ka, siginfo_t *info, +ia32_setup_frame1 (int sig, struct k_sigaction *ka_copy, siginfo_t *info, sigset_t *set, struct pt_regs *regs) { /* Set up the stack frame */ - if (ka->sa.sa_flags & SA_SIGINFO) - return setup_rt_frame_ia32(sig, ka, info, set, regs); + if (ka_copy->sa.sa_flags & SA_SIGINFO) + return setup_rt_frame_ia32(sig, ka_copy, info, set, regs); else - return setup_frame_ia32(sig, ka, set, regs); + return setup_frame_ia32(sig, ka_copy, set, regs); } asmlinkage long --- linux-2.6.5-rc3/arch/ia64/ia32/sys_ia32.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ia64/ia32/sys_ia32.c 2004-03-31 00:54:51.658062496 -0800 @@ -1090,10 +1090,11 @@ asmlinkage long sys32_time (int *tloc) { int i; + struct timeval tv; + + do_gettimeofday(&tv); + i = tv.tv_sec; - /* SMP: This is fairly trivial. We grab CURRENT_TIME and - stuff it to user space. No side effects */ - i = get_seconds(); if (tloc) { if (put_user(i, tloc)) i = -EFAULT; @@ -2103,7 +2104,8 @@ sys32_brk (unsigned int brk) } /* - * Exactly like fs/open.c:sys_open(), except that it doesn't set the O_LARGEFILE flag. + * Exactly like fs/open.c:sys_open(), except that it doesn't set the + * O_KERNEL_LARGEFILE flag. */ asmlinkage long sys32_open (const char * filename, int flags, int mode) --- linux-2.6.5-rc3/arch/ia64/Kconfig 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ia64/Kconfig 2004-03-31 00:56:32.625713080 -0800 @@ -493,6 +493,13 @@ config DEBUG_INFO Say Y here only if you plan to use gdb to debug the kernel. If you don't debug the kernel, you can say N. +config LOCKMETER + bool "Kernel lock metering" + depends on SMP + help + Say Y to enable kernel lock metering, which adds overhead to SMP locks, + but allows you to see various statistics using the lockstat command. + config SYSVIPC_COMPAT bool depends on COMPAT && SYSVIPC --- linux-2.6.5-rc3/arch/ia64/kernel/efi.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ia64/kernel/efi.c 2004-03-31 00:55:03.682234544 -0800 @@ -469,6 +469,19 @@ efi_map_pal_code (void) } } +static int __init +early_mem(char *cp) +{ + mem_limit = memparse(cp, &cp) - 1; + + if (mem_limit != ~0UL) + printk(KERN_INFO "Ignoring memory above %luMB\n", + mem_limit >> 20); + + return 0; +} +__early_param("mem=", early_mem); + void __init efi_init (void) { @@ -476,28 +489,9 @@ efi_init (void) efi_config_table_t *config_tables; efi_char16_t *c16; u64 efi_desc_size; - char *cp, *end, vendor[100] = "unknown"; - extern char saved_command_line[]; + char vendor[100] = "unknown"; int i; - /* it's too early to be able to use the standard kernel command line support... */ - for (cp = saved_command_line; *cp; ) { - if (memcmp(cp, "mem=", 4) == 0) { - cp += 4; - mem_limit = memparse(cp, &end) - 1; - if (end != cp) - break; - cp = end; - } else { - while (*cp != ' ' && *cp) - ++cp; - while (*cp == ' ') - ++cp; - } - } - if (mem_limit != ~0UL) - printk(KERN_INFO "Ignoring memory above %luMB\n", mem_limit >> 20); - efi.systab = __va(ia64_boot_param->efi_systab); /* --- linux-2.6.5-rc3/arch/ia64/kernel/entry.S 2004-02-03 20:42:34.000000000 -0800 +++ 25/arch/ia64/kernel/entry.S 2004-03-31 00:56:31.547876936 -0800 @@ -1367,7 +1367,7 @@ sys_call_table: data8 sys_ni_syscall /* was: ia64_oldfstat */ data8 sys_vhangup data8 sys_lchown - data8 sys_remap_file_pages // 1125 + data8 old_remap_file_pages // 1125 data8 sys_wait4 data8 sys_sysinfo data8 sys_clone @@ -1501,7 +1501,7 @@ sys_call_table: data8 sys_clock_nanosleep data8 sys_fstatfs64 data8 sys_statfs64 - data8 sys_ni_syscall + data8 sys_remap_file_pages data8 sys_ni_syscall // 1260 data8 sys_ni_syscall data8 sys_ni_syscall --- linux-2.6.5-rc3/arch/ia64/kernel/setup.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ia64/kernel/setup.c 2004-03-31 00:55:03.683234392 -0800 @@ -88,10 +88,6 @@ unsigned char aux_device_present = 0xaa; unsigned long ia64_max_iommu_merge_mask = ~0UL; EXPORT_SYMBOL(ia64_max_iommu_merge_mask); -#define COMMAND_LINE_SIZE 512 - -char saved_command_line[COMMAND_LINE_SIZE]; /* used in proc filesystem */ - /* * We use a special marker for the end of memory and it uses the extra (+1) slot */ @@ -288,7 +284,7 @@ setup_arch (char **cmdline_p) ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist); *cmdline_p = __va(ia64_boot_param->command_line); - strlcpy(saved_command_line, *cmdline_p, sizeof(saved_command_line)); + parse_early_options(cmdline_p); efi_init(); io_port_init(); --- linux-2.6.5-rc3/arch/ia64/kernel/signal.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/ia64/kernel/signal.c 2004-03-31 00:54:54.876573208 -0800 @@ -1,7 +1,7 @@ /* * Architecture-specific signal handling support. * - * Copyright (C) 1999-2003 Hewlett-Packard Co + * Copyright (C) 1999-2004 Hewlett-Packard Co * David Mosberger-Tang * * Derived from i386 and Alpha versions. @@ -397,18 +397,47 @@ rbs_on_sig_stack (unsigned long bsp) } static long -setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, +force_sigsegv (int sig, void *addr) +{ + unsigned long flags; + struct siginfo si; + + if (sig == SIGSEGV) { + /* + * Acquiring siglock around the sa_handler-update is almost + * certainly overkill, but this isn't a + * performance-critical path and I'd rather play it safe + * here than having to debug a nasty race if and when + * something changes in kernel/signal.c that would make it + * no longer safe to modify sa_handler without holding the + * lock. + */ + spin_lock_irqsave(¤t->sighand->siglock, flags); + current->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + } + si.si_signo = SIGSEGV; + si.si_errno = 0; + si.si_code = SI_KERNEL; + si.si_pid = current->pid; + si.si_uid = current->uid; + si.si_addr = addr; + force_sig_info(SIGSEGV, &si, current); + return 0; +} + +static long +setup_frame (int sig, struct k_sigaction *ka_copy, siginfo_t *info, sigset_t *set, struct sigscratch *scr) { extern char __kernel_sigtramp[]; unsigned long tramp_addr, new_rbs = 0; struct sigframe *frame; - struct siginfo si; long err; frame = (void *) scr->pt.r12; tramp_addr = (unsigned long) __kernel_sigtramp; - if ((ka->sa.sa_flags & SA_ONSTACK) && sas_ss_flags((unsigned long) frame) == 0) { + if ((ka_copy->sa.sa_flags & SA_ONSTACK) && sas_ss_flags((unsigned long) frame) == 0) { frame = (void *) ((current->sas_ss_sp + current->sas_ss_size) & ~(STACK_ALIGN - 1)); /* @@ -422,14 +451,14 @@ setup_frame (int sig, struct k_sigaction frame = (void *) frame - ((sizeof(*frame) + STACK_ALIGN - 1) & ~(STACK_ALIGN - 1)); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return force_sigsegv(sig, frame); err = __put_user(sig, &frame->arg0); err |= __put_user(&frame->info, &frame->arg1); err |= __put_user(&frame->sc, &frame->arg2); err |= __put_user(new_rbs, &frame->sc.sc_rbs_base); err |= __put_user(0, &frame->sc.sc_loadrs); /* initialize to zero */ - err |= __put_user(ka->sa.sa_handler, &frame->handler); + err |= __put_user(ka_copy->sa.sa_handler, &frame->handler); err |= copy_siginfo_to_user(&frame->info, info); @@ -438,8 +467,8 @@ setup_frame (int sig, struct k_sigaction err |= __put_user(sas_ss_flags(scr->pt.r12), &frame->sc.sc_stack.ss_flags); err |= setup_sigcontext(&frame->sc, set, scr); - if (err) - goto give_sigsegv; + if (unlikely(err)) + return force_sigsegv(sig, frame); scr->pt.r12 = (unsigned long) frame - 16; /* new stack pointer */ scr->pt.ar_fpsr = FPSR_DEFAULT; /* reset fpsr for signal handler */ @@ -466,40 +495,25 @@ setup_frame (int sig, struct k_sigaction current->comm, current->pid, sig, scr->pt.r12, frame->sc.sc_ip, frame->handler); #endif return 1; - - give_sigsegv: - if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; - si.si_signo = SIGSEGV; - si.si_errno = 0; - si.si_code = SI_KERNEL; - si.si_pid = current->pid; - si.si_uid = current->uid; - si.si_addr = frame; - force_sig_info(SIGSEGV, &si, current); - return 0; } static long -handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, +handle_signal (unsigned long sig, struct k_sigaction *ka_copy, siginfo_t *info, sigset_t *oldset, struct sigscratch *scr) { if (IS_IA32_PROCESS(&scr->pt)) { /* send signal to IA-32 process */ - if (!ia32_setup_frame1(sig, ka, info, oldset, &scr->pt)) + if (!ia32_setup_frame1(sig, ka_copy, info, oldset, &scr->pt)) return 0; } else /* send signal to IA-64 process */ - if (!setup_frame(sig, ka, info, oldset, scr)) + if (!setup_frame(sig, ka_copy, info, oldset, scr)) return 0; - if (ka->sa.sa_flags & SA_ONESHOT) - ka->sa.sa_handler = SIG_DFL; - - if (!(ka->sa.sa_flags & SA_NODEFER)) { + if (!(ka_copy->sa.sa_flags & SA_NODEFER)) { spin_lock_irq(¤t->sighand->siglock); { - sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); + sigorsets(¤t->blocked, ¤t->blocked, &ka_copy->sa.sa_mask); sigaddset(¤t->blocked, sig); recalc_sigpending(); } @@ -515,7 +529,7 @@ handle_signal (unsigned long sig, struct long ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall) { - struct k_sigaction *ka; + struct k_sigaction ka_copy; siginfo_t info; long restart = in_syscall; long errno = scr->pt.r8; @@ -537,7 +551,7 @@ ia64_do_signal (sigset_t *oldset, struct * need to push through a forced SIGSEGV. */ while (1) { - int signr = get_signal_to_deliver(&info, &scr->pt, NULL); + int signr = get_signal_to_deliver(&info, &ka_copy, &scr->pt, NULL); /* * get_signal_to_deliver() may have run a debugger (via notify_parent()) @@ -564,8 +578,6 @@ ia64_do_signal (sigset_t *oldset, struct if (signr <= 0) break; - ka = ¤t->sighand->action[signr - 1]; - if (unlikely(restart)) { switch (errno) { case ERESTART_RESTARTBLOCK: @@ -575,7 +587,7 @@ ia64_do_signal (sigset_t *oldset, struct break; case ERESTARTSYS: - if ((ka->sa.sa_flags & SA_RESTART) == 0) { + if ((ka_copy.sa.sa_flags & SA_RESTART) == 0) { scr->pt.r8 = ERR_CODE(EINTR); /* note: scr->pt.r10 is already -1 */ break; @@ -594,7 +606,7 @@ ia64_do_signal (sigset_t *oldset, struct * Whee! Actually deliver the signal. If the delivery failed, we need to * continue to iterate in this loop so we can deliver the SIGSEGV... */ - if (handle_signal(signr, ka, &info, oldset, scr)) + if (handle_signal(signr, &ka_copy, &info, oldset, scr)) return 1; } --- linux-2.6.5-rc3/arch/ia64/kernel/unaligned.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ia64/kernel/unaligned.c 2004-03-31 00:54:38.524059168 -0800 @@ -1337,7 +1337,7 @@ ia64_handle_unaligned (unsigned long ifa * be holding locks... */ if (user_mode(regs)) - tty_write_message(current->tty, buf); + tty_write_message(current->signal->tty, buf); buf[len-1] = '\0'; /* drop '\r' */ printk(KERN_WARNING "%s", buf); /* watch for command names containing %s */ } --- linux-2.6.5-rc3/arch/ia64/kernel/vmlinux.lds.S 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ia64/kernel/vmlinux.lds.S 2004-03-31 00:55:03.684234240 -0800 @@ -134,6 +134,12 @@ SECTIONS *(.init.setup) __setup_end = .; } + __early_param : AT(ADDR(__early_param) - LOAD_OFFSET) + { + __early_begin = .; + *(__early_param) + __early_end = .; + } __param : AT(ADDR(__param) - LOAD_OFFSET) { __start___param = .; --- linux-2.6.5-rc3/arch/ia64/lib/dec_and_lock.c 2004-01-09 00:04:31.000000000 -0800 +++ 25/arch/ia64/lib/dec_and_lock.c 2004-03-31 00:56:32.626712928 -0800 @@ -13,6 +13,7 @@ #include #include +#ifndef CONFIG_LOCKMETER /* * Decrement REFCOUNT and if the count reaches zero, acquire the spinlock. Both of these * operations have to be done atomically, so that the count doesn't drop to zero without @@ -40,3 +41,4 @@ atomic_dec_and_lock (atomic_t *refcount, } EXPORT_SYMBOL(atomic_dec_and_lock); +#endif --- linux-2.6.5-rc3/arch/ia64/mm/hugetlbpage.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/ia64/mm/hugetlbpage.c 2004-03-31 00:56:27.576480680 -0800 @@ -32,7 +32,7 @@ static spinlock_t htlbpage_lock = SPIN_L static void enqueue_huge_page(struct page *page) { - list_add(&page->list, + list_add(&page->lru, &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]); } @@ -48,8 +48,8 @@ static struct page *dequeue_huge_page(vo } if (nid >= 0 && nid < MAX_NUMNODES && !list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, struct page, list); - list_del(&page->list); + page = list_entry(hugepage_freelists[nid].next, struct page, lru); + list_del(&page->lru); } return page; } @@ -246,9 +246,8 @@ follow_huge_pmd(struct mm_struct *mm, un void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); - INIT_LIST_HEAD(&page->list); + INIT_LIST_HEAD(&page->lru); spin_lock(&htlbpage_lock); enqueue_huge_page(page); @@ -449,19 +448,19 @@ int try_to_free_low(int count) spin_lock(&htlbpage_lock); list_for_each(p, &hugepage_freelists[0]) { if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; map = NULL; if (++count == 0) break; } - page = list_entry(p, struct page, list); + page = list_entry(p, struct page, lru); if (!PageHighMem(page)) map = page; } if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; count++; @@ -592,6 +591,12 @@ int is_hugepage_mem_enough(size_t size) return 1; } +/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ +unsigned long hugetlb_total_pages(void) +{ + return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE); +} + static struct page *hugetlb_nopage(struct vm_area_struct * area, unsigned long address, int *unused) { BUG(); --- linux-2.6.5-rc3/arch/m68k/atari/config.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/m68k/atari/config.c 2004-03-31 00:55:03.859207640 -0800 @@ -180,7 +180,7 @@ int __init atari_parse_bootinfo(const st /* Parse the Atari-specific switches= option. */ -void __init atari_switches_setup( const char *str, unsigned len ) +static void __init atari_switches_setup( const char *str, unsigned len ) { char switches[len+1]; char *p; @@ -217,6 +217,17 @@ void __init atari_switches_setup( const } } +static int __init early_switches(char *p) +{ + char *q; + + atari_switches_setup(p, (q = strchr(p, ' ' )) ? + (q - p) : strlen(p)); + + return 0; +} +__early_param("switches=", early_switches); + /* * Setup the Atari configuration info --- linux-2.6.5-rc3/arch/m68k/kernel/setup.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/m68k/kernel/setup.c 2004-03-31 00:55:03.860207488 -0800 @@ -61,8 +61,7 @@ struct mem_info m68k_memory[NUM_MEMINFO] static struct mem_info m68k_ramdisk = { 0, 0 }; -static char m68k_command_line[CL_SIZE]; -char saved_command_line[CL_SIZE]; +static char m68k_command_line[COMMAND_LINE_SIZE]; char m68k_debug_device[6] = ""; @@ -196,6 +195,22 @@ static void __init m68k_parse_bootinfo(c #endif } +/* + * "debug=xxx" will enable printing certain kernel messages to some + * machine-specific device. + */ +static int __init early_debug(char *p) +{ + strlcpy(m68k_debug_device, p, sizeof(m68k_debug_device)); + + /* Terminate the arg. */ + if ((p = strchr(m68k_debug_device, ' ' ))) + *p = 0; + + return 0; +} +__early_param("debug=", early_debug); + void __init setup_arch(char **cmdline_p) { extern int _etext, _edata, _end; @@ -203,7 +218,6 @@ void __init setup_arch(char **cmdline_p) unsigned long endmem, startmem; #endif int i; - char *p, *q; if (!MACH_IS_HP300) { /* The bootinfo is located right after the kernel bss */ @@ -244,39 +258,7 @@ void __init setup_arch(char **cmdline_p) init_mm.brk = (unsigned long) &_end; *cmdline_p = m68k_command_line; - memcpy(saved_command_line, *cmdline_p, CL_SIZE); - - /* Parse the command line for arch-specific options. - * For the m68k, this is currently only "debug=xxx" to enable printing - * certain kernel messages to some machine-specific device. - */ - for( p = *cmdline_p; p && *p; ) { - i = 0; - if (!strncmp( p, "debug=", 6 )) { - strlcpy( m68k_debug_device, p+6, sizeof(m68k_debug_device) ); - if ((q = strchr( m68k_debug_device, ' ' ))) *q = 0; - i = 1; - } -#ifdef CONFIG_ATARI - /* This option must be parsed very early */ - if (!strncmp( p, "switches=", 9 )) { - extern void atari_switches_setup( const char *, int ); - atari_switches_setup( p+9, (q = strchr( p+9, ' ' )) ? - (q - (p+9)) : strlen(p+9) ); - i = 1; - } -#endif - - if (i) { - /* option processed, delete it */ - if ((q = strchr( p, ' ' ))) - strcpy( p, q+1 ); - else - *p = 0; - } else { - if ((p = strchr( p, ' ' ))) ++p; - } - } + parse_early_options(cmdline_p); switch (m68k_machtype) { #ifdef CONFIG_AMIGA --- linux-2.6.5-rc3/arch/m68k/kernel/vmlinux-std.lds 2003-09-08 13:58:56.000000000 -0700 +++ 25/arch/m68k/kernel/vmlinux-std.lds 2004-03-31 00:55:03.860207488 -0800 @@ -50,6 +50,9 @@ SECTIONS __setup_start = .; .init.setup : { *(.init.setup) } __setup_end = .; + __early_begin = .; + __early_param : { *(__early_param) } + __early_end = .; __start___param = .; __param : { *(__param) } __stop___param = .; --- linux-2.6.5-rc3/arch/m68k/kernel/vmlinux-sun3.lds 2003-09-08 13:58:56.000000000 -0700 +++ 25/arch/m68k/kernel/vmlinux-sun3.lds 2004-03-31 00:55:03.861207336 -0800 @@ -44,6 +44,9 @@ __init_begin = .; __setup_start = .; .init.setup : { *(.init.setup) } __setup_end = .; + __early_begin = .; + __early_param : { *(__early_param) } + __early_end = .; __start___param = .; __param : { *(__param) } __stop___param = .; --- linux-2.6.5-rc3/arch/m68k/mac/config.c 2004-02-03 20:42:34.000000000 -0800 +++ 25/arch/m68k/mac/config.c 2004-03-31 00:55:03.862207184 -0800 @@ -58,7 +58,7 @@ extern struct mem_info m68k_memory[NUM_M extern struct mem_info m68k_ramdisk; -extern char m68k_command_line[CL_SIZE]; +extern char m68k_command_line[COMMAND_LINE_SIZE]; void *mac_env; /* Loaded by the boot asm */ --- linux-2.6.5-rc3/arch/m68k/mm/memory.c 2003-07-27 12:14:38.000000000 -0700 +++ 25/arch/m68k/mm/memory.c 2004-03-31 00:56:27.866436600 -0800 @@ -29,8 +29,8 @@ typedef struct list_head ptable_desc; static LIST_HEAD(ptable_list); -#define PD_PTABLE(page) ((ptable_desc *)&(virt_to_page(page)->list)) -#define PD_PAGE(ptable) (list_entry(ptable, struct page, list)) +#define PD_PTABLE(page) ((ptable_desc *)&(virt_to_page(page)->lru)) +#define PD_PAGE(ptable) (list_entry(ptable, struct page, lru)) #define PD_MARKBITS(dp) (*(unsigned char *)&PD_PAGE(dp)->index) #define PTABLE_SIZE (PTRS_PER_PMD * sizeof(pmd_t)) --- linux-2.6.5-rc3/arch/m68knommu/kernel/setup.c 2003-07-27 12:14:38.000000000 -0700 +++ 25/arch/m68knommu/kernel/setup.c 2004-03-31 00:55:04.045179368 -0800 @@ -51,8 +51,7 @@ unsigned long rom_length; unsigned long memory_start; unsigned long memory_end; -char command_line[512]; -char saved_command_line[512]; +char command_line[COMMAND_LINE_SIZE]; /* setup some dummy routines */ static void dummy_waitbut(void) @@ -224,10 +223,8 @@ void setup_arch(char **cmdline_p) (int) memory_end, (int) _ramend); #endif - /* Keep a copy of command line */ *cmdline_p = &command_line[0]; - memcpy(saved_command_line, command_line, sizeof(saved_command_line)); - saved_command_line[sizeof(saved_command_line)-1] = 0; + parse_early_options(cmdline_p); #ifdef DEBUG if (strlen(*cmdline_p)) --- linux-2.6.5-rc3/arch/m68knommu/kernel/vmlinux.lds.S 2003-08-22 19:23:40.000000000 -0700 +++ 25/arch/m68knommu/kernel/vmlinux.lds.S 2004-03-31 00:55:04.045179368 -0800 @@ -262,6 +262,9 @@ SECTIONS { __setup_start = .; *(.init.setup) __setup_end = .; + __early_begin = .; + __early_param : { *(__early_param) } + __early_end = .; __start___param = .; *(__param) __stop___param = .; --- linux-2.6.5-rc3/arch/m68k/q40/config.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/m68k/q40/config.c 2004-03-31 00:55:03.862207184 -0800 @@ -64,7 +64,6 @@ void q40_set_vectors (void); extern void q40_mksound(unsigned int /*freq*/, unsigned int /*ticks*/ ); -extern char *saved_command_line; extern char m68k_debug_device[]; static void q40_mem_console_write(struct console *co, const char *b, unsigned int count); --- linux-2.6.5-rc3/arch/mips/cobalt/setup.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/mips/cobalt/setup.c 2004-03-31 00:55:04.210154288 -0800 @@ -31,7 +31,7 @@ extern void cobalt_machine_power_off(voi int cobalt_board_id; -static char my_cmdline[CL_SIZE] = { +static char my_cmdline[COMMAND_LINE_SIZE] = { "console=ttyS0,115200 " #ifdef CONFIG_IP_PNP "ip=on " --- linux-2.6.5-rc3/arch/mips/configs/rm200_defconfig 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/mips/configs/rm200_defconfig 2004-03-30 20:21:49.000000000 -0800 @@ -986,7 +986,6 @@ CONFIG_USB_TIGL=m CONFIG_USB_AUERSWALD=m CONFIG_USB_RIO500=m CONFIG_USB_LEGOTOWER=m -CONFIG_USB_BRLVGER=m CONFIG_USB_LCD=m # CONFIG_USB_LED is not set CONFIG_USB_TEST=m --- linux-2.6.5-rc3/arch/mips/kernel/irixelf.c 2003-09-27 18:57:43.000000000 -0700 +++ 25/arch/mips/kernel/irixelf.c 2004-03-31 00:54:56.461332288 -0800 @@ -688,7 +688,7 @@ static int load_irix_binary(struct linux * change some of these later. */ current->mm->rss = 0; - setup_arg_pages(bprm); + setup_arg_pages(bprm, EXSTACK_DEFAULT); current->mm->start_stack = bprm->p; /* At this point, we assume that the image should be loaded at --- linux-2.6.5-rc3/arch/mips/kernel/setup.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/mips/kernel/setup.c 2004-03-31 00:55:04.211154136 -0800 @@ -71,7 +71,6 @@ EXPORT_SYMBOL(mips_machgroup); struct boot_mem_map boot_mem_map; static char command_line[CL_SIZE]; - char saved_command_line[CL_SIZE]; char arcs_cmdline[CL_SIZE]=CONFIG_CMDLINE; /* @@ -143,57 +142,37 @@ static void __init print_memory_map(void } } -static inline void parse_cmdline_early(void) +/* + * "mem=XXX[kKmM]" defines a memory region from 0 to , overriding + * the determined size. "mem=XXX[KkmM]@YYY[KkmM]" defines a memory region + * from to +, overriding the determined size. + */ +static int __init early_mem(char *from) { - char c = ' ', *to = command_line, *from = saved_command_line; unsigned long start_at, mem_size; int len = 0; - int usermem = 0; - printk("Determined physical RAM map:\n"); - print_memory_map(); + /* + * The user has specified the memory size, so we blow away any + * automatically generated size. + */ + boot_mem_map.nr_map = 0; - for (;;) { - /* - * "mem=XXX[kKmM]" defines a memory region from - * 0 to , overriding the determined size. - * "mem=XXX[KkmM]@YYY[KkmM]" defines a memory region from - * to +, overriding the determined size. - */ - if (c == ' ' && !memcmp(from, "mem=", 4)) { - if (to != command_line) - to--; - /* - * If a user specifies memory size, we - * blow away any automatically generated - * size. - */ - if (usermem == 0) { - boot_mem_map.nr_map = 0; - usermem = 1; - } - mem_size = memparse(from + 4, &from); - if (*from == '@') - start_at = memparse(from + 1, &from); - else - start_at = 0; - add_memory_region(start_at, mem_size, BOOT_MEM_RAM); - } - c = *(from++); - if (!c) - break; - if (CL_SIZE <= ++len) - break; - *(to++) = c; - } - *to = '\0'; + mem_size = memparse(from, &from); - if (usermem) { - printk("User-defined physical RAM map:\n"); - print_memory_map(); - } -} + if (*from == '@') + start_at = memparse(from + 1, &from); + else + start_at = 0; + + add_memory_region(start_at, mem_size, BOOT_MEM_RAM); + printk("User-defined physical RAM map:\n"); + print_memory_map(); + + return 0; +} +__early_param("mem=", early_mem); #define PFN_UP(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) #define PFN_DOWN(x) ((x) >> PAGE_SHIFT) @@ -484,11 +463,13 @@ void __init setup_arch(char **cmdline_p) do_earlyinitcalls(); strlcpy(command_line, arcs_cmdline, sizeof(command_line)); - strlcpy(saved_command_line, command_line, sizeof(saved_command_line)); + + printk("Determined physical RAM map:\n"); + print_memory_map(); *cmdline_p = command_line; + parse_early_options(cmdline_p); - parse_cmdline_early(); bootmem_init(); paging_init(); resource_init(); --- linux-2.6.5-rc3/arch/mips/kernel/vmlinux.lds.S 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/mips/kernel/vmlinux.lds.S 2004-03-31 00:55:04.211154136 -0800 @@ -95,6 +95,9 @@ SECTIONS __setup_start = .; .init.setup : { *(.init.setup) } __setup_end = .; + __early_begin = .; + __early_param : { *(__early_param) } + __early_end = .; __start___param = .; __param : { *(__param) } __stop___param = .; --- linux-2.6.5-rc3/arch/mips/lasat/prom.c 2004-03-10 20:41:25.000000000 -0800 +++ 25/arch/mips/lasat/prom.c 2004-03-31 00:55:04.211154136 -0800 @@ -111,8 +111,8 @@ void __init prom_init(void) /* Get the command line */ if (argc > 0) { - strncpy(arcs_cmdline, argv[0], CL_SIZE-1); - arcs_cmdline[CL_SIZE-1] = '\0'; + strncpy(arcs_cmdline, argv[0], COMMAND_LINE_SIZE-1); + arcs_cmdline[COMMAND_LINE_SIZE-1] = '\0'; } /* Set the I/O base address */ --- linux-2.6.5-rc3/arch/mips/sibyte/cfe/setup.c 2004-03-10 20:41:26.000000000 -0800 +++ 25/arch/mips/sibyte/cfe/setup.c 2004-03-31 00:55:04.212153984 -0800 @@ -291,7 +291,7 @@ void __init prom_init(void) * boot console */ cfe_cons_handle = cfe_getstdhandle(CFE_STDHANDLE_CONSOLE); - if (cfe_getenv("LINUX_CMDLINE", arcs_cmdline, CL_SIZE) < 0) { + if (cfe_getenv("LINUX_CMDLINE", arcs_cmdline, COMMAND_LINE_SIZE) < 0) { if (argc < 0) { /* * It's OK for direct boot to not provide a @@ -338,7 +338,7 @@ void __init prom_init(void) #endif /* CONFIG_BLK_DEV_INITRD */ /* Not sure this is needed, but it's the safe way. */ - arcs_cmdline[CL_SIZE-1] = 0; + arcs_cmdline[COMMAND_LINE_SIZE-1] = 0; mips_machgroup = MACH_GROUP_SIBYTE; prom_meminit(); --- linux-2.6.5-rc3/arch/parisc/configs/c3000_defconfig 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/parisc/configs/c3000_defconfig 2004-03-30 20:21:49.000000000 -0800 @@ -876,7 +876,6 @@ CONFIG_USB_HPUSBSCSI=m # CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set CONFIG_USB_LEGOTOWER=m -# CONFIG_USB_BRLVGER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_LED is not set # CONFIG_USB_TEST is not set --- linux-2.6.5-rc3/arch/parisc/defconfig 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/parisc/defconfig 2004-03-30 20:21:49.000000000 -0800 @@ -737,7 +737,6 @@ CONFIG_USB_OHCI_HCD=y # CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set # CONFIG_USB_LEGOTOWER is not set -# CONFIG_USB_BRLVGER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_LED is not set --- linux-2.6.5-rc3/arch/parisc/kernel/setup.c 2003-10-08 15:07:08.000000000 -0700 +++ 25/arch/parisc/kernel/setup.c 2004-03-31 00:55:04.388127232 -0800 @@ -44,9 +44,8 @@ #include /* for pa7300lc_init() proto */ #include #include +#include -#define COMMAND_LINE_SIZE 1024 -char saved_command_line[COMMAND_LINE_SIZE]; char command_line[COMMAND_LINE_SIZE]; /* Intended for ccio/sba/cpu statistics under /proc/bus/{runway|gsc} */ @@ -60,11 +59,8 @@ void __init setup_cmdline(char **cmdline /* Collect stuff passed in from the boot loader */ /* boot_args[0] is free-mem start, boot_args[1] is ptr to command line */ - if (boot_args[0] < 64) { - /* called from hpux boot loader */ - saved_command_line[0] = '\0'; - } else { - strcpy(saved_command_line, (char *)__va(boot_args[1])); + if (boot_args[0] > 64) { + strcpy(command_line, (char *)__va(boot_args[1])); #ifdef CONFIG_BLK_DEV_INITRD if (boot_args[2] != 0) /* did palo pass us a ramdisk? */ @@ -75,8 +71,8 @@ void __init setup_cmdline(char **cmdline #endif } - strcpy(command_line, saved_command_line); *cmdline_p = command_line; + parse_early_options(cmdline_p); } #ifdef CONFIG_PA11 --- linux-2.6.5-rc3/arch/parisc/kernel/sys_parisc32.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/parisc/kernel/sys_parisc32.c 2004-03-31 00:54:51.659062344 -0800 @@ -388,14 +388,16 @@ static inline long get_ts32(struct times asmlinkage long sys32_time(compat_time_t *tloc) { - time_t now = get_seconds(); - compat_time_t now32 = now; + struct timeval tv; - if (tloc) - if (put_user(now32, tloc)) - now32 = -EFAULT; + do_gettimeofday(&tv); + compat_time_t now32 = tv.tv_sec; - return now32; + if (tloc) + if (put_user(now32, tloc)) + now32 = -EFAULT; + + return now32; } asmlinkage int --- linux-2.6.5-rc3/arch/parisc/kernel/vmlinux.lds.S 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/parisc/kernel/vmlinux.lds.S 2004-03-31 00:55:04.388127232 -0800 @@ -116,6 +116,9 @@ SECTIONS __setup_start = .; .init.setup : { *(.init.setup) } __setup_end = .; + __early_begin = .; + __early_param : { *(__early_param) } + __early_end = .; __start___param = .; __param : { *(__param) } __stop___param = .; --- linux-2.6.5-rc3/arch/parisc/mm/init.c 2003-10-08 15:07:08.000000000 -0700 +++ 25/arch/parisc/mm/init.c 2004-03-31 00:55:04.389127080 -0800 @@ -74,33 +74,18 @@ int npmem_ranges; static unsigned long mem_limit = MAX_MEM; -static void __init mem_limit_func(void) +static int __init mem_limit_func(char *cp) { - char *cp, *end; unsigned long limit; - extern char saved_command_line[]; - /* We need this before __setup() functions are called */ - - limit = MAX_MEM; - for (cp = saved_command_line; *cp; ) { - if (memcmp(cp, "mem=", 4) == 0) { - cp += 4; - limit = memparse(cp, &end); - if (end != cp) - break; - cp = end; - } else { - while (*cp != ' ' && *cp) - ++cp; - while (*cp == ' ') - ++cp; - } - } + limit = memparse(cp, &cp); if (limit < mem_limit) mem_limit = limit; + + return 0; } +__early_param("mem=", mem_limit_func); #define MAX_GAP (0x40000000UL >> PAGE_SHIFT) @@ -215,8 +200,6 @@ static void __init setup_bootmem(void) * to work with multiple memory ranges). */ - mem_limit_func(); /* check for "mem=" argument */ - mem_max = 0; for (i = 0; i < npmem_ranges; i++) { unsigned long rsize; --- linux-2.6.5-rc3/arch/ppc64/configs/pSeries_defconfig 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc64/configs/pSeries_defconfig 2004-03-30 20:21:49.000000000 -0800 @@ -880,7 +880,6 @@ CONFIG_USB_HIDDEV=y # CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set # CONFIG_USB_LEGOTOWER is not set -# CONFIG_USB_BRLVGER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_LED is not set # CONFIG_USB_TEST is not set --- linux-2.6.5-rc3/arch/ppc64/defconfig 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc64/defconfig 2004-03-30 20:21:49.000000000 -0800 @@ -880,7 +880,6 @@ CONFIG_USB_HIDDEV=y # CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set # CONFIG_USB_LEGOTOWER is not set -# CONFIG_USB_BRLVGER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_LED is not set # CONFIG_USB_TEST is not set --- linux-2.6.5-rc3/arch/ppc64/Kconfig 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc64/Kconfig 2004-03-31 00:54:27.904673560 -0800 @@ -173,6 +173,15 @@ config NUMA bool "NUMA support" depends on DISCONTIGMEM +config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on SMP + default off + help + SMT scheduler support improves the CPU scheduler's decision making + when dealing with POWER5 cpus at a cost of slightly increased + overhead in some places. If unsure say N here. + config PREEMPT bool "Preemptible Kernel" depends on BROKEN --- linux-2.6.5-rc3/arch/ppc64/kernel/chrp_setup.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc64/kernel/chrp_setup.c 2004-03-31 00:55:04.578098352 -0800 @@ -58,6 +58,7 @@ #include #include #include +#include #include "i8259.h" #include "open_pic.h" --- linux-2.6.5-rc3/arch/ppc64/kernel/eeh.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc64/kernel/eeh.c 2004-03-31 00:55:04.579098200 -0800 @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "pci.h" --- linux-2.6.5-rc3/arch/ppc64/kernel/entry.S 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc64/kernel/entry.S 2004-03-31 00:54:30.687250544 -0800 @@ -95,7 +95,7 @@ _GLOBAL(DoSyscall) #endif /* SHOW_SYSCALLS */ clrrdi r10,r1,THREAD_SHIFT ld r10,TI_FLAGS(r10) - andi. r11,r10,_TIF_SYSCALL_TRACE + andi. r11,r10,_TIF_SYSCALL_T_OR_A bne- 50f cmpli 0,r0,NR_syscalls bge- 66f @@ -151,7 +151,8 @@ _GLOBAL(ret_from_syscall_1) b 22b /* Traced system call support */ -50: bl .do_syscall_trace +50: addi r3,r1,STACK_FRAME_OVERHEAD + bl .do_syscall_trace_enter ld r0,GPR0(r1) /* Restore original registers */ ld r3,GPR3(r1) ld r4,GPR4(r1) @@ -194,14 +195,14 @@ _GLOBAL(ret_from_syscall_1) _GLOBAL(ret_from_syscall_2) std r3,RESULT(r1) /* Save result */ li r10,-_LAST_ERRNO - cmpl 0,r3,r10 + cmpld 0,r3,r10 blt 60f neg r3,r3 57: ld r10,_CCR(r1) /* Set SO bit in CR */ oris r10,r10,0x1000 std r10,_CCR(r1) 60: std r3,GPR3(r1) /* Update return value */ - bl .do_syscall_trace + bl .do_syscall_trace_leave b .ret_from_except 66: li r3,ENOSYS b 57b @@ -234,14 +235,14 @@ _GLOBAL(ppc64_rt_sigreturn) 80: clrrdi r4,r1,THREAD_SHIFT ld r4,TI_FLAGS(r4) - andi. r4,r4,_TIF_SYSCALL_TRACE + andi. r4,r4,_TIF_SYSCALL_T_OR_A bne- 81f cmpi 0,r3,0 bge .ret_from_except b .ret_from_syscall_1 81: cmpi 0,r3,0 blt .ret_from_syscall_2 - bl .do_syscall_trace + bl .do_syscall_trace_leave b .ret_from_except /* @@ -289,6 +290,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) std r23,_CCR(r1) std r1,KSP(r3) /* Set old stack pointer */ +#ifdef CONFIG_SMP + /* We need a sync somewhere here to make sure that if the + * previous task gets rescheduled on another CPU, it sees all + * stores it has performed on this one. + */ + sync +#endif /* CONFIG_SMP */ + addi r6,r4,-THREAD /* Convert THREAD to 'current' */ std r6,PACACURRENT(r13) /* Set new 'current' */ @@ -344,9 +353,9 @@ _GLOBAL(ret_from_fork) bl .schedule_tail clrrdi r4,r1,THREAD_SHIFT ld r4,TI_FLAGS(r4) - andi. r4,r4,_TIF_SYSCALL_TRACE + andi. r4,r4,_TIF_SYSCALL_T_OR_A beq+ .ret_from_except - bl .do_syscall_trace + bl .do_syscall_trace_leave b .ret_from_except _GLOBAL(ret_from_except) --- linux-2.6.5-rc3/arch/ppc64/kernel/head.S 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc64/kernel/head.S 2004-03-31 00:55:04.581097896 -0800 @@ -35,6 +35,7 @@ #include #include #include +#include #ifdef CONFIG_PPC_ISERIES #define DO_SOFT_DISABLE @@ -496,6 +497,9 @@ __end_systemcfg: .globl SystemReset_Iseries SystemReset_Iseries: mfspr r13,SPRG3 /* Get paca address */ + mfmsr r24 + ori r24,r24,MSR_RI + mtmsrd r24 /* RI on */ lhz r24,PACAPACAINDEX(r13) /* Get processor # */ cmpi 0,r24,0 /* Are we processor 0? */ beq .__start_initialization_iSeries /* Start up the first processor */ @@ -2275,4 +2279,4 @@ stab_array: */ .globl cmd_line cmd_line: - .space 512 /* COMMAND_LINE_SIZE */ + .space COMMAND_LINE_SIZE --- linux-2.6.5-rc3/arch/ppc64/kernel/HvCall.c 2004-02-03 20:42:34.000000000 -0800 +++ 25/arch/ppc64/kernel/HvCall.c 2004-03-31 00:54:17.849202224 -0800 @@ -19,7 +19,7 @@ void HvCall_writeLogBuffer(const void *b { struct HvLpBufferList hv_buf; u64 left_this_page; - u64 cur = virt_to_absolute((unsigned long)buffer); + u64 cur = virt_to_abs(buffer); while (len) { hv_buf.addr = cur; @@ -29,7 +29,7 @@ void HvCall_writeLogBuffer(const void *b hv_buf.len = left_this_page; len -= left_this_page; HvCall2(HvCallBaseWriteLogBuffer, - virt_to_absolute((unsigned long)&hv_buf), + virt_to_abs(&hv_buf), left_this_page); cur = (cur & PAGE_MASK) + PAGE_SIZE; } --- linux-2.6.5-rc3/arch/ppc64/kernel/iSeries_iommu.c 2004-03-10 20:41:26.000000000 -0800 +++ 25/arch/ppc64/kernel/iSeries_iommu.c 2004-03-31 00:54:17.850202072 -0800 @@ -76,7 +76,7 @@ static void tce_build_iSeries(struct iom while (npages--) { tce.te_word = 0; - tce.te_bits.tb_rpn = (virt_to_absolute(uaddr)) >> PAGE_SHIFT; + tce.te_bits.tb_rpn = virt_to_abs(uaddr) >> PAGE_SHIFT; if (tbl->it_type == TCE_VB) { /* Virtual Bus */ @@ -130,7 +130,7 @@ void __init iommu_vio_init(void) cb.itc_busno = 255; /* Bus 255 is the virtual bus */ cb.itc_virtbus = 0xff; /* Ask for virtual bus */ - cbp = virt_to_absolute((unsigned long)&cb); + cbp = virt_to_abs(&cb); HvCallXm_getTceTableParms(cbp); veth_iommu_table.it_size = cb.itc_size / 2; @@ -209,7 +209,7 @@ static void iommu_table_getparms(struct parms->itc_slotno = dn->LogicalSlot; parms->itc_virtbus = 0; - HvCallXm_getTceTableParms(REALADDR(parms)); + HvCallXm_getTceTableParms(ISERIES_HV_ADDR(parms)); if (parms->itc_size == 0) panic("PCI_DMA: parms->size is zero, parms is 0x%p", parms); --- linux-2.6.5-rc3/arch/ppc64/kernel/iSeries_pci.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc64/kernel/iSeries_pci.c 2004-03-31 00:54:17.851201920 -0800 @@ -277,7 +277,7 @@ static void iSeries_Scan_PHBs_Slots(stru */ for (IdSel = 1; IdSel < MaxAgents; ++IdSel) { HvRc = HvCallPci_getDeviceInfo(bus, SubBus, IdSel, - REALADDR(DevInfo), + ISERIES_HV_ADDR(DevInfo), sizeof(struct HvCallPci_DeviceInfo)); if (HvRc == 0) { if (DevInfo->deviceType == HvCallPci_NodeDevice) @@ -318,7 +318,7 @@ static void iSeries_Scan_EADs_Bridge(HvB "PCI:Connect EADs: 0x%02X.%02X.%02X\n", bus, SubBus, AgentId); HvRc = HvCallPci_getBusUnitInfo(bus, SubBus, AgentId, - REALADDR(BridgeInfo), + ISERIES_HV_ADDR(BridgeInfo), sizeof(struct HvCallPci_BridgeInfo)); if (HvRc == 0) { printk("bridge info: type %x subbus %x maxAgents %x maxsubbus %x logslot %x\n", --- linux-2.6.5-rc3/arch/ppc64/kernel/iSeries_setup.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc64/kernel/iSeries_setup.c 2004-03-31 00:55:04.582097744 -0800 @@ -36,6 +36,7 @@ #include #include #include +#include #include #include "iSeries_setup.h" @@ -658,8 +659,7 @@ static void __init iSeries_bolt_kernel(u HvCallHpt_setPp(slot, PP_RWXX); } else /* No HPTE exists, so create a new bolted one */ - iSeries_make_pte(va, (unsigned long)__v2a(ea), - mode_rw); + iSeries_make_pte(va, phys_to_abs(pa), mode_rw); } } --- linux-2.6.5-rc3/arch/ppc64/kernel/iSeries_VpdInfo.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc64/kernel/iSeries_VpdInfo.c 2004-03-31 00:54:17.852201768 -0800 @@ -293,7 +293,8 @@ void iSeries_Get_Location_Code(struct iS return; } BusVpdLen = HvCallPci_getBusVpd(ISERIES_BUS(DevNode), - REALADDR(BusVpdPtr), BUS_VPDSIZE); + ISERIES_HV_ADDR(BusVpdPtr), + BUS_VPDSIZE); if (BusVpdLen == 0) { kfree(BusVpdPtr); printk("PCI: Bus VPD Buffer zero length.\n"); --- linux-2.6.5-rc3/arch/ppc64/kernel/mf.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc64/kernel/mf.c 2004-03-31 00:54:17.853201616 -0800 @@ -763,14 +763,10 @@ void mf_getSrcHistory(char *buffer, int ev->event.data.vsp_cmd.lp_index = HvLpConfig_getLpIndex(); ev->event.data.vsp_cmd.result_code = 0xFF; ev->event.data.vsp_cmd.reserved = 0; - ev->event.data.vsp_cmd.sub_data.page[0] = - (0x8000000000000000ULL | virt_to_absolute((unsigned long)pages[0])); - ev->event.data.vsp_cmd.sub_data.page[1] = - (0x8000000000000000ULL | virt_to_absolute((unsigned long)pages[1])); - ev->event.data.vsp_cmd.sub_data.page[2] = - (0x8000000000000000ULL | virt_to_absolute((unsigned long)pages[2])); - ev->event.data.vsp_cmd.sub_data.page[3] = - (0x8000000000000000ULL | virt_to_absolute((unsigned long)pages[3])); + ev->event.data.vsp_cmd.sub_data.page[0] = ISERIES_HV_ADDR(pages[0]); + ev->event.data.vsp_cmd.sub_data.page[1] = ISERIES_HV_ADDR(pages[1]); + ev->event.data.vsp_cmd.sub_data.page[2] = ISERIES_HV_ADDR(pages[2]); + ev->event.data.vsp_cmd.sub_data.page[3] = ISERIES_HV_ADDR(pages[3]); mb(); if (signal_event(ev) != 0) return; --- linux-2.6.5-rc3/arch/ppc64/kernel/misc.S 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/ppc64/kernel/misc.S 2004-03-31 00:56:32.151785128 -0800 @@ -811,7 +811,7 @@ _GLOBAL(sys_call_table32) .llong .sys_epoll_create .llong .sys_epoll_ctl .llong .sys_epoll_wait - .llong .sys_remap_file_pages + .llong .old_remap_file_pages .llong .ppc32_timer_create /* 240 */ .llong .compat_timer_settime .llong .compat_timer_gettime @@ -828,6 +828,8 @@ _GLOBAL(sys_call_table32) .llong .compat_fstatfs64 .llong .ppc32_fadvise64_64 /* 32bit only fadvise64_64 */ .llong .ppc_rtas /* 255 */ + .llong .sys_ni_syscall /* sys_debug_setcontext */ + .llong .sys_remap_file_pages .balign 8 _GLOBAL(sys_call_table) @@ -1070,7 +1072,7 @@ _GLOBAL(sys_call_table) .llong .sys_epoll_create .llong .sys_epoll_ctl .llong .sys_epoll_wait - .llong .sys_remap_file_pages + .llong .old_remap_file_pages .llong .sys_timer_create /* 240 */ .llong .sys_timer_settime .llong .sys_timer_gettime @@ -1087,3 +1089,5 @@ _GLOBAL(sys_call_table) .llong .sys_fstatfs64 .llong .sys_ni_syscall /* 32bit only fadvise64_64 */ .llong .ppc_rtas /* 255 */ + .llong .sys_ni_syscall /* sys_debug_setcontext */ + .llong .sys_remap_file_pages --- linux-2.6.5-rc3/arch/ppc64/kernel/pci_dma_direct.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/ppc64/kernel/pci_dma_direct.c 2004-03-31 00:54:17.853201616 -0800 @@ -37,7 +37,7 @@ static void *pci_direct_alloc_consistent ret = (void *)__get_free_pages(GFP_ATOMIC, get_order(size)); if (ret != NULL) { memset(ret, 0, size); - *dma_handle = virt_to_absolute((unsigned long)ret); + *dma_handle = virt_to_abs(ret); } return ret; } @@ -51,7 +51,7 @@ static void pci_direct_free_consistent(s static dma_addr_t pci_direct_map_single(struct pci_dev *hwdev, void *ptr, size_t size, int direction) { - return virt_to_absolute((unsigned long)ptr); + return virt_to_abs(ptr); } static void pci_direct_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, --- linux-2.6.5-rc3/arch/ppc64/kernel/pmac_iommu.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/ppc64/kernel/pmac_iommu.c 2004-03-31 00:54:17.854201464 -0800 @@ -154,7 +154,7 @@ static void dart_build_pmac(struct iommu * out of the loop. */ while (npages--) { - rpn = (virt_to_absolute(uaddr)) >> PAGE_SHIFT; + rpn = virt_to_abs(uaddr) >> PAGE_SHIFT; *(dp++) = DARTMAP_VALID | (rpn & DARTMAP_RPNMASK); @@ -210,7 +210,7 @@ static int dart_init(struct device_node if (tmp == 0) panic("U3-DART: Cannot allocate spare page !"); dart_emptyval = DARTMAP_VALID | - ((virt_to_absolute(tmp) >> PAGE_SHIFT) & DARTMAP_RPNMASK); + ((virt_to_abs(tmp) >> PAGE_SHIFT) & DARTMAP_RPNMASK); /* Map in DART registers. FIXME: Use device node to get base address */ dart = ioremap(DART_BASE, 0x7000); @@ -225,7 +225,7 @@ static int dart_init(struct device_node (((dart_tablesize >> PAGE_SHIFT) & DARTCNTL_SIZE_MASK) << DARTCNTL_SIZE_SHIFT); p = virt_to_page(dart_tablebase); - dart_vbase = ioremap(virt_to_absolute(dart_tablebase), dart_tablesize); + dart_vbase = ioremap(virt_to_abs(dart_tablebase), dart_tablesize); /* Fill initial table */ for (i = 0; i < dart_tablesize/4; i++) --- linux-2.6.5-rc3/arch/ppc64/kernel/pmac_setup.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/ppc64/kernel/pmac_setup.c 2004-03-31 00:55:04.582097744 -0800 @@ -70,6 +70,7 @@ #include #include #include +#include #include "pmac.h" --- linux-2.6.5-rc3/arch/ppc64/kernel/prom.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/ppc64/kernel/prom.c 2004-03-31 00:55:04.585097288 -0800 @@ -52,7 +52,7 @@ #include #include #include -#include +#include #include "open_pic.h" #ifdef CONFIG_LOGO_LINUX_CLUT224 @@ -900,7 +900,7 @@ prom_initialize_tce_table(void) prom_panic(RELOC("ERROR, cannot find space for TCE table.\n")); } - vbase = absolute_to_virt(base); + vbase = (unsigned long)abs_to_virt(base); /* Save away the TCE table attributes for later use. */ prom_tce_table[table].node = node; @@ -1007,9 +1007,12 @@ prom_hold_cpus(unsigned long mem) extern void __secondary_hold(void); extern unsigned long __secondary_hold_spinloop; extern unsigned long __secondary_hold_acknowledge; - unsigned long *spinloop = __v2a(&__secondary_hold_spinloop); - unsigned long *acknowledge = __v2a(&__secondary_hold_acknowledge); - unsigned long secondary_hold = (unsigned long)__v2a(*PTRRELOC((unsigned long *)__secondary_hold)); + unsigned long *spinloop + = (void *)virt_to_abs(&__secondary_hold_spinloop); + unsigned long *acknowledge + = (void *)virt_to_abs(&__secondary_hold_acknowledge); + unsigned long secondary_hold + = virt_to_abs(*PTRRELOC((unsigned long *)__secondary_hold)); struct systemcfg *_systemcfg = RELOC(systemcfg); struct paca_struct *_xPaca = PTRRELOC(&paca[0]); struct prom_t *_prom = PTRRELOC(&prom); --- linux-2.6.5-rc3/arch/ppc64/kernel/pSeries_iommu.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/ppc64/kernel/pSeries_iommu.c 2004-03-31 00:54:17.857201008 -0800 @@ -61,7 +61,7 @@ static void tce_build_pSeries(struct iom while (npages--) { /* can't move this out since we might cross LMB boundary */ - t.te_rpn = (virt_to_absolute(uaddr)) >> PAGE_SHIFT; + t.te_rpn = (virt_to_abs(uaddr)) >> PAGE_SHIFT; tp->te_word = t.te_word; --- linux-2.6.5-rc3/arch/ppc64/kernel/pSeries_lpar.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/ppc64/kernel/pSeries_lpar.c 2004-03-31 00:54:17.858200856 -0800 @@ -36,6 +36,7 @@ #include #include #include +#include /* in pSeries_hvCall.S */ EXPORT_SYMBOL(plpar_hcall); @@ -135,7 +136,7 @@ static void tce_build_pSeriesLP(struct i union tce_entry tce; tce.te_word = 0; - tce.te_rpn = (virt_to_absolute(uaddr)) >> PAGE_SHIFT; + tce.te_rpn = (virt_to_abs(uaddr)) >> PAGE_SHIFT; tce.te_rdwr = 1; if (direction != PCI_DMA_TODEVICE) tce.te_pciwr = 1; --- linux-2.6.5-rc3/arch/ppc64/kernel/ptrace.c 2003-06-14 12:18:30.000000000 -0700 +++ 25/arch/ppc64/kernel/ptrace.c 2004-03-31 00:54:31.294158280 -0800 @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -286,12 +287,8 @@ out: return ret; } -void do_syscall_trace(void) +static void do_syscall_trace(void) { - if (!test_thread_flag(TIF_SYSCALL_TRACE)) - return; - if (!(current->ptrace & PT_PTRACED)) - return; /* the 0x80 provides a way for the tracing parent to distinguish between a syscall stop and SIGTRAP delivery */ ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) @@ -307,3 +304,25 @@ void do_syscall_trace(void) current->exit_code = 0; } } + +void do_syscall_trace_enter(struct pt_regs *regs) +{ + if (unlikely(current->audit_context)) + audit_syscall_entry(current, regs->gpr[0], + regs->gpr[3], regs->gpr[4], + regs->gpr[5], regs->gpr[6]); + + if (test_thread_flag(TIF_SYSCALL_TRACE) + && (current->ptrace & PT_PTRACED)) + do_syscall_trace(); +} + +void do_syscall_trace_leave(void) +{ + if (unlikely(current->audit_context)) + audit_syscall_exit(current, 0); /* FIXME: pass pt_regs */ + + if (test_thread_flag(TIF_SYSCALL_TRACE) + && (current->ptrace & PT_PTRACED)) + do_syscall_trace(); +} --- linux-2.6.5-rc3/arch/ppc64/kernel/rtas.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/ppc64/kernel/rtas.c 2004-03-31 00:54:17.858200856 -0800 @@ -361,7 +361,7 @@ rtas_flash_firmware(void) */ rtas_firmware_flash_list.num_blocks = 0; flist = (struct flash_block_list *)&rtas_firmware_flash_list; - rtas_block_list = virt_to_absolute((unsigned long)flist); + rtas_block_list = virt_to_abs(flist); if (rtas_block_list >= (4UL << 20)) { printk(KERN_ALERT "FLASH: kernel bug...flash list header addr above 4GB\n"); return; @@ -373,13 +373,13 @@ rtas_flash_firmware(void) for (f = flist; f; f = next) { /* Translate data addrs to absolute */ for (i = 0; i < f->num_blocks; i++) { - f->blocks[i].data = (char *)virt_to_absolute((unsigned long)f->blocks[i].data); + f->blocks[i].data = (char *)virt_to_abs(f->blocks[i].data); image_size += f->blocks[i].length; } next = f->next; /* Don't translate NULL pointer for last entry */ - if(f->next) - f->next = (struct flash_block_list *)virt_to_absolute((unsigned long)f->next); + if (f->next) + f->next = (struct flash_block_list *)virt_to_abs(f->next); else f->next = 0LL; /* make num_blocks into the version/length field */ --- linux-2.6.5-rc3/arch/ppc64/kernel/setup.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/ppc64/kernel/setup.c 2004-03-31 00:55:04.586097136 -0800 @@ -43,6 +43,7 @@ #include #include #include +#include extern unsigned long klimit; /* extern void *stab; */ @@ -80,7 +81,6 @@ unsigned long decr_overclock_proc0_set = int powersave_nap; -char saved_command_line[COMMAND_LINE_SIZE]; unsigned char aux_device_present; void parse_cmd_line(unsigned long r3, unsigned long r4, unsigned long r5, @@ -603,9 +603,8 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = klimit; - /* Save unparsed command line copy for /proc/cmdline */ - strlcpy(saved_command_line, cmd_line, sizeof(saved_command_line)); *cmdline_p = cmd_line; + parse_early_options(cmdline_p); /* set up the bootmem stuff with available memory */ do_init_bootmem(); --- linux-2.6.5-rc3/arch/ppc64/kernel/signal32.c 2004-03-10 20:41:26.000000000 -0800 +++ 25/arch/ppc64/kernel/signal32.c 2004-03-31 00:54:55.345501920 -0800 @@ -657,7 +657,7 @@ int sys32_sigaltstack(u32 newstack, u32 * Set up a signal frame for a "real-time" signal handler * (one which gets siginfo). */ -static void handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, +static void handle_rt_signal32(unsigned long sig, struct k_sigaction *ka_copy, siginfo_t *info, sigset_t *oldset, struct pt_regs * regs, unsigned long newsp) { @@ -703,7 +703,7 @@ static void handle_rt_signal32(unsigned regs->gpr[4] = (unsigned long) &rt_sf->info; regs->gpr[5] = (unsigned long) &rt_sf->uc; regs->gpr[6] = (unsigned long) rt_sf; - regs->nip = (unsigned long) ka->sa.sa_handler; + regs->nip = (unsigned long) ka_copy->sa.sa_handler; regs->link = (unsigned long) frame->tramp; regs->trap = 0; @@ -715,7 +715,7 @@ badframe: regs, frame, newsp); #endif if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; + current->sighand->action[SIGSEGV-1].sa.sa_handler = SIG_DFL; force_sig(SIGSEGV, current); } @@ -828,7 +828,7 @@ long sys32_rt_sigreturn(int r3, int r4, /* * OK, we're invoking a handler */ -static void handle_signal32(unsigned long sig, struct k_sigaction *ka, +static void handle_signal32(unsigned long sig, struct k_sigaction *ka_copy, siginfo_t *info, sigset_t *oldset, struct pt_regs * regs, unsigned long newsp) { @@ -853,7 +853,7 @@ static void handle_signal32(unsigned lon #if _NSIG != 64 #error "Please adjust handle_signal32()" #endif - if (__put_user((u32)(u64)ka->sa.sa_handler, &sc->handler) + if (__put_user((u32)(u64)ka_copy->sa.sa_handler, &sc->handler) || __put_user(oldset->sig[0], &sc->oldmask) || __put_user((oldset->sig[0] >> 32), &sc->_unused[3]) || __put_user((u32)(u64)frame, &sc->regs) @@ -868,7 +868,7 @@ static void handle_signal32(unsigned lon regs->gpr[1] = (unsigned long) newsp; regs->gpr[3] = sig; regs->gpr[4] = (unsigned long) sc; - regs->nip = (unsigned long) ka->sa.sa_handler; + regs->nip = (unsigned long) ka_copy->sa.sa_handler; regs->link = (unsigned long) frame->mctx.tramp; regs->trap = 0; @@ -880,7 +880,7 @@ badframe: regs, frame, *newspp); #endif if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; + current->sighand->action[SIGSEGV-1].sa.sa_handler = SIG_DFL; force_sig(SIGSEGV, current); } @@ -944,18 +944,16 @@ badframe: int do_signal32(sigset_t *oldset, struct pt_regs *regs) { siginfo_t info; - struct k_sigaction *ka; unsigned int frame, newsp; int signr, ret; + struct k_sigaction ka_copy; if (!oldset) oldset = ¤t->blocked; newsp = frame = 0; - signr = get_signal_to_deliver(&info, regs, NULL); - - ka = (signr == 0)? NULL: ¤t->sighand->action[signr-1]; + signr = get_signal_to_deliver(&info, &ka_copy, regs, NULL); if (regs->trap == 0x0C00 /* System Call! */ && regs->ccr & 0x10000000 /* error signalled */ @@ -966,7 +964,7 @@ int do_signal32(sigset_t *oldset, struct if (signr > 0 && (ret == ERESTARTNOHAND || ret == ERESTART_RESTARTBLOCK || (ret == ERESTARTSYS - && !(ka->sa.sa_flags & SA_RESTART)))) { + && !(ka_copy.sa.sa_flags & SA_RESTART)))) { /* make the system call return an EINTR error */ regs->result = -EINTR; regs->gpr[3] = EINTR; @@ -985,7 +983,7 @@ int do_signal32(sigset_t *oldset, struct if (signr == 0) return 0; /* no signals delivered */ - if ((ka->sa.sa_flags & SA_ONSTACK) && current->sas_ss_size + if ((ka_copy.sa.sa_flags & SA_ONSTACK) && current->sas_ss_size && (!on_sig_stack(regs->gpr[1]))) newsp = (current->sas_ss_sp + current->sas_ss_size); else @@ -993,17 +991,15 @@ int do_signal32(sigset_t *oldset, struct newsp &= ~0xfUL; /* Whee! Actually deliver the signal. */ - if (ka->sa.sa_flags & SA_SIGINFO) - handle_rt_signal32(signr, ka, &info, oldset, regs, newsp); + if (ka_copy.sa.sa_flags & SA_SIGINFO) + handle_rt_signal32(signr, &ka_copy, &info, oldset, regs, newsp); else - handle_signal32(signr, ka, &info, oldset, regs, newsp); - - if (ka->sa.sa_flags & SA_ONESHOT) - ka->sa.sa_handler = SIG_DFL; + handle_signal32(signr, &ka_copy, &info, oldset, regs, newsp); - if (!(ka->sa.sa_flags & SA_NODEFER)) { + if (!(ka_copy.sa.sa_flags & SA_NODEFER)) { spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); + sigorsets(¤t->blocked, ¤t->blocked, + &ka_copy.sa.sa_mask); sigaddset(¤t->blocked, signr); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); --- linux-2.6.5-rc3/arch/ppc64/kernel/signal.c 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/ppc64/kernel/signal.c 2004-03-31 00:54:55.346501768 -0800 @@ -216,15 +216,15 @@ static int restore_sigcontext(struct pt_ /* * Allocate space for the signal frame */ -static inline void * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, - size_t frame_size) +static inline void * get_sigframe(struct k_sigaction *ka_copy, + struct pt_regs *regs, size_t frame_size) { unsigned long newsp; /* Default to using normal stack */ newsp = regs->gpr[1]; - if (ka->sa.sa_flags & SA_ONSTACK) { + if (ka_copy->sa.sa_flags & SA_ONSTACK) { if (! on_sig_stack(regs->gpr[1])) newsp = (current->sas_ss_sp + current->sas_ss_size); } @@ -361,8 +361,8 @@ badframe: do_exit(SIGSEGV); } -static void setup_rt_frame(int signr, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) +static void setup_rt_frame(int signr, struct k_sigaction *ka_copy, + siginfo_t *info, sigset_t *set, struct pt_regs *regs) { /* Handler is *really* a pointer to the function descriptor for * the signal routine. The first entry in the function @@ -374,7 +374,7 @@ static void setup_rt_frame(int signr, st unsigned long newsp = 0; int err = 0; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka_copy, regs, sizeof(*frame)); if (verify_area(VERIFY_WRITE, frame, sizeof(*frame))) goto badframe; @@ -393,7 +393,7 @@ static void setup_rt_frame(int signr, st &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, signr, NULL, - (unsigned long)ka->sa.sa_handler); + (unsigned long)ka_copy->sa.sa_handler); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) goto badframe; @@ -403,7 +403,7 @@ static void setup_rt_frame(int signr, st if (err) goto badframe; - funct_desc_ptr = (func_descr_t *) ka->sa.sa_handler; + funct_desc_ptr = (func_descr_t *) ka_copy->sa.sa_handler; /* Allocate a dummy caller frame for the signal handler. */ newsp = (unsigned long)frame - __SIGNAL_FRAMESIZE; @@ -415,7 +415,7 @@ static void setup_rt_frame(int signr, st regs->gpr[1] = newsp; err |= get_user(regs->gpr[2], &funct_desc_ptr->toc); regs->gpr[3] = signr; - if (ka->sa.sa_flags & SA_SIGINFO) { + if (ka_copy->sa.sa_flags & SA_SIGINFO) { err |= get_user(regs->gpr[4], (unsigned long *)&frame->pinfo); err |= get_user(regs->gpr[5], (unsigned long *)&frame->puc); regs->gpr[6] = (unsigned long) frame; @@ -432,33 +432,33 @@ badframe: printk("badframe in setup_rt_frame, regs=%p frame=%p newsp=%lx\n", regs, frame, newsp); #endif - do_exit(SIGSEGV); + if (signr == SIGSEGV) + current->sighand->action[SIGSEGV-1].sa.sa_handler = SIG_DFL; + force_sig(SIGSEGV, current); } /* * OK, we're invoking a handler */ -static void handle_signal(unsigned long sig, struct k_sigaction *ka, - siginfo_t *info, sigset_t *oldset, struct pt_regs *regs) +static void handle_signal(unsigned long sig, struct k_sigaction *ka_copy, + siginfo_t *info, sigset_t *oldset, struct pt_regs *regs) { /* Set up Signal Frame */ - setup_rt_frame(sig, ka, info, oldset, regs); - - if (ka->sa.sa_flags & SA_ONESHOT) - ka->sa.sa_handler = SIG_DFL; + setup_rt_frame(sig, ka_copy, info, oldset, regs); - if (!(ka->sa.sa_flags & SA_NODEFER)) { + if (!(ka_copy->sa.sa_flags & SA_NODEFER)) { spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); + sigorsets(¤t->blocked, ¤t->blocked, + &ka_copy->sa.sa_mask); sigaddset(¤t->blocked,sig); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); } - return; } -static inline void syscall_restart(struct pt_regs *regs, struct k_sigaction *ka) +static inline void syscall_restart(struct pt_regs *regs, + struct k_sigaction *ka_copy) { switch ((int)regs->result) { case -ERESTART_RESTARTBLOCK: @@ -473,7 +473,7 @@ static inline void syscall_restart(struc /* ERESTARTSYS means to restart the syscall if there is no * handler or the handler was registered with SA_RESTART */ - if (!(ka->sa.sa_flags & SA_RESTART)) { + if (!(ka_copy->sa.sa_flags & SA_RESTART)) { regs->result = -EINTR; break; } @@ -498,6 +498,7 @@ int do_signal(sigset_t *oldset, struct p { siginfo_t info; int signr; + struct k_sigaction ka_copy; /* * If the current thread is 32 bit - invoke the @@ -509,14 +510,12 @@ int do_signal(sigset_t *oldset, struct p if (!oldset) oldset = ¤t->blocked; - signr = get_signal_to_deliver(&info, regs, NULL); + signr = get_signal_to_deliver(&info, &ka_copy, regs, NULL); if (signr > 0) { - struct k_sigaction *ka = ¤t->sighand->action[signr-1]; - /* Whee! Actually deliver the signal. */ if (regs->trap == 0x0C00) - syscall_restart(regs, ka); - handle_signal(signr, ka, &info, oldset, regs); + syscall_restart(regs, &ka_copy); + handle_signal(signr, &ka_copy, &info, oldset, regs); return 1; } --- linux-2.6.5-rc3/arch/ppc64/kernel/smp.c 2004-03-10 20:41:26.000000000 -0800 +++ 25/arch/ppc64/kernel/smp.c 2004-03-31 00:54:28.284615800 -0800 @@ -579,11 +579,6 @@ void __init smp_prepare_cpus(unsigned in paca[boot_cpuid].prof_counter = 1; paca[boot_cpuid].prof_multiplier = 1; - /* - * XXX very rough. - */ - cache_decay_ticks = HZ/100; - #ifndef CONFIG_PPC_ISERIES paca[boot_cpuid].next_jiffy_update_tb = tb_last_stamp = get_tb(); @@ -629,7 +624,7 @@ int __devinit __cpu_up(unsigned int cpu) tmp = &stab_array[PAGE_SIZE * cpu]; memset(tmp, 0, PAGE_SIZE); paca[cpu].xStab_data.virt = (unsigned long)tmp; - paca[cpu].xStab_data.real = (unsigned long)__v2a(tmp); + paca[cpu].xStab_data.real = virt_to_abs(tmp); } /* create a process for the processor */ @@ -796,3 +791,278 @@ static int __init topology_init(void) return 0; } __initcall(topology_init); + +#ifdef CONFIG_SCHED_SMT +#ifdef CONFIG_NUMA +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static DEFINE_PER_CPU(struct sched_domain, node_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + struct sched_domain *node_domain = &per_cpu(node_domains, i); + int node = cpu_to_node(i); + cpumask_t nodemask = node_to_cpumask(node); + cpumask_t my_cpumask = cpumask_of_cpu(i); + cpumask_t sibling_cpumask = cpumask_of_cpu(i ^ 0x1); + + *cpu_domain = SD_SIBLING_INIT; + if (__is_processor(PV_POWER5)) + cpus_or(cpu_domain->span, my_cpumask, sibling_cpumask); + else + cpu_domain->span = my_cpumask; + cpu_domain->groups = &sched_group_cpus[i]; + cpu_domain->parent = phys_domain; + + *phys_domain = SD_CPU_INIT; + phys_domain->span = nodemask; + // phys_domain->cache_hot_time = XXX; + phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; + phys_domain->parent = node_domain; + + *node_domain = SD_NODE_INIT; + node_domain->span = cpu_possible_map; + // node_domain->cache_hot_time = XXX; + node_domain->groups = &sched_group_nodes[node]; + } + + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + int j; + first_cpu = last_cpu = NULL; + + if (i != first_cpu(cpu_domain->span)) { + &per_cpu(cpu_domains, i)->flags |= SD_SHARE_CPUPOWER; + &per_cpu(cpu_domains, first_cpu(cpu_domain->span))->flags |= + SD_SHARE_CPUPOWER; + continue; + } + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + for (i = 0; i < MAX_NUMNODES; i++) { + int j; + cpumask_t nodemask; + struct sched_group *node = &sched_group_nodes[i]; + cpumask_t node_cpumask = node_to_cpumask(i); + cpus_and(nodemask, node_cpumask, cpu_online_map); + + if (cpus_empty(nodemask)) + continue; + + first_cpu = last_cpu = NULL; + /* Set up physical groups */ + for_each_cpu_mask(j, nodemask) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, j); + struct sched_group *cpu = &sched_group_phys[j]; + + if (j != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* + * Make each extra sibling increase power by 10% of + * the basic CPU. This is very arbitrary. + */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + node->cpu_power += cpu->cpu_power; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + /* Set up nodes */ + first_cpu = last_cpu = NULL; + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *cpu = &sched_group_nodes[i]; + cpumask_t nodemask; + cpumask_t node_cpumask = node_to_cpumask(i); + cpus_and(nodemask, node_cpumask, cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + cpu->cpumask = nodemask; + /* ->cpu_power already setup */ + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + int node = cpu_to_node(i); + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_domain, i); + } +} +#else /* !CONFIG_NUMA */ +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + cpumask_t my_cpumask = cpumask_of_cpu(i); + cpumask_t sibling_cpumask = cpumask_of_cpu(i ^ 0x1); + + *cpu_domain = SD_SIBLING_INIT; + if (__is_processor(PV_POWER5)) + cpus_or(cpu_domain->span, my_cpumask, sibling_cpumask); + else + cpu_domain->span = my_cpumask; + cpu_domain->groups = &sched_group_cpus[i]; + cpu_domain->parent = phys_domain; + + *phys_domain = SD_CPU_INIT; + phys_domain->span = cpu_possible_map; + // phys_domain->cache_hot_time = XXX; + phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; + } + + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + int j; + first_cpu = last_cpu = NULL; + + if (i != first_cpu(cpu_domain->span)) { + per_cpu(cpu_domains, i).flags |= SD_SHARE_CPUPOWER; + per_cpu(cpu_domains, first_cpu(cpu_domain->span)).flags |= + SD_SHARE_CPUPOWER; + continue; + } + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + } + + first_cpu = last_cpu = NULL; + /* Set up physical groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + struct sched_group *cpu = &sched_group_phys[i]; + + if (i != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* See SMT+NUMA setup for comment */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_sd, i); + } +} +#endif /* CONFIG_NUMA */ +#else /* !CONFIG_SCHED_SMT */ + +#ifdef CONFIG_NUMA +#error ppc64 has no NUMA scheduler defined without CONFIG_SCHED_SMT. \ + Please enable CONFIG_SCHED_SMT or bug Anton. +#endif + +static struct sched_group sched_group_cpus[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); + +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + + *cpu_sd = SD_CPU_INIT; + cpu_sd->span = cpu_possible_map; + // cpu_sd->cache_hot_time = XXX; + cpu_sd->groups = &sched_group_cpus[i]; + } + + /* Set up CPU groups */ + for_each_cpu_mask(i, cpu_possible_map) { + struct sched_group *cpu = &sched_group_cpus[i]; + + cpus_clear(cpu->cpumask); + cpu_set(i, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_sd, i); + } +} + +#endif --- linux-2.6.5-rc3/arch/ppc64/kernel/sys_ppc32.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/ppc64/kernel/sys_ppc32.c 2004-03-31 00:54:51.112145488 -0800 @@ -2210,7 +2210,7 @@ off_t ppc32_lseek(unsigned int fd, u32 o /* * This is just a version for 32-bit applications which does - * not force O_LARGEFILE on. + * not force O_KERNEL_LARGEFILE on. */ long sys32_open(const char * filename, int flags, int mode) { --- linux-2.6.5-rc3/arch/ppc64/kernel/vmlinux.lds.S 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/ppc64/kernel/vmlinux.lds.S 2004-03-31 00:55:04.586097136 -0800 @@ -59,6 +59,11 @@ SECTIONS *(.init.setup) __setup_end = .; } + __early_param : { + __early_begin = .; + *(__early_param) + __early_end = .; + } __param : { __start___param = .; --- linux-2.6.5-rc3/arch/ppc64/mm/hash_utils.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/ppc64/mm/hash_utils.c 2004-03-31 00:54:17.860200552 -0800 @@ -48,6 +48,8 @@ #include #include #include +#include + /* * Note: pte --> Linux PTE * HPTE --> PowerPC Hashed Page Table Entry @@ -107,11 +109,11 @@ static inline void create_pte_mapping(un if (systemcfg->platform == PLATFORM_PSERIES_LPAR) ret = pSeries_lpar_hpte_insert(hpteg, va, - (unsigned long)__v2a(addr) >> PAGE_SHIFT, + virt_to_abs(addr) >> PAGE_SHIFT, 0, mode, 1, large); else ret = pSeries_hpte_insert(hpteg, va, - (unsigned long)__v2a(addr) >> PAGE_SHIFT, + virt_to_abs(addr) >> PAGE_SHIFT, 0, mode, 1, large); if (ret == -1) { @@ -154,7 +156,7 @@ void __init htab_initialize(void) ppc64_terminate_msg(0x20, "hpt space"); loop_forever(); } - htab_data.htab = (HPTE *)__a2v(table); + htab_data.htab = abs_to_virt(table); /* htab absolute addr + encoded htabsize */ _SDR1 = table + __ilog2(pteg_count) - 11; --- linux-2.6.5-rc3/arch/ppc64/mm/hugetlbpage.c 2004-03-10 20:41:26.000000000 -0800 +++ 25/arch/ppc64/mm/hugetlbpage.c 2004-03-31 00:56:27.577480528 -0800 @@ -40,7 +40,7 @@ static struct list_head hugepage_freelis static void enqueue_huge_page(struct page *page) { - list_add(&page->list, + list_add(&page->lru, &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]); } @@ -63,8 +63,8 @@ static struct page *dequeue_huge_page(vo } if (!list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, struct page, list); - list_del(&page->list); + page = list_entry(hugepage_freelists[nid].next, struct page, lru); + list_del(&page->lru); } if (largepage_roundrobin) @@ -407,9 +407,8 @@ follow_huge_pmd(struct mm_struct *mm, un static void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); - INIT_LIST_HEAD(&page->list); + INIT_LIST_HEAD(&page->lru); spin_lock(&htlbpage_lock); enqueue_huge_page(page); @@ -912,6 +911,12 @@ int is_hugepage_mem_enough(size_t size) return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpage_free; } +/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ +unsigned long hugetlb_total_pages(void) +{ + return htlbpage_total * (HPAGE_SIZE / PAGE_SIZE); +} + /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the --- linux-2.6.5-rc3/arch/ppc64/mm/init.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/ppc64/mm/init.c 2004-03-31 00:54:17.861200400 -0800 @@ -60,6 +60,7 @@ #include #include #include +#include struct mmu_context_queue_t mmu_context_queue; @@ -153,7 +154,7 @@ static void map_io_page(unsigned long ea pmdp = pmd_alloc(&ioremap_mm, pgdp, ea); ptep = pte_alloc_kernel(&ioremap_mm, pmdp, ea); - pa = absolute_to_phys(pa); + pa = abs_to_phys(pa); set_pte(ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); spin_unlock(&ioremap_mm.page_table_lock); } else { @@ -539,7 +540,7 @@ void __init do_init_bootmem(void) */ bootmap_pages = bootmem_bootmap_pages(total_pages); - start = (unsigned long)__a2p(lmb_alloc(bootmap_pages<> PAGE_SHIFT, total_pages); --- linux-2.6.5-rc3/arch/ppc64/mm/numa.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/ppc64/mm/numa.c 2004-03-31 00:55:04.586097136 -0800 @@ -15,7 +15,7 @@ #include #include #include -#include +#include #if 1 #define dbg(args...) udbg_printf(args) --- linux-2.6.5-rc3/arch/ppc/configs/adir_defconfig 2003-07-10 18:50:30.000000000 -0700 +++ 25/arch/ppc/configs/adir_defconfig 2004-03-30 20:21:49.000000000 -0800 @@ -776,7 +776,6 @@ CONFIG_USB_EZUSB=y # CONFIG_USB_TIGL is not set # CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set -# CONFIG_USB_BRLVGER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_TEST is not set # CONFIG_USB_GADGET is not set --- linux-2.6.5-rc3/arch/ppc/configs/common_defconfig 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc/configs/common_defconfig 2004-03-30 20:21:49.000000000 -0800 @@ -1209,7 +1209,6 @@ CONFIG_USB_EZUSB=y # CONFIG_USB_TIGL is not set # CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set -# CONFIG_USB_BRLVGER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_TEST is not set # CONFIG_USB_GADGET is not set --- linux-2.6.5-rc3/arch/ppc/configs/lopec_defconfig 2003-07-10 18:50:30.000000000 -0700 +++ 25/arch/ppc/configs/lopec_defconfig 2004-03-30 20:21:49.000000000 -0800 @@ -788,7 +788,6 @@ CONFIG_USB_SERIAL_VISOR=m # CONFIG_USB_TIGL is not set # CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set -# CONFIG_USB_BRLVGER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_TEST is not set # CONFIG_USB_GADGET is not set --- linux-2.6.5-rc3/arch/ppc/configs/pmac_defconfig 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc/configs/pmac_defconfig 2004-03-30 20:21:49.000000000 -0800 @@ -1249,7 +1249,6 @@ CONFIG_USB_SERIAL_VISOR=m # CONFIG_USB_TIGL is not set # CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set -# CONFIG_USB_BRLVGER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_TEST is not set # CONFIG_USB_GADGET is not set --- linux-2.6.5-rc3/arch/ppc/configs/sandpoint_defconfig 2003-07-10 18:50:30.000000000 -0700 +++ 25/arch/ppc/configs/sandpoint_defconfig 2004-03-30 20:21:49.000000000 -0800 @@ -775,7 +775,6 @@ CONFIG_USB_SERIAL_VISOR=m # CONFIG_USB_TIGL is not set # CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set -# CONFIG_USB_BRLVGER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_TEST is not set # CONFIG_USB_GADGET is not set --- linux-2.6.5-rc3/arch/ppc/defconfig 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc/defconfig 2004-03-30 20:21:49.000000000 -0800 @@ -1208,7 +1208,6 @@ CONFIG_USB_EZUSB=y # CONFIG_USB_TIGL is not set # CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set -# CONFIG_USB_BRLVGER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_TEST is not set # CONFIG_USB_GADGET is not set --- linux-2.6.5-rc3/arch/ppc/Kconfig 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc/Kconfig 2004-03-30 20:21:43.000000000 -0800 @@ -696,14 +696,10 @@ config NR_CPUS config PREEMPT bool "Preemptible Kernel" - depends on !SMP help This option reduces the latency of the kernel when reacting to real-time or interactive events by allowing a low priority process to be preempted even if it is in kernel mode executing a system call. - Unfortunately the kernel code has some race conditions if both - CONFIG_SMP and CONFIG_PREEMPT are enabled, so this option is - currently disabled if you are building an SMP kernel. Say Y here if you are building a kernel for a desktop, embedded or real-time system. Say N if you are unsure. --- linux-2.6.5-rc3/arch/ppc/kernel/align.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc/kernel/align.c 2004-03-30 20:21:43.000000000 -0800 @@ -325,14 +325,18 @@ fix_alignment(struct pt_regs *regs) * the kernel with -msoft-float so it doesn't use the * fp regs for copying 8-byte objects. */ case LD+F+S: + preempt_disable(); enable_kernel_fp(); cvt_fd(&data.f, ¤t->thread.fpr[reg], ¤t->thread.fpscr); /* current->thread.fpr[reg] = data.f; */ + preempt_enable(); break; case ST+F+S: + preempt_disable(); enable_kernel_fp(); cvt_df(¤t->thread.fpr[reg], &data.f, ¤t->thread.fpscr); /* data.f = current->thread.fpr[reg]; */ + preempt_enable(); break; default: printk("align: can't handle flags=%x\n", flags); --- linux-2.6.5-rc3/arch/ppc/kernel/entry.S 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc/kernel/entry.S 2004-03-30 20:21:43.000000000 -0800 @@ -469,10 +469,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) stw r10,_CCR(r1) stw r1,KSP(r3) /* Set old stack pointer */ +#ifdef CONFIG_SMP + /* We need a sync somewhere here to make sure that if the + * previous task gets rescheduled on another CPU, it sees all + * stores it has performed on this one. + */ + sync +#endif /* CONFIG_SMP */ + tophys(r0,r4) CLR_TOP32(r0) mtspr SPRG3,r0 /* Update current THREAD phys addr */ lwz r1,KSP(r4) /* Load new stack pointer */ + /* save the old current 'last' for return value */ mr r3,r2 addi r2,r4,-THREAD /* Update current */ --- linux-2.6.5-rc3/arch/ppc/kernel/head.S 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc/kernel/head.S 2004-03-30 20:21:43.000000000 -0800 @@ -1436,11 +1436,8 @@ _GLOBAL(set_context) stw r4, 0x4(r5) #endif li r4,0 -BEGIN_FTR_SECTION - dssall - sync -END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) -3: isync + isync +3: #ifdef CONFIG_PPC64BRIDGE slbie r4 #endif /* CONFIG_PPC64BRIDGE */ --- linux-2.6.5-rc3/arch/ppc/kernel/misc.S 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc/kernel/misc.S 2004-03-31 00:56:32.152784976 -0800 @@ -1353,7 +1353,7 @@ _GLOBAL(sys_call_table) .long sys_epoll_create .long sys_epoll_ctl .long sys_epoll_wait - .long sys_remap_file_pages + .long old_remap_file_pages .long sys_timer_create /* 240 */ .long sys_timer_settime .long sys_timer_gettime @@ -1370,3 +1370,5 @@ _GLOBAL(sys_call_table) .long sys_fstatfs64 .long ppc_fadvise64_64 .long sys_ni_syscall /* 255 - rtas (used on ppc64) */ + .long sys_ni_syscall /* sys_set_debug_context */ + .long sys_remap_file_pages --- linux-2.6.5-rc3/arch/ppc/kernel/pci.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/ppc/kernel/pci.c 2004-03-30 20:21:43.000000000 -0800 @@ -159,7 +159,6 @@ pcibios_fixup_resources(struct pci_dev * ppc_md.pcibios_fixup_resources(dev); } - void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, struct resource *res) @@ -1522,51 +1521,43 @@ __pci_mmap_make_offset(struct pci_dev *d { struct pci_controller *hose = (struct pci_controller *) dev->sysdata; unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; - unsigned long io_offset = 0; - int i, res_bit; + unsigned long size = vma->vm_end - vma->vm_start; + unsigned long base; + struct resource *res; + int i; + int ret = -EINVAL; if (hose == 0) return -EINVAL; /* should never happen */ + if (offset + size <= offset) + return -EINVAL; - /* If memory, add on the PCI bridge address offset */ if (mmap_state == pci_mmap_mem) { + /* PCI memory space */ + base = hose->pci_mem_offset; + for (i = 0; i < 3; ++i) { + res = &hose->mem_resources[i]; + if (res->flags == 0) + continue; + if (offset >= res->start - base + && offset + size - 1 <= res->end - base) { + ret = 0; + break; + } + } offset += hose->pci_mem_offset; - res_bit = IORESOURCE_MEM; } else { - io_offset = (unsigned long)hose->io_base_virt - isa_io_base; - offset += io_offset; - res_bit = IORESOURCE_IO; + /* PCI I/O space */ + base = (unsigned long)hose->io_base_virt - isa_io_base; + res = &hose->io_resource; + if (offset >= res->start - base + && offset + size - 1 <= res->end - base) + ret = 0; + offset += hose->io_base_phys; } - /* - * Check that the offset requested corresponds to one of the - * resources of the device. - */ - for (i = 0; i <= PCI_ROM_RESOURCE; i++) { - struct resource *rp = &dev->resource[i]; - int flags = rp->flags; - - /* treat ROM as memory (should be already) */ - if (i == PCI_ROM_RESOURCE) - flags |= IORESOURCE_MEM; - - /* Active and same type? */ - if ((flags & res_bit) == 0) - continue; - - /* In the range of this resource? */ - if (offset < (rp->start & PAGE_MASK) || offset > rp->end) - continue; - - /* found it! construct the final physical address */ - if (mmap_state == pci_mmap_io) - offset += hose->io_base_phys - io_offset; - - vma->vm_pgoff = offset >> PAGE_SHIFT; - return 0; - } - - return -EINVAL; + vma->vm_pgoff = offset >> PAGE_SHIFT; + return ret; } /* --- linux-2.6.5-rc3/arch/ppc/kernel/ppc_ksyms.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc/kernel/ppc_ksyms.c 2004-03-30 20:21:43.000000000 -0800 @@ -192,7 +192,6 @@ EXPORT_SYMBOL(kernel_thread); EXPORT_SYMBOL(flush_instruction_cache); EXPORT_SYMBOL(giveup_fpu); -EXPORT_SYMBOL(enable_kernel_fp); EXPORT_SYMBOL(flush_icache_range); EXPORT_SYMBOL(flush_dcache_range); EXPORT_SYMBOL(flush_icache_user_range); --- linux-2.6.5-rc3/arch/ppc/kernel/ppc-stub.c 2003-09-08 13:58:56.000000000 -0700 +++ 25/arch/ppc/kernel/ppc-stub.c 2004-03-31 00:55:02.545407368 -0800 @@ -105,6 +105,7 @@ #include #include #include +#include #include #include @@ -834,6 +835,22 @@ breakpoint(void) breakinst: .long 0x7d821008"); } +/* Give us an early hook in. */ +static void __init early_kgdb(char **ign) +{ + if (ppc_md.kgdb_map_scc) + ppc_md.kgdb_map_scc(); + + set_debug_traps(); + + if (ppc_md.progress) + ppc_md.progress("setup_arch: kgdb breakpoint", 0x4000); + printk("kgdb breakpoint activated\n"); + + breakpoint(); +} +__early_param("gdb", early_kgdb); + #ifdef CONFIG_KGDB_CONSOLE /* Output string in GDB O-packet format if GDB has connected. If nothing output, returns 0 (caller must then handle output). */ --- linux-2.6.5-rc3/arch/ppc/kernel/process.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc/kernel/process.c 2004-03-30 20:21:43.000000000 -0800 @@ -163,7 +163,8 @@ dump_altivec(struct pt_regs *regs, elf_v void enable_kernel_altivec(void) { - preempt_disable(); + WARN_ON(current_thread_info()->preempt_count == 0 && !irqs_disabled()); + #ifdef CONFIG_SMP if (current->thread.regs && (current->thread.regs->msr & MSR_VEC)) giveup_altivec(current); @@ -172,14 +173,15 @@ enable_kernel_altivec(void) #else giveup_altivec(last_task_used_altivec); #endif /* __SMP __ */ - preempt_enable(); } +EXPORT_SYMBOL(enable_kernel_altivec); #endif /* CONFIG_ALTIVEC */ void enable_kernel_fp(void) { - preempt_disable(); + WARN_ON(current_thread_info()->preempt_count == 0 && !irqs_disabled()); + #ifdef CONFIG_SMP if (current->thread.regs && (current->thread.regs->msr & MSR_FP)) giveup_fpu(current); @@ -188,8 +190,8 @@ enable_kernel_fp(void) #else giveup_fpu(last_task_used_math); #endif /* CONFIG_SMP */ - preempt_enable(); } +EXPORT_SYMBOL(enable_kernel_fp); int dump_task_fpu(struct task_struct *tsk, elf_fpregset_t *fpregs) --- linux-2.6.5-rc3/arch/ppc/kernel/setup.c 2004-03-10 20:41:26.000000000 -0800 +++ 25/arch/ppc/kernel/setup.c 2004-03-31 00:55:02.744377120 -0800 @@ -36,6 +36,7 @@ #include #include #include +#include #include #if defined CONFIG_KGDB @@ -53,7 +54,6 @@ extern void ppc6xx_idle(void); extern void power4_idle(void); extern boot_infos_t *boot_infos; -char saved_command_line[COMMAND_LINE_SIZE]; unsigned char aux_device_present; struct ide_machdep_calls ppc_ide_md; char *sysmap; @@ -457,12 +457,6 @@ platform_init(unsigned long r3, unsigned } } } -#ifdef CONFIG_ADB - if (strstr(cmd_line, "adb_sync")) { - extern int __adb_probe_sync; - __adb_probe_sync = 1; - } -#endif /* CONFIG_ADB */ switch (_machine) { case _MACH_Pmac: @@ -473,6 +467,17 @@ platform_init(unsigned long r3, unsigned break; } } + +#ifdef CONFIG_ADB +/* Allow us to say that ADB probing will be done synchronously. */ +static int __init early_adb_sync(char *ign) +{ + extern int __adb_probe_sync; + __adb_probe_sync = 1; + return 0; +} +__early_param("adb_sync", early_adb_sync); +#endif /* CONFIG_ADB */ #endif /* CONFIG_PPC_MULTIPLATFORM */ struct bi_record *find_bootinfo(void) @@ -637,23 +642,10 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_XMON xmon_map_scc(); - if (strstr(cmd_line, "xmon")) - xmon(0); -#endif /* CONFIG_XMON */ - if ( ppc_md.progress ) ppc_md.progress("setup_arch: enter", 0x3eab); - -#if defined(CONFIG_KGDB) - if (ppc_md.kgdb_map_scc) - ppc_md.kgdb_map_scc(); - set_debug_traps(); - if (strstr(cmd_line, "gdb")) { - if (ppc_md.progress) - ppc_md.progress("setup_arch: kgdb breakpoint", 0x4000); - printk("kgdb breakpoint activated\n"); - breakpoint(); - } #endif + if ( ppc_md.progress ) ppc_md.progress("setup_arch: enter", 0x3eab); + /* * Set cache line size based on type of cpu as a default. * Systems with OF can look in the properties on the cpu node(s) @@ -670,14 +662,26 @@ void __init setup_arch(char **cmdline_p) /* reboot on panic */ panic_timeout = 180; + /* See if we need to grab the command line params from PPCBUG. */ +#ifdef CONFIG_PPCBUG_NVRAM + if (_machine == _MACH_prep) { + /* Read in NVRAM data */ + init_prep_nvram(); + + if (cmd_line[0] == '\0') { + char *bootargs = prep_nvram_get_var("bootargs"); + if (bootargs != NULL) + strlcpy(cmd_line, bootargs, sizeof(cmd_line)); + } + } +#endif + init_mm.start_code = PAGE_OFFSET; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; init_mm.brk = (unsigned long) klimit; - - /* Save unparsed command line copy for /proc/cmdline */ - strlcpy(saved_command_line, cmd_line, sizeof(saved_command_line)); *cmdline_p = cmd_line; + parse_early_options(cmdline_p); /* set up the bootmem stuff with available memory */ do_init_bootmem(); --- linux-2.6.5-rc3/arch/ppc/kernel/signal.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc/kernel/signal.c 2004-03-31 00:54:55.348501464 -0800 @@ -311,7 +311,7 @@ restore_sigmask(sigset_t *set) * (one which gets siginfo). */ static void -handle_rt_signal(unsigned long sig, struct k_sigaction *ka, +handle_rt_signal(unsigned long sig, struct k_sigaction *ka_copy, siginfo_t *info, sigset_t *oldset, struct pt_regs * regs, unsigned long newsp) { @@ -354,7 +354,7 @@ handle_rt_signal(unsigned long sig, stru regs->gpr[4] = (unsigned long) &rt_sf->info; regs->gpr[5] = (unsigned long) &rt_sf->uc; regs->gpr[6] = (unsigned long) rt_sf; - regs->nip = (unsigned long) ka->sa.sa_handler; + regs->nip = (unsigned long) ka_copy->sa.sa_handler; regs->link = (unsigned long) frame->tramp; regs->trap = 0; @@ -366,7 +366,7 @@ badframe: regs, frame, newsp); #endif if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; + current->sighand->action[SIGSEGV-1].sa.sa_handler = SIG_DFL; force_sig(SIGSEGV, current); } @@ -466,7 +466,7 @@ int sys_rt_sigreturn(int r3, int r4, int * OK, we're invoking a handler */ static void -handle_signal(unsigned long sig, struct k_sigaction *ka, +handle_signal(unsigned long sig, struct k_sigaction *ka_copy, siginfo_t *info, sigset_t *oldset, struct pt_regs * regs, unsigned long newsp) { @@ -491,7 +491,7 @@ handle_signal(unsigned long sig, struct #if _NSIG != 64 #error "Please adjust handle_signal()" #endif - if (__put_user((unsigned long) ka->sa.sa_handler, &sc->handler) + if (__put_user((unsigned long) ka_copy->sa.sa_handler, &sc->handler) || __put_user(oldset->sig[0], &sc->oldmask) || __put_user(oldset->sig[1], &sc->_unused[3]) || __put_user((struct pt_regs *)frame, &sc->regs) @@ -506,7 +506,7 @@ handle_signal(unsigned long sig, struct regs->gpr[1] = newsp; regs->gpr[3] = sig; regs->gpr[4] = (unsigned long) sc; - regs->nip = (unsigned long) ka->sa.sa_handler; + regs->nip = (unsigned long) ka_copy->sa.sa_handler; regs->link = (unsigned long) frame->mctx.tramp; regs->trap = 0; @@ -518,7 +518,7 @@ badframe: regs, frame, *newspp); #endif if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; + current->sighand->action[SIGSEGV-1].sa.sa_handler = SIG_DFL; force_sig(SIGSEGV, current); } @@ -565,18 +565,16 @@ badframe: int do_signal(sigset_t *oldset, struct pt_regs *regs) { siginfo_t info; - struct k_sigaction *ka; unsigned long frame, newsp; int signr, ret; + struct k_sigaction ka_copy; if (!oldset) oldset = ¤t->blocked; newsp = frame = 0; - signr = get_signal_to_deliver(&info, regs, NULL); - - ka = (signr == 0)? NULL: ¤t->sighand->action[signr-1]; + signr = get_signal_to_deliver(&info, &ka_copy, regs, NULL); if (TRAP(regs) == 0x0C00 /* System Call! */ && regs->ccr & 0x10000000 /* error signalled */ @@ -587,7 +585,7 @@ int do_signal(sigset_t *oldset, struct p if (signr > 0 && (ret == ERESTARTNOHAND || ret == ERESTART_RESTARTBLOCK || (ret == ERESTARTSYS - && !(ka->sa.sa_flags & SA_RESTART)))) { + && !(ka_copy.sa.sa_flags & SA_RESTART)))) { /* make the system call return an EINTR error */ regs->result = -EINTR; regs->gpr[3] = EINTR; @@ -606,7 +604,7 @@ int do_signal(sigset_t *oldset, struct p if (signr == 0) return 0; /* no signals delivered */ - if ((ka->sa.sa_flags & SA_ONSTACK) && current->sas_ss_size + if ((ka_copy.sa.sa_flags & SA_ONSTACK) && current->sas_ss_size && !on_sig_stack(regs->gpr[1])) newsp = current->sas_ss_sp + current->sas_ss_size; else @@ -614,17 +612,18 @@ int do_signal(sigset_t *oldset, struct p newsp &= ~0xfUL; /* Whee! Actually deliver the signal. */ - if (ka->sa.sa_flags & SA_SIGINFO) - handle_rt_signal(signr, ka, &info, oldset, regs, newsp); + if (ka_copy.sa.sa_flags & SA_SIGINFO) + handle_rt_signal(signr, &ka_copy, &info, oldset, regs, newsp); else - handle_signal(signr, ka, &info, oldset, regs, newsp); + handle_signal(signr, &ka_copy, &info, oldset, regs, newsp); - if (ka->sa.sa_flags & SA_ONESHOT) - ka->sa.sa_handler = SIG_DFL; + if (ka_copy.sa.sa_flags & SA_ONESHOT) + ka_copy.sa.sa_handler = SIG_DFL; - if (!(ka->sa.sa_flags & SA_NODEFER)) { + if (!(ka_copy.sa.sa_flags & SA_NODEFER)) { spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); + sigorsets(¤t->blocked, ¤t->blocked, + &ka_copy.sa.sa_mask); sigaddset(¤t->blocked, signr); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); --- linux-2.6.5-rc3/arch/ppc/kernel/vmlinux.lds.S 2003-11-23 19:03:00.000000000 -0800 +++ 25/arch/ppc/kernel/vmlinux.lds.S 2004-03-31 00:55:02.547407064 -0800 @@ -101,6 +101,9 @@ SECTIONS __setup_start = .; .init.setup : { *(.init.setup) } __setup_end = .; + __early_begin = .; + __early_param : { *(__early_param) } + __early_end = .; __start___param = .; __param : { *(__param) } __stop___param = .; --- linux-2.6.5-rc3/arch/ppc/mm/init.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/ppc/mm/init.c 2004-03-31 00:55:02.548406912 -0800 @@ -209,19 +209,9 @@ void MMU_setup(void) char *p, *q; unsigned long maxmem = 0; - for (q = cmd_line; (p = strstr(q, "mem=")) != 0; ) { - q = p + 4; - if (p > cmd_line && p[-1] != ' ') - continue; - maxmem = simple_strtoul(q, &q, 0); - if (*q == 'k' || *q == 'K') { - maxmem <<= 10; - ++q; - } else if (*q == 'm' || *q == 'M') { - maxmem <<= 20; - ++q; - } - } + for (q = cmd_line; (p = strstr(q, "mem=")) != 0; ) + maxmem = memparse(p + 4, &q); + __max_memory = maxmem; } } --- linux-2.6.5-rc3/arch/ppc/platforms/lopec_setup.c 2003-09-27 18:57:43.000000000 -0700 +++ 25/arch/ppc/platforms/lopec_setup.c 2004-03-31 00:55:02.548406912 -0800 @@ -33,7 +33,6 @@ #include #include -extern char saved_command_line[]; extern void lopec_find_bridges(void); /* @@ -327,21 +326,6 @@ lopec_setup_arch(void) #ifdef CONFIG_VT conswitchp = &dummy_con; #endif -#ifdef CONFIG_PPCBUG_NVRAM - /* Read in NVRAM data */ - init_prep_nvram(); - - /* if no bootargs, look in NVRAM */ - if ( cmd_line[0] == '\0' ) { - char *bootargs; - bootargs = prep_nvram_get_var("bootargs"); - if (bootargs != NULL) { - strcpy(cmd_line, bootargs); - /* again.. */ - strcpy(saved_command_line, cmd_line); - } - } -#endif } void __init --- linux-2.6.5-rc3/arch/ppc/platforms/pplus.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc/platforms/pplus.c 2004-03-31 00:55:02.549406760 -0800 @@ -48,8 +48,6 @@ TODC_ALLOC(); -extern char saved_command_line[]; - extern void pplus_setup_hose(void); extern void pplus_set_VIA_IDE_native(void); @@ -588,21 +586,6 @@ static void __init pplus_setup_arch(void #elif defined(CONFIG_DUMMY_CONSOLE) conswitchp = &dummy_con; #endif -#ifdef CONFIG_PPCBUG_NVRAM - /* Read in NVRAM data */ - init_prep_nvram(); - - /* if no bootargs, look in NVRAM */ - if (cmd_line[0] == '\0') { - char *bootargs; - bootargs = prep_nvram_get_var("bootargs"); - if (bootargs != NULL) { - strcpy(cmd_line, bootargs); - /* again.. */ - strcpy(saved_command_line, cmd_line); - } - } -#endif if (ppc_md.progress) ppc_md.progress("pplus_setup_arch: exit", 0); } --- linux-2.6.5-rc3/arch/ppc/platforms/prep_setup.c 2004-03-29 22:08:27.000000000 -0800 +++ 25/arch/ppc/platforms/prep_setup.c 2004-03-31 00:55:02.550406608 -0800 @@ -76,7 +76,6 @@ extern void rs_nvram_write_val(int addr, extern void ibm_prep_init(void); extern void prep_find_bridges(void); -extern char saved_command_line[]; int _prep_type; @@ -774,20 +773,6 @@ prep_setup_arch(void) break; } - /* Read in NVRAM data */ - init_prep_nvram(); - - /* if no bootargs, look in NVRAM */ - if ( cmd_line[0] == '\0' ) { - char *bootargs; - bootargs = prep_nvram_get_var("bootargs"); - if (bootargs != NULL) { - strcpy(cmd_line, bootargs); - /* again.. */ - strcpy(saved_command_line, cmd_line); - } - } - #ifdef CONFIG_SOUND_CS4232 prep_init_sound(); #endif /* CONFIG_SOUND_CS4232 */ --- linux-2.6.5-rc3/arch/ppc/xmon/xmon.c 2003-06-14 12:17:56.000000000 -0700 +++ 25/arch/ppc/xmon/xmon.c 2004-03-31 00:55:02.552406304 -0800 @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -262,6 +263,13 @@ xmon(struct pt_regs *excp) get_tb(start_tb[smp_processor_id()]); } +/* Allow us a hook to drop in early. */ +static void __init early_xmon(char **ign) +{ + xmon(0); +} +__early_param("xmon", early_xmon); + irqreturn_t xmon_irq(int irq, void *d, struct pt_regs *regs) { --- linux-2.6.5-rc3/arch/s390/kernel/binfmt_elf32.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/s390/kernel/binfmt_elf32.c 2004-03-31 00:54:56.461332288 -0800 @@ -115,7 +115,7 @@ static inline int dump_regs32(struct pt_ #include #include -int setup_arg_pages32(struct linux_binprm *bprm); +int setup_arg_pages32(struct linux_binprm *bprm, int executable_stack); #define elf_prstatus elf_prstatus32 struct elf_prstatus32 @@ -166,7 +166,7 @@ struct elf_prpsinfo32 #undef start_thread #define start_thread start_thread31 -#define setup_arg_pages(bprm) setup_arg_pages32(bprm) +#define setup_arg_pages(bprm, exec) setup_arg_pages32(bprm, exec) #define elf_map elf_map32 MODULE_DESCRIPTION("Binary format loader for compatibility with 32bit Linux for S390 binaries," --- linux-2.6.5-rc3/arch/s390/kernel/compat_exec.c 2003-07-10 18:50:30.000000000 -0700 +++ 25/arch/s390/kernel/compat_exec.c 2004-03-31 00:54:56.462332136 -0800 @@ -37,7 +37,7 @@ #undef STACK_TOP #define STACK_TOP TASK31_SIZE -int setup_arg_pages32(struct linux_binprm *bprm) +int setup_arg_pages32(struct linux_binprm *bprm, int executable_stack) { unsigned long stack_base; struct vm_area_struct *mpnt; @@ -66,6 +66,7 @@ int setup_arg_pages32(struct linux_binpr mpnt->vm_mm = mm; mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; mpnt->vm_end = STACK_TOP; + /* executable stack setting would be applied here */ mpnt->vm_page_prot = PAGE_COPY; mpnt->vm_flags = VM_STACK_FLAGS; mpnt->vm_ops = NULL; --- linux-2.6.5-rc3/arch/s390/kernel/compat_signal.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/s390/kernel/compat_signal.c 2004-03-31 00:54:55.036548888 -0800 @@ -449,10 +449,10 @@ static inline int map_signal(int sig) return sig; } -static void setup_frame32(int sig, struct k_sigaction *ka, +static void setup_frame32(int sig, struct k_sigaction *ka_copy, sigset_t *set, struct pt_regs * regs) { - sigframe32 *frame = get_sigframe(ka, regs, sizeof(sigframe32)); + sigframe32 *frame = get_sigframe(ka_copy, regs, sizeof(sigframe32)); if (!access_ok(VERIFY_WRITE, frame, sizeof(sigframe32))) goto give_sigsegv; @@ -466,8 +466,8 @@ static void setup_frame32(int sig, struc /* Set up to return from userspace. If provided, use a stub already in userspace. */ - if (ka->sa.sa_flags & SA_RESTORER) { - regs->gprs[14] = (__u64) ka->sa.sa_restorer; + if (ka_copy->sa.sa_flags & SA_RESTORER) { + regs->gprs[14] = (__u64) ka_copy->sa.sa_restorer; } else { regs->gprs[14] = (__u64) frame->retcode; if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, @@ -481,7 +481,7 @@ static void setup_frame32(int sig, struc /* Set up registers for signal handler */ regs->gprs[15] = (__u64) frame; - regs->psw.addr = (__u64) ka->sa.sa_handler; + regs->psw.addr = (__u64) ka_copy->sa.sa_handler; regs->gprs[2] = map_signal(sig); regs->gprs[3] = (__u64) &frame->sc; @@ -494,15 +494,16 @@ static void setup_frame32(int sig, struc give_sigsegv: if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; + ka_copy->sa.sa_handler = SIG_DFL; force_sig(SIGSEGV, current); } -static void setup_rt_frame32(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs * regs) +static void setup_rt_frame32(int sig, struct k_sigaction *ka_copy, + siginfo_t *info, sigset_t *set, + struct pt_regs * regs) { int err = 0; - rt_sigframe32 *frame = get_sigframe(ka, regs, sizeof(rt_sigframe32)); + rt_sigframe32 *frame = get_sigframe(ka_copy, regs, sizeof(rt_sigframe32)); if (!access_ok(VERIFY_WRITE, frame, sizeof(rt_sigframe32))) goto give_sigsegv; @@ -523,8 +524,8 @@ static void setup_rt_frame32(int sig, st /* Set up to return from userspace. If provided, use a stub already in userspace. */ - if (ka->sa.sa_flags & SA_RESTORER) { - regs->gprs[14] = (__u64) ka->sa.sa_restorer; + if (ka_copy->sa.sa_flags & SA_RESTORER) { + regs->gprs[14] = (__u64) ka_copy->sa.sa_restorer; } else { regs->gprs[14] = (__u64) frame->retcode; err |= __put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, @@ -537,7 +538,7 @@ static void setup_rt_frame32(int sig, st /* Set up registers for signal handler */ regs->gprs[15] = (__u64) frame; - regs->psw.addr = (__u64) ka->sa.sa_handler; + regs->psw.addr = (__u64) ka_copy->sa.sa_handler; regs->gprs[2] = map_signal(sig); regs->gprs[3] = (__u64) &frame->info; @@ -546,7 +547,7 @@ static void setup_rt_frame32(int sig, st give_sigsegv: if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; + ka_copy->sa.sa_handler = SIG_DFL; force_sig(SIGSEGV, current); } @@ -555,23 +556,22 @@ give_sigsegv: */ void -handle_signal32(unsigned long sig, siginfo_t *info, sigset_t *oldset, - struct pt_regs * regs) +handle_signal32(unsigned long sig, struct k_sigaction *ka_copy, + siginfo_t *info, sigset_t *oldset, struct pt_regs * regs) { - struct k_sigaction *ka = ¤t->sighand->action[sig-1]; - /* Set up the stack frame */ - if (ka->sa.sa_flags & SA_SIGINFO) - setup_rt_frame32(sig, ka, info, oldset, regs); + if (ka_copy->sa.sa_flags & SA_SIGINFO) + setup_rt_frame32(sig, ka_copy, info, oldset, regs); else - setup_frame32(sig, ka, oldset, regs); + setup_frame32(sig, ka_copy, oldset, regs); - if (ka->sa.sa_flags & SA_ONESHOT) - ka->sa.sa_handler = SIG_DFL; + if (ka_copy->sa.sa_flags & SA_ONESHOT) + ka_copy->sa.sa_handler = SIG_DFL; - if (!(ka->sa.sa_flags & SA_NODEFER)) { + if (!(ka_copy->sa.sa_flags & SA_NODEFER)) { spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); + sigorsets(¤t->blocked,¤t->blocked, + &ka_copy->sa.sa_mask); sigaddset(¤t->blocked,sig); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); --- linux-2.6.5-rc3/arch/s390/kernel/compat_wrapper.S 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/s390/kernel/compat_wrapper.S 2004-03-31 00:56:31.710852160 -0800 @@ -1352,3 +1352,12 @@ compat_sys_fstatfs64_wrapper: llgfr %r3,%r3 # compat_size_t llgtr %r4,%r4 # struct compat_statfs64 * jg compat_fstatfs64 + + .globl sys32_remap_file_pages_wrapper +sys32_remap_file_pages_wrapper: + llgfr %r2,%r2 # unsigned long + llgfr %r3,%r3 # unsigned long + llgfr %r4,%r4 # unsigned long + llgfr %r5,%r5 # unsigned long + llgfr %r6,%r6 # unsigned long + jg sys_remap_file_pages --- linux-2.6.5-rc3/arch/s390/kernel/setup.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/s390/kernel/setup.c 2004-03-31 00:55:04.800064608 -0800 @@ -74,7 +74,6 @@ extern int _text,_etext, _edata, _end; #include static char command_line[COMMAND_LINE_SIZE] = { 0, }; - char saved_command_line[COMMAND_LINE_SIZE]; static struct resource code_resource = { "Kernel code", 0x100000, 0 }; static struct resource data_resource = { "Kernel data", 0, 0 }; @@ -373,10 +372,6 @@ void __init setup_arch(char **cmdline_p) data_resource.start = (unsigned long) &_etext; data_resource.end = (unsigned long) &_edata - 1; - /* Save unparsed command line copy for /proc/cmdline */ - memcpy(saved_command_line, COMMAND_LINE, COMMAND_LINE_SIZE); - saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; - for (;;) { /* * "mem=XXX[kKmM]" sets memsize @@ -423,6 +418,7 @@ void __init setup_arch(char **cmdline_p) if (c == ' ' && to > command_line) to--; *to = '\0'; *cmdline_p = command_line; + parse_early_options(cmdline_p); /* * partially used pages are not usable - thus --- linux-2.6.5-rc3/arch/s390/kernel/signal.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/s390/kernel/signal.c 2004-03-31 00:54:55.037548736 -0800 @@ -303,10 +303,10 @@ static inline int map_signal(int sig) return sig; } -static void setup_frame(int sig, struct k_sigaction *ka, +static void setup_frame(int sig, struct k_sigaction *ka_copy, sigset_t *set, struct pt_regs * regs) { - sigframe *frame = get_sigframe(ka, regs, sizeof(sigframe)); + sigframe *frame = get_sigframe(ka_copy, regs, sizeof(sigframe)); if (!access_ok(VERIFY_WRITE, frame, sizeof(sigframe))) goto give_sigsegv; @@ -320,9 +320,9 @@ static void setup_frame(int sig, struct /* Set up to return from userspace. If provided, use a stub already in userspace. */ - if (ka->sa.sa_flags & SA_RESTORER) { + if (ka_copy->sa.sa_flags & SA_RESTORER) { regs->gprs[14] = (unsigned long) - ka->sa.sa_restorer | PSW_ADDR_AMODE; + ka_copy->sa.sa_restorer | PSW_ADDR_AMODE; } else { regs->gprs[14] = (unsigned long) frame->retcode | PSW_ADDR_AMODE; @@ -337,7 +337,7 @@ static void setup_frame(int sig, struct /* Set up registers for signal handler */ regs->gprs[15] = (unsigned long) frame; - regs->psw.addr = (unsigned long) ka->sa.sa_handler | PSW_ADDR_AMODE; + regs->psw.addr = (unsigned long) ka_copy->sa.sa_handler | PSW_ADDR_AMODE; regs->gprs[2] = map_signal(sig); regs->gprs[3] = (unsigned long) &frame->sc; @@ -350,15 +350,15 @@ static void setup_frame(int sig, struct give_sigsegv: if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; + ka_copy->sa.sa_handler = SIG_DFL; force_sig(SIGSEGV, current); } -static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +static void setup_rt_frame(int sig, struct k_sigaction *ka_copy, siginfo_t *info, sigset_t *set, struct pt_regs * regs) { int err = 0; - rt_sigframe *frame = get_sigframe(ka, regs, sizeof(rt_sigframe)); + rt_sigframe *frame = get_sigframe(ka_copy, regs, sizeof(rt_sigframe)); if (!access_ok(VERIFY_WRITE, frame, sizeof(rt_sigframe))) goto give_sigsegv; @@ -379,9 +379,9 @@ static void setup_rt_frame(int sig, stru /* Set up to return from userspace. If provided, use a stub already in userspace. */ - if (ka->sa.sa_flags & SA_RESTORER) { + if (ka_copy->sa.sa_flags & SA_RESTORER) { regs->gprs[14] = (unsigned long) - ka->sa.sa_restorer | PSW_ADDR_AMODE; + ka_copy->sa.sa_restorer | PSW_ADDR_AMODE; } else { regs->gprs[14] = (unsigned long) frame->retcode | PSW_ADDR_AMODE; @@ -395,7 +395,7 @@ static void setup_rt_frame(int sig, stru /* Set up registers for signal handler */ regs->gprs[15] = (unsigned long) frame; - regs->psw.addr = (unsigned long) ka->sa.sa_handler | PSW_ADDR_AMODE; + regs->psw.addr = (unsigned long) ka_copy->sa.sa_handler | PSW_ADDR_AMODE; regs->gprs[2] = map_signal(sig); regs->gprs[3] = (unsigned long) &frame->info; @@ -404,7 +404,7 @@ static void setup_rt_frame(int sig, stru give_sigsegv: if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; + ka_copy->sa.sa_handler = SIG_DFL; force_sig(SIGSEGV, current); } @@ -413,23 +413,22 @@ give_sigsegv: */ static void -handle_signal(unsigned long sig, siginfo_t *info, sigset_t *oldset, - struct pt_regs * regs) +handle_signal(unsigned long sig, struct k_sigaction *ka_copy, + siginfo_t *info, sigset_t *oldset, struct pt_regs * regs) { - struct k_sigaction *ka = ¤t->sighand->action[sig-1]; - /* Set up the stack frame */ - if (ka->sa.sa_flags & SA_SIGINFO) - setup_rt_frame(sig, ka, info, oldset, regs); + if (ka_copy->sa.sa_flags & SA_SIGINFO) + setup_rt_frame(sig, ka_copy, info, oldset, regs); else - setup_frame(sig, ka, oldset, regs); + setup_frame(sig, ka_copy, oldset, regs); - if (ka->sa.sa_flags & SA_ONESHOT) - ka->sa.sa_handler = SIG_DFL; + if (ka_copy->sa.sa_flags & SA_ONESHOT) + ka_copy->sa.sa_handler = SIG_DFL; - if (!(ka->sa.sa_flags & SA_NODEFER)) { + if (!(ka_copy->sa.sa_flags & SA_NODEFER)) { spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); + sigorsets(¤t->blocked,¤t->blocked, + &ka_copy->sa.sa_mask); sigaddset(¤t->blocked,sig); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); @@ -450,6 +449,7 @@ int do_signal(struct pt_regs *regs, sigs unsigned long retval = 0, continue_addr = 0, restart_addr = 0; siginfo_t info; int signr; + struct k_sigaction ka_copy; /* * We want the common case to go fast, which @@ -483,7 +483,7 @@ int do_signal(struct pt_regs *regs, sigs /* Get signal to deliver. When running under ptrace, at this point the debugger may change all our registers ... */ - signr = get_signal_to_deliver(&info, regs, NULL); + signr = get_signal_to_deliver(&info, &ka_copy, regs, NULL); /* Depending on the signal settings we may need to revert the decision to restart the system call. */ @@ -502,14 +502,15 @@ int do_signal(struct pt_regs *regs, sigs #ifdef CONFIG_S390_SUPPORT if (test_thread_flag(TIF_31BIT)) { extern void handle_signal32(unsigned long sig, + struct k_sigaction *ka_copy, siginfo_t *info, sigset_t *oldset, struct pt_regs *regs); - handle_signal32(signr, &info, oldset, regs); + handle_signal32(signr, &ka_copy, &info, oldset, regs); return 1; } #endif - handle_signal(signr, &info, oldset, regs); + handle_signal(signr, &ka_copy, &info, oldset, regs); return 1; } --- linux-2.6.5-rc3/arch/s390/kernel/syscalls.S 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/s390/kernel/syscalls.S 2004-03-31 00:56:31.711852008 -0800 @@ -275,3 +275,4 @@ NI_SYSCALL /* reserved for vserver SYSCALL(s390_fadvise64_64,sys_ni_syscall,sys32_fadvise64_64_wrapper) SYSCALL(sys_statfs64,sys_statfs64,compat_sys_statfs64_wrapper) SYSCALL(sys_fstatfs64,sys_fstatfs64,compat_sys_fstatfs64_wrapper) +SYSCALL(sys_remap_file_pages,sys_remap_file_pages,sys32_remap_file_pages_wrapper) --- linux-2.6.5-rc3/arch/s390/kernel/vmlinux.lds.S 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/s390/kernel/vmlinux.lds.S 2004-03-31 00:55:04.800064608 -0800 @@ -77,6 +77,9 @@ SECTIONS __setup_start = .; .init.setup : { *(.init.setup) } __setup_end = .; + __early_begin = .; + __early_param : { *(__early_param) } + __early_end = .; __start___param = .; __param : { *(__param) } __stop___param = .; --- linux-2.6.5-rc3/arch/s390/mm/fault.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/s390/mm/fault.c 2004-03-31 00:56:31.711852008 -0800 @@ -258,6 +258,8 @@ survive: goto do_sigbus; case VM_FAULT_OOM: goto out_of_memory; + case VM_FAULT_SIGSEGV: + goto bad_area; default: BUG(); } --- linux-2.6.5-rc3/arch/sh/kernel/setup.c 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/sh/kernel/setup.c 2004-03-31 00:55:04.948042112 -0800 @@ -25,6 +25,7 @@ #include #include #include +#include #ifdef CONFIG_SH_EARLY_PRINTK #include #endif @@ -85,14 +86,12 @@ static struct sh_machine_vector* __init #define INITRD_SIZE (*(unsigned long *) (PARAM+0x014)) /* ... */ #define COMMAND_LINE ((char *) (PARAM+0x100)) -#define COMMAND_LINE_SIZE 256 #define RAMDISK_IMAGE_START_MASK 0x07FF #define RAMDISK_PROMPT_FLAG 0x8000 #define RAMDISK_LOAD_FLAG 0x4000 static char command_line[COMMAND_LINE_SIZE] = { 0, }; - char saved_command_line[COMMAND_LINE_SIZE]; struct resource standard_io_resources[] = { { "dma1", 0x00, 0x1f }, @@ -252,10 +251,6 @@ static inline void parse_cmdline (char * char c = ' ', *to = command_line, *from = COMMAND_LINE; int len = 0; - /* Save unparsed command line copy for /proc/cmdline */ - memcpy(saved_command_line, COMMAND_LINE, COMMAND_LINE_SIZE); - saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; - memory_start = (unsigned long)PAGE_OFFSET+__MEMORY_START; memory_end = memory_start + __MEMORY_SIZE; @@ -411,6 +406,7 @@ void __init setup_arch(char **cmdline_p) data_resource.end = virt_to_bus(_edata)-1; sh_mv_setup(cmdline_p); + parse_early_options(cmdline_p); #define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) #define PFN_DOWN(x) ((x) >> PAGE_SHIFT) --- linux-2.6.5-rc3/arch/sh/kernel/vmlinux.lds.S 2003-08-22 19:23:40.000000000 -0700 +++ 25/arch/sh/kernel/vmlinux.lds.S 2004-03-31 00:55:04.949041960 -0800 @@ -66,6 +66,9 @@ SECTIONS __setup_start = .; .init.setup : { *(.init.setup) } __setup_end = .; + __early_begin = .; + __early_param : { *(__early_param) } + __early_end = .; __start___param = .; __param : { *(__param) } __stop___param = .; --- linux-2.6.5-rc3/arch/sh/mm/hugetlbpage.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/sh/mm/hugetlbpage.c 2004-03-31 00:56:27.132548168 -0800 @@ -33,7 +33,7 @@ static spinlock_t htlbpage_lock = SPIN_L static void enqueue_huge_page(struct page *page) { - list_add(&page->list, + list_add(&page->lru, &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]); } @@ -51,7 +51,7 @@ static struct page *dequeue_huge_page(vo !list_empty(&hugepage_freelists[nid])) { page = list_entry(hugepage_freelists[nid].next, struct page, list); - list_del(&page->list); + list_del(&page->lru); } return page; } @@ -254,8 +254,6 @@ static void free_huge_page(struct page * BUG_ON(page_count(page)); BUG_ON(page->mapping); - INIT_LIST_HEAD(&page->list); - spin_lock(&htlbpage_lock); enqueue_huge_page(page); htlbpagemem++; @@ -388,7 +386,7 @@ static int try_to_free_low(int count) /* all lowmem is on node 0 */ list_for_each(p, &hugepage_freelists[0]) { if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; map = NULL; @@ -400,7 +398,7 @@ static int try_to_free_low(int count) map = page; } if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; count++; @@ -501,6 +499,12 @@ int is_hugepage_mem_enough(size_t size) return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem; } +/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ +unsigned long hugetlb_total_pages(void) +{ + return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE); +} + /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the --- linux-2.6.5-rc3/arch/sparc64/Kconfig 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/sparc64/Kconfig 2004-03-31 00:56:32.608715664 -0800 @@ -687,12 +687,19 @@ config DEBUG_BOOTMEM depends on DEBUG_KERNEL bool "Debug BOOTMEM initialization" +config LOCKMETER + bool "Kernel lock metering" + depends on SMP && !PREEMPT + help + Say Y to enable kernel lock metering, which adds overhead to SMP locks, + but allows you to see various statistics using the lockstat command. + # We have a custom atomic_dec_and_lock() implementation but it's not # compatible with spinlock debugging so we need to fall back on # the generic version in that case. config HAVE_DEC_LOCK bool - depends on SMP && !DEBUG_SPINLOCK + depends on SMP && !DEBUG_SPINLOCK && !LOCKMETER default y config MCOUNT --- linux-2.6.5-rc3/arch/sparc64/kernel/binfmt_aout32.c 2003-06-14 12:18:21.000000000 -0700 +++ 25/arch/sparc64/kernel/binfmt_aout32.c 2004-03-31 00:54:56.462332136 -0800 @@ -310,7 +310,7 @@ beyond_if: orig_thr_flags = current_thread_info()->flags; current_thread_info()->flags |= _TIF_32BIT; - retval = setup_arg_pages(bprm); + retval = setup_arg_pages(bprm, EXSTACK_DEFAULT); if (retval < 0) { current_thread_info()->flags = orig_thr_flags; --- linux-2.6.5-rc3/arch/sparc64/kernel/setup.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/sparc64/kernel/setup.c 2004-03-31 00:55:05.259994688 -0800 @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef CONFIG_IP_PNP #include @@ -451,8 +452,7 @@ extern unsigned short ram_flags; extern int root_mountflags; -char saved_command_line[256]; -char reboot_command[256]; +char reboot_command[COMMAND_LINE_SIZE]; static struct pt_regs fake_swapper_regs = { { 0, }, 0, 0, 0, 0 }; @@ -476,7 +476,7 @@ void __init setup_arch(char **cmdline_p) /* Initialize PROM console and command line. */ *cmdline_p = prom_getbootargs(); - strcpy(saved_command_line, *cmdline_p); + parse_early_options(cmdline_p); printk("ARCH: SUN4U\n"); --- linux-2.6.5-rc3/arch/sparc64/kernel/vmlinux.lds.S 2003-08-22 19:23:40.000000000 -0700 +++ 25/arch/sparc64/kernel/vmlinux.lds.S 2004-03-31 00:55:05.260994536 -0800 @@ -51,6 +51,9 @@ SECTIONS __setup_start = .; .init.setup : { *(.init.setup) } __setup_end = .; + __early_begin = .; + __early_param : { *(__early_param) } + __early_end = .; __start___param = .; __param : { *(__param) } __stop___param = .; --- linux-2.6.5-rc3/arch/sparc64/lib/rwlock.S 2003-11-23 19:03:00.000000000 -0800 +++ 25/arch/sparc64/lib/rwlock.S 2004-03-31 00:56:32.608715664 -0800 @@ -85,5 +85,20 @@ __write_trylock_succeed: __write_trylock_fail: retl mov 0, %o0 + + .globl __read_trylock +__read_trylock: /* %o0 = lock_ptr */ + ldsw [%o0], %g5 + brlz,pn %g5, 100f + add %g5, 1, %g7 + cas [%o0], %g5, %g7 + cmp %g5, %g7 + bne,pn %icc, __read_trylock + membar #StoreLoad | #StoreStore + retl + mov 1, %o0 +100: retl + mov 0, %o0 + rwlock_impl_end: --- linux-2.6.5-rc3/arch/sparc64/mm/hugetlbpage.c 2004-01-09 00:04:31.000000000 -0800 +++ 25/arch/sparc64/mm/hugetlbpage.c 2004-03-31 00:56:27.578480376 -0800 @@ -29,7 +29,7 @@ static spinlock_t htlbpage_lock = SPIN_L static void enqueue_huge_page(struct page *page) { - list_add(&page->list, + list_add(&page->lru, &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]); } @@ -46,8 +46,8 @@ static struct page *dequeue_huge_page(vo if (nid >= 0 && nid < MAX_NUMNODES && !list_empty(&hugepage_freelists[nid])) { page = list_entry(hugepage_freelists[nid].next, - struct page, list); - list_del(&page->list); + struct page, lru); + list_del(&page->lru); } return page; } @@ -248,9 +248,8 @@ struct page *follow_huge_pmd(struct mm_s static void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); - INIT_LIST_HEAD(&page->list); + INIT_LIST_HEAD(&page->lru); spin_lock(&htlbpage_lock); enqueue_huge_page(page); @@ -384,19 +383,19 @@ static int try_to_free_low(int count) /* all lowmem is on node 0 */ list_for_each(p, &hugepage_freelists[0]) { if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; map = NULL; if (++count == 0) break; } - page = list_entry(p, struct page, list); + page = list_entry(p, struct page, lru); if (!PageHighMem(page)) map = page; } if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; count++; @@ -497,6 +496,12 @@ int is_hugepage_mem_enough(size_t size) return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem; } +/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ +unsigned long hugetlb_total_pages(void) +{ + return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE); +} + /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the --- linux-2.6.5-rc3/arch/sparc64/solaris/misc.c 2003-10-17 15:58:03.000000000 -0700 +++ 25/arch/sparc64/solaris/misc.c 2004-03-31 00:54:38.527058712 -0800 @@ -402,7 +402,7 @@ asmlinkage int solaris_procids(int cmd, Solaris setpgrp and setsid? */ ret = sys_setpgid(0, 0); if (ret) return ret; - current->tty = NULL; + current->signal->tty = NULL; return process_group(current); } case 2: /* getsid */ --- linux-2.6.5-rc3/arch/sparc/kernel/setup.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/sparc/kernel/setup.c 2004-03-31 00:55:05.108017792 -0800 @@ -47,6 +47,7 @@ #include #include #include +#include struct screen_info screen_info = { 0, 0, /* orig-x, orig-y */ @@ -244,8 +245,7 @@ extern unsigned short ram_flags; extern int root_mountflags; -char saved_command_line[256]; -char reboot_command[256]; +char reboot_command[COMMAND_LINE_SIZE]; enum sparc_cpu sparc_cpu_model; struct tt_entry *sparc_ttable; @@ -263,7 +263,7 @@ void __init setup_arch(char **cmdline_p) /* Initialize PROM console and command line. */ *cmdline_p = prom_getbootargs(); - strcpy(saved_command_line, *cmdline_p); + parse_early_options(cmdline_p); /* Set sparc_cpu_model */ sparc_cpu_model = sun_unknown; --- linux-2.6.5-rc3/arch/sparc/kernel/vmlinux.lds.S 2003-08-22 19:23:40.000000000 -0700 +++ 25/arch/sparc/kernel/vmlinux.lds.S 2004-03-31 00:55:05.108017792 -0800 @@ -45,6 +45,9 @@ SECTIONS __setup_start = .; .init.setup : { *(.init.setup) } __setup_end = .; + __early_begin = .; + __early_param : { *(__early_param) } + __early_end = .; __start___param = .; __param : { *(__param) } __stop___param = .; --- linux-2.6.5-rc3/arch/um/Kconfig 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/um/Kconfig 2004-03-30 23:39:45.000000000 -0800 @@ -261,7 +261,7 @@ config GCOV This option allows developers to retrieve coverage data from a UML session. - See for more + See for more details. If you're involved in UML kernel development and want to use gcov, --- linux-2.6.5-rc3/arch/um/kernel/um_arch.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/um/kernel/um_arch.c 2004-03-31 00:55:05.409971888 -0800 @@ -395,6 +395,7 @@ void __init setup_arch(char **cmdline_p) paging_init(); strcpy(command_line, saved_command_line); *cmdline_p = command_line; + parse_early_options(cmdline_p); setup_hostinfo(); } --- linux-2.6.5-rc3/arch/um/kernel/user_util.c 2003-06-14 12:17:59.000000000 -0700 +++ 25/arch/um/kernel/user_util.c 2004-03-31 00:55:05.409971888 -0800 @@ -34,7 +34,6 @@ #define COMMAND_LINE_SIZE _POSIX_ARG_MAX /* Changed in linux_main and setup_arch, which run before SMP is started */ -char saved_command_line[COMMAND_LINE_SIZE] = { 0 }; char command_line[COMMAND_LINE_SIZE] = { 0 }; void add_arg(char *cmd_line, char *arg) --- linux-2.6.5-rc3/arch/v850/kernel/setup.c 2004-02-17 20:48:42.000000000 -0800 +++ 25/arch/v850/kernel/setup.c 2004-03-31 00:55:05.566948024 -0800 @@ -40,8 +40,7 @@ extern char _root_fs_image_start __attri extern char _root_fs_image_end __attribute__ ((__weak__)); -char command_line[512]; -char saved_command_line[512]; +char command_line[COMMAND_LINE_SIZE]; /* Memory not used by the kernel. */ static unsigned long total_ram_pages; @@ -61,10 +60,8 @@ void set_mem_root (void *addr, size_t le void __init setup_arch (char **cmdline) { - /* Keep a copy of command line */ *cmdline = command_line; - memcpy (saved_command_line, command_line, sizeof saved_command_line); - saved_command_line[sizeof saved_command_line - 1] = '\0'; + parse_early_options(cmdline_p); console_verbose (); --- linux-2.6.5-rc3/arch/v850/kernel/vmlinux.lds.S 2003-10-08 15:07:08.000000000 -0700 +++ 25/arch/v850/kernel/vmlinux.lds.S 2004-03-31 00:55:05.566948024 -0800 @@ -109,6 +109,11 @@ *(.init.setup) /* 2.5 convention */ \ *(.setup.init) /* 2.4 convention */ \ ___setup_end = . ; \ + + __early_begin = .; + *(__early_param) + __early_end = .; + ___start___param = . ; \ *(__param) \ ___stop___param = . ; \ --- linux-2.6.5-rc3/arch/x86_64/boot/setup.S 2004-03-10 20:41:26.000000000 -0800 +++ 25/arch/x86_64/boot/setup.S 2004-03-30 23:39:44.000000000 -0800 @@ -533,6 +533,8 @@ is_disk1: movw $0xAA, (0x1ff) # device present no_psmouse: +#include "../../i386/boot/edd.S" + # Now we want to move to protected mode ... cmpw $0, %cs:realmode_swtch jz rmodeswtch_normal --- linux-2.6.5-rc3/arch/x86_64/ia32/ia32_aout.c 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/x86_64/ia32/ia32_aout.c 2004-03-31 00:54:56.463331984 -0800 @@ -35,7 +35,7 @@ #undef WARN_OLD #undef CORE_DUMP /* probably broken */ -extern int ia32_setup_arg_pages(struct linux_binprm *bprm); +extern int ia32_setup_arg_pages(struct linux_binprm *bprm, int exec_stack); static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); static int load_aout_library(struct file*); @@ -395,7 +395,7 @@ beyond_if: set_brk(current->mm->start_brk, current->mm->brk); - retval = ia32_setup_arg_pages(bprm); + retval = ia32_setup_arg_pages(bprm, EXSTACK_DEFAULT); if (retval < 0) { /* Someone check-me: is this error path enough? */ send_sig(SIGKILL, current, 0); --- linux-2.6.5-rc3/arch/x86_64/ia32/ia32_binfmt.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/ia32/ia32_binfmt.c 2004-03-31 00:54:56.464331832 -0800 @@ -272,8 +272,8 @@ do { \ #define load_elf_binary load_elf32_binary #define ELF_PLAT_INIT(r, load_addr) elf32_init(r) -#define setup_arg_pages(bprm) ia32_setup_arg_pages(bprm) -int ia32_setup_arg_pages(struct linux_binprm *bprm); +#define setup_arg_pages(bprm, exec_stack) ia32_setup_arg_pages(bprm, exec_stack) +int ia32_setup_arg_pages(struct linux_binprm *bprm, int executable_stack); #undef start_thread #define start_thread(regs,new_rip,new_rsp) do { \ @@ -325,7 +325,7 @@ static void elf32_init(struct pt_regs *r me->thread.es = __USER_DS; } -int setup_arg_pages(struct linux_binprm *bprm) +int setup_arg_pages(struct linux_binprm *bprm, int executable_stack) { unsigned long stack_base; struct vm_area_struct *mpnt; @@ -354,7 +354,12 @@ int setup_arg_pages(struct linux_binprm mpnt->vm_mm = mm; mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; mpnt->vm_end = IA32_STACK_TOP; - mpnt->vm_flags = vm_stack_flags32; + if (executable_stack == EXSTACK_ENABLE_X) + mpnt->vm_flags = vm_stack_flags32 | VM_EXEC; + else if (executable_stack == EXSTACK_DISABLE_X) + mpnt->vm_flags = vm_stack_flags32 & ~VM_EXEC; + else + mpnt->vm_flags = vm_stack_flags32; mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ? PAGE_COPY_EXEC : PAGE_COPY; mpnt->vm_ops = NULL; @@ -370,7 +375,7 @@ int setup_arg_pages(struct linux_binprm struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base,PAGE_COPY_EXEC); + put_dirty_page(current,page,stack_base,mpnt->vm_page_prot); } stack_base += PAGE_SIZE; } --- linux-2.6.5-rc3/arch/x86_64/ia32/ia32entry.S 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/ia32/ia32entry.S 2004-03-31 00:56:31.019957192 -0800 @@ -78,8 +78,8 @@ ENTRY(ia32_sysenter_target) .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) - bt $TIF_SYSCALL_TRACE,threadinfo_flags(%r10) - jc sysenter_tracesys + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + jnz sysenter_tracesys sysenter_do_call: cmpl $(IA32_NR_syscalls),%eax jae ia32_badsys @@ -106,7 +106,7 @@ sysenter_tracesys: CLEAR_RREGS movq $-ENOSYS,RAX(%rsp) /* really needed? */ movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace + call syscall_trace_enter LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST movl %ebp, %ebp @@ -163,8 +163,8 @@ ENTRY(ia32_cstar_target) .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) - bt $TIF_SYSCALL_TRACE,threadinfo_flags(%r10) - jc cstar_tracesys + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + jnz cstar_tracesys cstar_do_call: cmpl $IA32_NR_syscalls,%eax jae ia32_badsys @@ -187,7 +187,7 @@ cstar_tracesys: CLEAR_RREGS movq $-ENOSYS,RAX(%rsp) /* really needed? */ movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace + call syscall_trace_enter LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST movl RSP-ARGOFFSET(%rsp), %r8d @@ -236,8 +236,8 @@ ENTRY(ia32_syscall) this could be a problem. */ SAVE_ARGS 0,0,1 GET_THREAD_INFO(%r10) - bt $TIF_SYSCALL_TRACE,threadinfo_flags(%r10) - jc ia32_tracesys + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + jnz ia32_tracesys ia32_do_syscall: cmpl $(IA32_NR_syscalls),%eax jae ia32_badsys @@ -251,7 +251,7 @@ ia32_tracesys: SAVE_REST movq $-ENOSYS,RAX(%rsp) /* really needed? */ movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace + call syscall_trace_enter LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST jmp ia32_do_syscall @@ -562,7 +562,7 @@ ia32_sys_call_table: .quad sys_epoll_create .quad sys_epoll_ctl .quad sys_epoll_wait - .quad sys_remap_file_pages + .quad old_remap_file_pages .quad sys_set_tid_address .quad sys32_timer_create .quad compat_timer_settime --- linux-2.6.5-rc3/arch/x86_64/ia32/ia32_signal.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/ia32/ia32_signal.c 2004-03-31 00:54:55.197524416 -0800 @@ -391,7 +391,8 @@ ia32_setup_sigcontext(struct sigcontext_ * Determine which stack to use.. */ static void * -get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) +get_sigframe(struct k_sigaction *ka_copy, struct pt_regs * regs, + size_t frame_size) { unsigned long rsp; @@ -399,28 +400,28 @@ get_sigframe(struct k_sigaction *ka, str rsp = regs->rsp; /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { + if (ka_copy->sa.sa_flags & SA_ONSTACK) { if (sas_ss_flags(rsp) == 0) rsp = current->sas_ss_sp + current->sas_ss_size; } /* This is the legacy signal stack switching. */ else if ((regs->ss & 0xffff) != __USER_DS && - !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) { - rsp = (unsigned long) ka->sa.sa_restorer; + !(ka_copy->sa.sa_flags & SA_RESTORER) && + ka_copy->sa.sa_restorer) { + rsp = (unsigned long) ka_copy->sa.sa_restorer; } return (void *)((rsp - frame_size) & -8UL); } -void ia32_setup_frame(int sig, struct k_sigaction *ka, +void ia32_setup_frame(int sig, struct k_sigaction *ka_copy, compat_sigset_t *set, struct pt_regs * regs) { struct sigframe *frame; int err = 0; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka_copy, regs, sizeof(*frame)); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; @@ -451,8 +452,8 @@ void ia32_setup_frame(int sig, struct k_ /* Return stub is in 32bit vsyscall page */ { void *restorer = VSYSCALL32_SIGRETURN; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; + if (ka_copy->sa.sa_flags & SA_RESTORER) + restorer = ka_copy->sa.sa_restorer; err |= __put_user(ptr_to_u32(restorer), &frame->pretcode); } /* These are actually not used anymore, but left because some @@ -477,7 +478,7 @@ void ia32_setup_frame(int sig, struct k_ /* Set up registers for signal handler */ regs->rsp = (unsigned long) frame; - regs->rip = (unsigned long) ka->sa.sa_handler; + regs->rip = (unsigned long) ka_copy->sa.sa_handler; asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); @@ -497,17 +498,17 @@ void ia32_setup_frame(int sig, struct k_ give_sigsegv: if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; + current->sighand->action[SIGSEGV-1].sa.sa_handler = SIG_DFL; signal_fault(regs,frame,"32bit signal deliver"); } -void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +void ia32_setup_rt_frame(int sig, struct k_sigaction *ka_copy, siginfo_t *info, compat_sigset_t *set, struct pt_regs * regs) { struct rt_sigframe *frame; int err = 0; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka_copy, regs, sizeof(*frame)); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; @@ -544,8 +545,8 @@ void ia32_setup_rt_frame(int sig, struct { void *restorer = VSYSCALL32_RTSIGRETURN; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; + if (ka_copy->sa.sa_flags & SA_RESTORER) + restorer = ka_copy->sa.sa_restorer; err |= __put_user(ptr_to_u32(restorer), &frame->pretcode); } @@ -573,7 +574,7 @@ void ia32_setup_rt_frame(int sig, struct /* Set up registers for signal handler */ regs->rsp = (unsigned long) frame; - regs->rip = (unsigned long) ka->sa.sa_handler; + regs->rip = (unsigned long) ka_copy->sa.sa_handler; asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); @@ -593,7 +594,7 @@ void ia32_setup_rt_frame(int sig, struct give_sigsegv: if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; + current->sighand->action[SIGSEGV-1].sa.sa_handler = SIG_DFL; signal_fault(regs, frame, "32bit rt signal setup"); } --- linux-2.6.5-rc3/arch/x86_64/ia32/sys_ia32.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/ia32/sys_ia32.c 2004-03-31 00:54:51.660062192 -0800 @@ -832,10 +832,11 @@ sys32_writev(int fd, struct compat_iovec asmlinkage long sys32_time(int * tloc) { int i; + struct timeval tv; + + do_gettimeofday(&tv); + i = tv.tv_sec; - /* SMP: This is fairly trivial. We grab CURRENT_TIME and - stuff it to user space. No side effects */ - i = get_seconds(); if (tloc) { if (put_user(i,tloc)) i = -EFAULT; @@ -1749,7 +1750,7 @@ asmlinkage long sys32_open(const char * char * tmp; int fd, error; - /* don't force O_LARGEFILE */ + /* don't force O_KERNEL_LARGEFILE */ tmp = getname(filename); fd = PTR_ERR(tmp); if (!IS_ERR(tmp)) { --- linux-2.6.5-rc3/arch/x86_64/Kconfig 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/Kconfig 2004-03-30 23:39:44.000000000 -0800 @@ -401,6 +401,8 @@ endmenu source drivers/Kconfig +source "drivers/firmware/Kconfig" + source fs/Kconfig source "arch/x86_64/oprofile/Kconfig" @@ -462,6 +464,7 @@ config INIT_DEBUG config DEBUG_INFO bool "Compile the kernel with debug info" depends on DEBUG_KERNEL + default n help If you say Y here the resulting kernel image will include debugging info resulting in a larger kernel image. @@ -480,11 +483,18 @@ config FRAME_POINTER config IOMMU_DEBUG depends on GART_IOMMU && DEBUG_KERNEL - bool "Force IOMMU to on" + bool "Enable IOMMU debugging" help - Force the IOMMU to on even when you have less than 4GB of memory and add - debugging code. - Can be disabled at boot time with iommu=noforce. + Force the IOMMU to on even when you have less than 4GB of + memory and add debugging code. On overflow always panic. And + allow to enable IOMMU leak tracing. Can be disabled at boot + time with iommu=noforce. This will also enable scatter gather + list merging. Currently not recommended for production + code. When you use it make sure you have a big enough + IOMMU/AGP aperture. Most of the options enabled by this can + be set more finegrained using the iommu= command line + options. See Documentation/x86_64/boot-options.txt for more + details. config IOMMU_LEAK bool "IOMMU leak tracing" @@ -493,9 +503,8 @@ config IOMMU_LEAK help Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. - -#config X86_REMOTE_DEBUG -# bool "kgdb debugging stub" + +source "arch/x86_64/Kconfig.kgdb" endmenu --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/arch/x86_64/Kconfig.kgdb 2004-03-30 23:39:44.000000000 -0800 @@ -0,0 +1,176 @@ +config KGDB + bool "Include kgdb kernel debugger" + depends on DEBUG_KERNEL + select DEBUG_INFO + help + If you say Y here, the system will be compiled with the debug + option (-g) and a debugging stub will be included in the + kernel. This stub communicates with gdb on another (host) + computer via a serial port. The host computer should have + access to the kernel binary file (vmlinux) and a serial port + that is connected to the target machine. Gdb can be made to + configure the serial port or you can use stty and setserial to + do this. See the 'target' command in gdb. This option also + configures in the ability to request a breakpoint early in the + boot process. To request the breakpoint just include 'kgdb' + as a boot option when booting the target machine. The system + will then break as soon as it looks at the boot options. This + option also installs a breakpoint in panic and sends any + kernel faults to the debugger. For more information see the + Documentation/i386/kgdb.txt file. + +choice + depends on KGDB + prompt "Debug serial port BAUD" + default KGDB_115200BAUD + help + Gdb and the kernel stub need to agree on the baud rate to be + used. Some systems (x86 family at this writing) allow this to + be configured. + +config KGDB_9600BAUD + bool "9600" + +config KGDB_19200BAUD + bool "19200" + +config KGDB_38400BAUD + bool "38400" + +config KGDB_57600BAUD + bool "57600" + +config KGDB_115200BAUD + bool "115200" +endchoice + +config KGDB_PORT + hex "hex I/O port address of the debug serial port" + depends on KGDB + default 3f8 + help + Some systems (x86 family at this writing) allow the port + address to be configured. The number entered is assumed to be + hex, don't put 0x in front of it. The standard address are: + COM1 3f8 , irq 4 and COM2 2f8 irq 3. Setserial /dev/ttySx + will tell you what you have. It is good to test the serial + connection with a live system before trying to debug. + +config KGDB_IRQ + int "IRQ of the debug serial port" + depends on KGDB + default 4 + help + This is the irq for the debug port. If everything is working + correctly and the kernel has interrupts on a control C to the + port should cause a break into the kernel debug stub. + +config DEBUG_INFO + bool + depends on KGDB + default y + +config KGDB_MORE + bool "Add any additional compile options" + depends on KGDB + default n + help + Saying yes here turns on the ability to enter additional + compile options. + + +config KGDB_OPTIONS + depends on KGDB_MORE + string "Additional compile arguments" + default "-O1" + help + This option allows you enter additional compile options for + the whole kernel compile. Each platform will have a default + that seems right for it. For example on PPC "-ggdb -O1", and + for i386 "-O1". Note that by configuring KGDB "-g" is already + turned on. In addition, on i386 platforms + "-fomit-frame-pointer" is deleted from the standard compile + options. + +config NO_KGDB_CPUS + int "Number of CPUs" + depends on KGDB && SMP + default NR_CPUS + help + + This option sets the number of cpus for kgdb ONLY. It is used + to prune some internal structures so they look "nice" when + displayed with gdb. This is to overcome possibly larger + numbers that may have been entered above. Enter the real + number to get nice clean kgdb_info displays. + +config KGDB_TS + bool "Enable kgdb time stamp macros?" + depends on KGDB + default n + help + Kgdb event macros allow you to instrument your code with calls + to the kgdb event recording function. The event log may be + examined with gdb at a break point. Turning on this + capability also allows you to choose how many events to + keep. Kgdb always keeps the lastest events. + +choice + depends on KGDB_TS + prompt "Max number of time stamps to save?" + default KGDB_TS_128 + +config KGDB_TS_64 + bool "64" + +config KGDB_TS_128 + bool "128" + +config KGDB_TS_256 + bool "256" + +config KGDB_TS_512 + bool "512" + +config KGDB_TS_1024 + bool "1024" + +endchoice + +config STACK_OVERFLOW_TEST + bool "Turn on kernel stack overflow testing?" + depends on KGDB + default n + help + This option enables code in the front line interrupt handlers + to check for kernel stack overflow on interrupts and system + calls. This is part of the kgdb code on x86 systems. + +config KGDB_CONSOLE + bool "Enable serial console thru kgdb port" + depends on KGDB + default n + help + This option enables the command line "console=kgdb" option. + When the system is booted with this option in the command line + all kernel printk output is sent to gdb (as well as to other + consoles). For this to work gdb must be connected. For this + reason, this command line option will generate a breakpoint if + gdb has not yet connected. After the gdb continue command is + given all pent up console output will be printed by gdb on the + host machine. Neither this option, nor KGDB require the + serial driver to be configured. + +config KGDB_SYSRQ + bool "Turn on SysRq 'G' command to do a break?" + depends on KGDB + default y + help + This option includes an option in the SysRq code that allows + you to enter SysRq G which generates a breakpoint to the KGDB + stub. This will work if the keyboard is alive and can + interrupt the system. Because of constraints on when the + serial port interrupt can be enabled, this code may allow you + to interrupt the system before the serial port control C is + available. Just say yes here. + --- linux-2.6.5-rc3/arch/x86_64/kernel/aperture.c 2004-03-10 20:41:26.000000000 -0800 +++ 25/arch/x86_64/kernel/aperture.c 2004-03-30 23:39:44.000000000 -0800 @@ -25,6 +25,8 @@ #include int iommu_aperture; +int iommu_aperture_disabled __initdata = 0; +int iommu_aperture_allowed __initdata = 0; int fallback_aper_order __initdata = 1; /* 64MB */ int fallback_aper_force __initdata = 0; @@ -200,6 +202,9 @@ void __init iommu_hole_init(void) u64 aper_base; int valid_agp = 0; + if (iommu_aperture_disabled) + return; + printk("Checking aperture...\n"); fix = 0; --- linux-2.6.5-rc3/arch/x86_64/kernel/e820.c 2004-01-09 00:04:31.000000000 -0800 +++ 25/arch/x86_64/kernel/e820.c 2004-03-31 00:56:18.874803536 -0800 @@ -505,7 +505,7 @@ void __init setup_memory_region(void) e820_print_map(who); } -void __init parse_memopt(char *p, char **from) +static int __init parse_memopt(char *s) { /* * mem=XXX[kKmM] limits kernel memory to XXX+1MB @@ -520,8 +520,10 @@ void __init parse_memopt(char *p, char * * limit, you cannot force usage of memory not in e820. * * -AK - */ - end_user_pfn = memparse(p, from) + HIGH_MEMORY; + */ + end_user_pfn = memparse(s, &s) + HIGH_MEMORY; end_user_pfn >>= PAGE_SHIFT; + return 0; } +__early_param("mem=",parse_memopt); --- linux-2.6.5-rc3/arch/x86_64/kernel/early_printk.c 2004-03-10 20:41:26.000000000 -0800 +++ 25/arch/x86_64/kernel/early_printk.c 2004-03-31 00:55:05.718924920 -0800 @@ -219,4 +219,4 @@ void __init disable_early_printk(void) } } -__setup("earlyprintk=", setup_early_printk); +__early_param("earlyprintk=", setup_early_printk); --- linux-2.6.5-rc3/arch/x86_64/kernel/entry.S 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/kernel/entry.S 2004-03-31 00:54:30.689250240 -0800 @@ -131,8 +131,8 @@ ENTRY(ret_from_fork) CFI_DEFAULT_STACK call schedule_tail GET_THREAD_INFO(%rcx) - bt $TIF_SYSCALL_TRACE,threadinfo_flags(%rcx) - jc rff_trace + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) + jnz rff_trace rff_action: RESTORE_REST testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? @@ -143,7 +143,7 @@ rff_action: jmp ret_from_sys_call rff_trace: movq %rsp,%rdi - call syscall_trace + call syscall_trace_leave GET_THREAD_INFO(%rcx) jmp rff_action CFI_ENDPROC @@ -185,8 +185,8 @@ ENTRY(system_call) movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) GET_THREAD_INFO(%rcx) - bt $TIF_SYSCALL_TRACE,threadinfo_flags(%rcx) - jc tracesys + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) + jnz tracesys cmpq $__NR_syscall_max,%rax ja badsys movq %r10,%rcx @@ -244,7 +244,7 @@ tracesys: movq $-ENOSYS,RAX(%rsp) FIXUP_TOP_OF_STACK %rdi movq %rsp,%rdi - call syscall_trace + call syscall_trace_enter LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST cmpq $__NR_syscall_max,%rax @@ -254,7 +254,7 @@ tracesys: movq %rax,RAX-ARGOFFSET(%rsp) 1: SAVE_REST movq %rsp,%rdi - call syscall_trace + call syscall_trace_leave RESTORE_TOP_OF_STACK %rbx RESTORE_REST jmp ret_from_sys_call @@ -297,13 +297,14 @@ int_very_careful: sti SAVE_REST /* Check for syscall exit trace */ - bt $TIF_SYSCALL_TRACE,%edx - jnc int_signal + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),%edx + jz int_signal pushq %rdi leaq 8(%rsp),%rdi # &ptregs -> arg1 - call syscall_trace + call syscall_trace_leave popq %rdi btr $TIF_SYSCALL_TRACE,%edi + btr $TIF_SYSCALL_AUDIT,%edi jmp int_restore_rest int_signal: --- linux-2.6.5-rc3/arch/x86_64/kernel/head64.c 2004-03-10 20:41:27.000000000 -0800 +++ 25/arch/x86_64/kernel/head64.c 2004-03-31 00:56:18.874803536 -0800 @@ -34,12 +34,11 @@ extern char x86_boot_params[2048]; #define OLD_CL_BASE_ADDR 0x90000 #define OLD_CL_OFFSET 0x90022 -extern char saved_command_line[]; +char *early_command_line __initdata; static void __init copy_bootdata(char *real_mode_data) { int new_data; - char * command_line; memcpy(x86_boot_params, real_mode_data, 2048); new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); @@ -51,9 +50,8 @@ static void __init copy_bootdata(char *r new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; printk("old bootloader convention, maybe loadlin?\n"); } - command_line = (char *) ((u64)(new_data)); - memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE); - printk("Bootdata ok (command line is %s)\n", saved_command_line); + early_command_line = (char *) ((u64)(new_data)); + printk("Bootdata ok (command line is %s)\n", early_command_line); } static void __init setup_boot_cpu_data(void) @@ -75,26 +73,9 @@ static void __init setup_boot_cpu_data(v void __init x86_64_start_kernel(char * real_mode_data) { - char *s; - clear_bss(); pda_init(0); copy_bootdata(real_mode_data); - /* default console: */ - if (!strstr(saved_command_line, "console=")) - strcat(saved_command_line, " console=tty0"); - s = strstr(saved_command_line, "earlyprintk="); - if (s != NULL) - setup_early_printk(s); -#ifdef CONFIG_DISCONTIGMEM - s = strstr(saved_command_line, "numa="); - if (s != NULL) - numa_setup(s+5); -#endif -#ifdef CONFIG_X86_IO_APIC - if (strstr(saved_command_line, "disableapic")) - disable_apic = 1; -#endif setup_boot_cpu_data(); start_kernel(); } --- linux-2.6.5-rc3/arch/x86_64/kernel/io_apic.c 2004-03-10 20:41:27.000000000 -0800 +++ 25/arch/x86_64/kernel/io_apic.c 2004-03-30 23:39:44.000000000 -0800 @@ -34,6 +34,7 @@ #include #include #include +#include int sis_apic_bug; /* not actually supported, dummy for compile */ @@ -211,7 +212,6 @@ static int __init enable_ioapic_setup(ch __setup("noapic", disable_ioapic_setup); __setup("apic", enable_ioapic_setup); -#ifndef CONFIG_SMP #include #include #include @@ -220,7 +220,11 @@ __setup("apic", enable_ioapic_setup); off. Check for an Nvidia or VIA PCI bridge and turn it off. Use pci direct infrastructure because this runs before the PCI subsystem. - Can be overwritten with "apic" */ + Can be overwritten with "apic" + + And another hack to disable the IOMMU on VIA chipsets. + + Kludge-O-Rama. */ void __init check_ioapic(void) { int num,slot,func; @@ -245,12 +249,21 @@ void __init check_ioapic(void) PCI_VENDOR_ID); vendor &= 0xffff; switch (vendor) { - case PCI_VENDOR_ID_NVIDIA: case PCI_VENDOR_ID_VIA: + if (end_pfn >= (0xffffffff>>PAGE_SHIFT) && + !iommu_aperture_allowed) { + printk(KERN_INFO + "Looks like a VIA chipset. Disabling IOMMU. Overwrite with \"iommu=allowed\"\n"); + iommu_aperture_disabled = 1; + } + /* FALL THROUGH */ + case PCI_VENDOR_ID_NVIDIA: +#ifndef CONFIG_SMP printk(KERN_INFO "PCI bridge %02x:%02x from %x found. Setting \"noapic\". Overwrite with \"apic\"\n", num,slot,vendor); skip_ioapic_setup = 1; +#endif return; } @@ -263,7 +276,6 @@ void __init check_ioapic(void) } } } -#endif static int __init ioapic_pirq_setup(char *str) { @@ -1785,7 +1797,7 @@ int __init io_apic_get_unique_id (int io * advantage of new APIC bus architecture. */ - if (!physids_empty(apic_id_map)) + if (physids_empty(apic_id_map)) apic_id_map = phys_cpu_present_map; spin_lock_irqsave(&ioapic_lock, flags); --- linux-2.6.5-rc3/arch/x86_64/kernel/irq.c 2004-03-10 20:41:27.000000000 -0800 +++ 25/arch/x86_64/kernel/irq.c 2004-03-30 23:39:44.000000000 -0800 @@ -405,6 +405,9 @@ out: spin_unlock(&desc->lock); irq_exit(); + + kgdb_process_breakpoint(); + return 1; } --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/arch/x86_64/kernel/kgdb_stub.c 2004-03-30 23:39:44.000000000 -0800 @@ -0,0 +1,2595 @@ +/* + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +/* + * Copyright (c) 2000 VERITAS Software Corporation. + * + */ +/**************************************************************************** + * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ + * + * Module name: remcom.c $ + * Revision: 1.34 $ + * Date: 91/03/09 12:29:49 $ + * Contributor: Lake Stevens Instrument Division$ + * + * Description: low level support for gdb debugger. $ + * + * Considerations: only works on target hardware $ + * + * Written by: Glenn Engel $ + * Updated by: David Grothe + * Updated by: Robert Walsh + * Updated by: wangdi + * ModuleState: Experimental $ + * + * NOTES: See Below $ + * + * Modified for 386 by Jim Kingdon, Cygnus Support. + * Compatibility with 2.1.xx kernel by David Grothe + * + * Changes to allow auto initilization. All that is needed is that it + * be linked with the kernel and a break point (int 3) be executed. + * The header file defines BREAKPOINT to allow one to do + * this. It should also be possible, once the interrupt system is up, to + * call putDebugChar("+"). Once this is done, the remote debugger should + * get our attention by sending a ^C in a packet. George Anzinger + * + * Integrated into 2.2.5 kernel by Tigran Aivazian + * Added thread support, support for multiple processors, + * support for ia-32(x86) hardware debugging. + * Amit S. Kale ( akale@veritas.com ) + * + * Modified to support debugging over ethernet by Robert Walsh + * and wangdi , based on + * code by San Mehat. + * + * X86_64 changes from Andi Kleen's patch merged by Jim Houston + * (jim.houston@ccur.com). If it works thank Andi if its broken + * blame me. + * + * To enable debugger support, two things need to happen. One, a + * call to set_debug_traps() is necessary in order to allow any breakpoints + * or error conditions to be properly intercepted and reported to gdb. + * Two, a breakpoint needs to be generated to begin communication. This + * is most easily accomplished by a call to breakpoint(). Breakpoint() + * simulates a breakpoint by executing an int 3. + * + ************* + * + * The following gdb commands are supported: + * + * command function Return value + * + * g return the value of the CPU registers hex data or ENN + * G set the value of the CPU registers OK or ENN + * + * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN + * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN + * + * c Resume at current address SNN ( signal NN) + * cAA..AA Continue at address AA..AA SNN + * + * s Step one instruction SNN + * sAA..AA Step one instruction from AA..AA SNN + * + * k kill + * + * ? What was the last sigval ? SNN (signal NN) + * + * All commands and responses are sent with a packet which includes a + * checksum. A packet consists of + * + * $#. + * + * where + * :: + * :: < two hex digits computed as modulo 256 sum of > + * + * When a packet is received, it is first acknowledged with either '+' or '-'. + * '+' indicates a successful transfer. '-' indicates a failed transfer. + * + * Example: + * + * Host: Reply: + * $m0,10#2a +$00010203040506070809101112131415#42 + * + ****************************************************************************/ +#define KGDB_VERSION "<20030915.1651.33>" +#include +#include +#include /* for strcpy */ +#include +#include +#include +#include /* for linux pt_regs struct */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define Dearly_printk(x...) +int kgdb_enabled = 0; + +/************************************************************************ + * + * external low-level support routines + */ +typedef void (*Function) (void); /* pointer to a function */ + +/* Thread reference */ +typedef unsigned char threadref[8]; + +extern int tty_putDebugChar(int); /* write a single character */ +extern int tty_getDebugChar(void); /* read and return a single char */ +extern void tty_flushDebugChar(void); /* flush pending characters */ +extern int eth_putDebugChar(int); /* write a single character */ +extern int eth_getDebugChar(void); /* read and return a single char */ +extern void eth_flushDebugChar(void); /* flush pending characters */ + +/************************************************************************/ +/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ +/* at least NUMREGBYTES*2 are needed for register packets */ +/* Longer buffer is needed to list all threads */ +#define BUFMAX 400 + +char *kgdb_version = KGDB_VERSION; + +/* debug > 0 prints ill-formed commands in valid packets & checksum errors */ +int debug_regs = 0; /* set to non-zero to print registers */ + +/* filled in by an external module */ +char *gdb_module_offsets; + +static const char hexchars[] = "0123456789abcdef"; + +/* Number of bytes of registers. */ +#define NUMREGBYTES (NUMREGS * sizeof(unsigned long)) +/* + * Note that this register image is in a different order than + * the register image that Linux produces at interrupt time. + * + * Linux's register image is defined by struct pt_regs in ptrace.h. + * Just why GDB uses a different order is a historical mystery. + * + * Could add XMM and segment registers here. + */ +enum regnames {_RAX, + _RBX, + _RCX, + _RDX, + _RSI, + _RDI, + _RBP, + _RSP, + _R8, + _R9, + _R10, + _R11, + _R12, + _R13, + _R14, + _R15, + _PC, + _PS, + NUMREGS }; + + +/*************************** ASSEMBLY CODE MACROS *************************/ +/* + * Put the error code here just in case the user cares. + * Likewise, the vector number here (since GDB only gets the signal + * number through the usual means, and that's not very specific). + * The called_from is the return address so he can tell how we entered kgdb. + * This will allow him to seperate out the various possible entries. + */ +#define REMOTE_DEBUG 0 /* set != to turn on printing (also available in info) */ + +#define PID_MAX PID_MAX_DEFAULT + +#ifdef CONFIG_SMP +void smp_send_nmi_allbutself(void); +#define IF_SMP(x) x +#undef MAX_NO_CPUS +#ifndef CONFIG_NO_KGDB_CPUS +#define CONFIG_NO_KGDB_CPUS 2 +#endif +#if CONFIG_NO_KGDB_CPUS > NR_CPUS +#define MAX_NO_CPUS NR_CPUS +#else +#define MAX_NO_CPUS CONFIG_NO_KGDB_CPUS +#endif +#define hold_init hold_on_sstep: 1, +#define MAX_CPU_MASK (unsigned long)((1LL << MAX_NO_CPUS) - 1LL) +#define NUM_CPUS num_online_cpus() +#else +#define IF_SMP(x) +#define hold_init +#undef MAX_NO_CPUS +#define MAX_NO_CPUS 1 +#define NUM_CPUS 1 +#endif +#define NOCPU (struct task_struct *)0xbad1fbad +/* *INDENT-OFF* */ +struct kgdb_info { + int used_malloc; + void *called_from; + long long entry_tsc; + int errcode; + int vector; + int print_debug_info; +#ifdef CONFIG_SMP + int hold_on_sstep; + struct { + volatile struct task_struct *task; + int pid; + int hold; + struct pt_regs *regs; + } cpus_waiting[MAX_NO_CPUS]; +#endif +} kgdb_info = {hold_init print_debug_info:REMOTE_DEBUG, vector:-1}; + +/* *INDENT-ON* */ + +#define used_m kgdb_info.used_malloc +/* + * This is little area we set aside to contain the stack we + * need to build to allow gdb to call functions. We use one + * per cpu to avoid locking issues. We will do all this work + * with interrupts off so that should take care of the protection + * issues. + */ +#define LOOKASIDE_SIZE 200 /* should be more than enough */ +#define MALLOC_MAX 200 /* Max malloc size */ +struct { + unsigned long rsp; + unsigned long array[LOOKASIDE_SIZE]; +} fn_call_lookaside[MAX_NO_CPUS]; + +static int trap_cpu; +static unsigned long OLD_esp; + +#define END_OF_LOOKASIDE &fn_call_lookaside[trap_cpu].array[LOOKASIDE_SIZE] +#define IF_BIT 0x200 +#define TF_BIT 0x100 + +#define MALLOC_ROUND 8-1 + +static char malloc_array[MALLOC_MAX]; +IF_SMP(static void to_gdb(const char *mess)); +void * +malloc(int size) +{ + + if (size <= (MALLOC_MAX - used_m)) { + int old_used = used_m; + used_m += ((size + MALLOC_ROUND) & (~MALLOC_ROUND)); + return &malloc_array[old_used]; + } else { + return NULL; + } +} + +/* + * I/O dispatch functions... + * Based upon kgdboe, either call the ethernet + * handler or the serial one.. + */ +void +putDebugChar(int c) +{ + if (!kgdboe) { + tty_putDebugChar(c); + } else { + eth_putDebugChar(c); + } +} + +int +getDebugChar(void) +{ + if (!kgdboe) { + return tty_getDebugChar(); + } else { + return eth_getDebugChar(); + } +} + +void +flushDebugChar(void) +{ + if (!kgdboe) { + tty_flushDebugChar(); + } else { + eth_flushDebugChar(); + } +} + +/* + * Gdb calls functions by pushing agruments, including a return address + * on the stack and the adjusting EIP to point to the function. The + * whole assumption in GDB is that we are on a different stack than the + * one the "user" i.e. code that hit the break point, is on. This, of + * course is not true in the kernel. Thus various dodges are needed to + * do the call without directly messing with EIP (which we can not change + * as it is just a location and not a register. To adjust it would then + * require that we move every thing below EIP up or down as needed. This + * will not work as we may well have stack relative pointer on the stack + * (such as the pointer to regs, for example). + + * So here is what we do: + * We detect gdb attempting to store into the stack area and instead, store + * into the fn_call_lookaside.array at the same relative location as if it + * were the area ESP pointed at. We also trap ESP modifications + * and uses these to adjust fn_call_lookaside.esp. On entry + * fn_call_lookaside.esp will be set to point at the last entry in + * fn_call_lookaside.array. This allows us to check if it has changed, and + * if so, on exit, we add the registers we will use to do the move and a + * trap/ interrupt return exit sequence. We then adjust the eflags in the + * regs array (remember we now have a copy in the fn_call_lookaside.array) to + * kill the interrupt bit, AND we change EIP to point at our set up stub. + * As part of the register set up we preset the registers to point at the + * begining and end of the fn_call_lookaside.array, so all the stub needs to + * do is move words from the array to the stack until ESP= the desired value + * then do the rti. This will then transfer to the desired function with + * all the correct registers. Nifty huh? + */ +extern asmlinkage void fn_call_stub(void); +extern asmlinkage void fn_rtn_stub(void); +/* *INDENT-OFF* */ +__asm__("fn_rtn_stub:\n\t" + "movq %rax,%rsp\n\t" + "fn_call_stub:\n\t" + "1:\n\t" + "addq $-8,%rbx\n\t" + "movq (%rbx), %rax\n\t" + "pushq %rax\n\t" + "cmpq %rsp,%rcx\n\t" + "jne 1b\n\t" + "popq %rax\n\t" + "popq %rbx\n\t" + "popq %rcx\n\t" + "iret \n\t"); +/* *INDENT-ON* */ +#define gdb_i386vector kgdb_info.vector +#define gdb_i386errcode kgdb_info.errcode +#define waiting_cpus kgdb_info.cpus_waiting +#define remote_debug kgdb_info.print_debug_info +#define hold_cpu(cpu) kgdb_info.cpus_waiting[cpu].hold +/* gdb locks */ + +#ifdef CONFIG_SMP +static int in_kgdb_called; +static spinlock_t waitlocks[MAX_NO_CPUS] = + {[0 ... MAX_NO_CPUS - 1] = SPIN_LOCK_UNLOCKED }; +/* + * The following array has the thread pointer of each of the "other" + * cpus. We make it global so it can be seen by gdb. + */ +volatile int in_kgdb_entry_log[MAX_NO_CPUS]; +volatile struct pt_regs *in_kgdb_here_log[MAX_NO_CPUS]; +/* +static spinlock_t continuelocks[MAX_NO_CPUS]; +*/ +spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +/* waiters on our spinlock plus us */ +static atomic_t spinlock_waiters = ATOMIC_INIT(1); +static int spinlock_count = 0; +static int spinlock_cpu = 0; +/* + * Note we use nested spin locks to account for the case where a break + * point is encountered when calling a function by user direction from + * kgdb. Also there is the memory exception recursion to account for. + * Well, yes, but this lets other cpus thru too. Lets add a + * cpu id to the lock. + */ +#define KGDB_SPIN_LOCK(x) if( spinlock_count == 0 || \ + spinlock_cpu != smp_processor_id()){\ + atomic_inc(&spinlock_waiters); \ + while (! spin_trylock(x)) {\ + in_kgdb(®s);\ + }\ + atomic_dec(&spinlock_waiters); \ + spinlock_count = 1; \ + spinlock_cpu = smp_processor_id(); \ + }else{ \ + spinlock_count++; \ + } +#define KGDB_SPIN_UNLOCK(x) if( --spinlock_count == 0) spin_unlock(x) +#else +unsigned kgdb_spinlock = 0; +#define KGDB_SPIN_LOCK(x) --*x +#define KGDB_SPIN_UNLOCK(x) ++*x +#endif + +int +hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return (ch - 'a' + 10); + if ((ch >= '0') && (ch <= '9')) + return (ch - '0'); + if ((ch >= 'A') && (ch <= 'F')) + return (ch - 'A' + 10); + return (-1); +} + +/* scan for the sequence $# */ +void +getpacket(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int i; + int count; + char ch; + + do { + /* wait around for the start character, ignore all other characters */ + while ((ch = (getDebugChar() & 0x7f)) != '$') ; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* now, read until a # or end of buffer is found */ + while (count < BUFMAX) { + ch = getDebugChar() & 0x7f; + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(getDebugChar() & 0x7f) << 4; + xmitcsum += hex(getDebugChar() & 0x7f); + if ((remote_debug) && (checksum != xmitcsum)) { + printk + ("bad checksum. My count = 0x%x, sent=0x%x. buf=%s\n", + checksum, xmitcsum, buffer); + } + + if (checksum != xmitcsum) + putDebugChar('-'); /* failed checksum */ + else { + putDebugChar('+'); /* successful transfer */ + /* if a sequence char is present, reply the sequence ID */ + if (buffer[2] == ':') { + putDebugChar(buffer[0]); + putDebugChar(buffer[1]); + /* remove sequence chars from buffer */ + count = strlen(buffer); + for (i = 3; i <= count; i++) + buffer[i - 3] = buffer[i]; + } + } + } + } while (checksum != xmitcsum); + + if (remote_debug) + printk("R:%s\n", buffer); + flushDebugChar(); +} + +/* send the packet in buffer. */ + +void +putpacket(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* $#. */ + + if (!kgdboe) { + do { + if (remote_debug) + printk("T:%s\n", buffer); + putDebugChar('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + putDebugChar(ch); + checksum += ch; + count += 1; + } + + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + flushDebugChar(); + + } while ((getDebugChar() & 0x7f) != '+'); + } else { + /* + * For udp, we can not transfer too much bytes once. + * We only transfer MAX_SEND_COUNT size bytes each time + */ + +#define MAX_SEND_COUNT 30 + + int send_count = 0, i = 0; + char send_buf[MAX_SEND_COUNT]; + + do { + if (remote_debug) + printk("T:%s\n", buffer); + putDebugChar('$'); + checksum = 0; + count = 0; + send_count = 0; + while ((ch = buffer[count])) { + if (send_count >= MAX_SEND_COUNT) { + for(i = 0; i < MAX_SEND_COUNT; i++) { + putDebugChar(send_buf[i]); + } + flushDebugChar(); + send_count = 0; + } else { + send_buf[send_count] = ch; + checksum += ch; + count ++; + send_count++; + } + } + for(i = 0; i < send_count; i++) + putDebugChar(send_buf[i]); + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + flushDebugChar(); + } while ((getDebugChar() & 0x7f) != '+'); + } +} + +static char remcomInBuffer[BUFMAX]; +static char remcomOutBuffer[BUFMAX]; +static char lbuf[BUFMAX]; +static short error; + +void +debug_error(char *format, char *parm) +{ + if (remote_debug) + printk(format, parm); +} + +static void +print_regs(struct pt_regs *regs) +{ + printk("RAX=%016lx RBX=%016lx RCX=%016lx\n", + regs->rax, regs->rbx, regs->rcx); + printk("RDX=%016lx RSI=%016lx RDI=%016lx\n", + regs->rdx, regs->rsi, regs->rdi); + printk("RBP=%016lx PS=%016lx PC=%016lx\n", + regs->rbp, regs->eflags, regs->rip); + printk("R8=%016lx R9=%016lx R10=%016lx\n", + regs->r8, regs->r9, regs->r10); + printk("R11=%016lx R12=%016lx R13=%016lx\n", + regs->r11, regs->r12, regs->r13); + printk("R14=%016lx R15=%016lx RSP=%016lx\n", + regs->r14, regs->r15, regs->rsp); +} + +#define NEW_esp fn_call_lookaside[trap_cpu].rsp + +static void +regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + gdb_regs[_RAX] = regs->rax; + gdb_regs[_RBX] = regs->rbx; + gdb_regs[_RCX] = regs->rcx; + gdb_regs[_RDX] = regs->rdx; + gdb_regs[_RSI] = regs->rsi; + gdb_regs[_RDI] = regs->rdi; + gdb_regs[_RBP] = regs->rbp; + gdb_regs[ _PS] = regs->eflags; + gdb_regs[ _PC] = regs->rip; + gdb_regs[ _R8] = regs->r8; + gdb_regs[ _R9] = regs->r9; + gdb_regs[_R10] = regs->r10; + gdb_regs[_R11] = regs->r11; + gdb_regs[_R12] = regs->r12; + gdb_regs[_R13] = regs->r13; + gdb_regs[_R14] = regs->r14; + gdb_regs[_R15] = regs->r15; + gdb_regs[_RSP] = regs->rsp; + + /* Note, as we are a debugging the kernel, we will always + * trap in kernel code, this means no priviledge change, + * and so the pt_regs structure is not completely valid. In a non + * privilege change trap, only EFLAGS, CS and EIP are put on the stack, + * SS and ESP are not stacked, this means that the last 2 elements of + * pt_regs is not valid (they would normally refer to the user stack) + * also, using regs+1 is no good because you end up will a value that is + * 2 longs (8) too high. This used to cause stepping over functions + * to fail, so my fix is to use the address of regs->esp, which + * should point at the end of the stack frame. Note I have ignored + * completely exceptions that cause an error code to be stacked, such + * as double fault. Stuart Hughes, Zentropix. + * original code: gdb_regs[_ESP] = (int) (regs + 1) ; + + * this is now done on entry and moved to OLD_esp (as well as NEW_esp). + */ +} + +static void +gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + regs->rax = gdb_regs[_RAX] ; + regs->rbx = gdb_regs[_RBX] ; + regs->rcx = gdb_regs[_RCX] ; + regs->rdx = gdb_regs[_RDX] ; + regs->rsi = gdb_regs[_RSI] ; + regs->rdi = gdb_regs[_RDI] ; + regs->rbp = gdb_regs[_RBP] ; + regs->eflags = gdb_regs[ _PS] ; + regs->rip = gdb_regs[ _PC] ; + regs->r8 = gdb_regs[ _R8] ; + regs->r9 = gdb_regs[ _R9] ; + regs->r10 = gdb_regs[ _R10] ; + regs->r11 = gdb_regs[ _R11] ; + regs->r12 = gdb_regs[ _R12] ; + regs->r13 = gdb_regs[ _R13] ; + regs->r14 = gdb_regs[ _R14] ; + regs->r15 = gdb_regs[ _R15] ; + #if 0 /* can't change these */ + regs->rsp = gdb_regs[_RSP] ; + regs->ss = gdb_regs[ _SS] ; + regs->fs = gdb_regs[_FS]; + regs->gs = gdb_regs[_GS]; +#endif +} /* gdb_regs_to_regs */ + +extern void scheduling_functions_start_here(void); +extern void scheduling_functions_end_here(void); +#define first_sched ((unsigned long) scheduling_functions_start_here) +#define last_sched ((unsigned long) scheduling_functions_end_here) + +int thread_list = 0; +extern void thread_return(void); + +void +get_gdb_regs(struct task_struct *p, struct pt_regs *regs, unsigned long *gdb_regs) +{ + unsigned long **rbp, *rsp, *rsp0, pc; + int count = 0; + IF_SMP(int i); + if (!p || p == current) { + regs_to_gdb_regs(gdb_regs, regs); + return; + } +#ifdef CONFIG_SMP + for (i = 0; i < MAX_NO_CPUS; i++) { + if (p == kgdb_info.cpus_waiting[i].task) { + regs_to_gdb_regs(gdb_regs, + kgdb_info.cpus_waiting[i].regs); + gdb_regs[_RSP] = + (unsigned long)&kgdb_info.cpus_waiting[i].regs->rsp; + + return; + } + } +#endif + memset(gdb_regs, 0, NUMREGBYTES); + rsp = (unsigned long *)p->thread.rsp; + rbp = (unsigned long **)rsp[0]; + rsp += 2; + gdb_regs[_PC] = (unsigned long)thread_return; + gdb_regs[_RBP] = (unsigned long)rbp; + gdb_regs[_RSP] = (unsigned long)rsp; + +/* + * This code is to give a more informative notion of where a process + * is waiting. It is used only when the user asks for a thread info + * list. If he then switches to the thread, s/he will find the task + * is in schedule, but a back trace should show the same info we come + * up with. This code was shamelessly purloined from process.c. It was + * then enhanced to provide more registers than simply the program + * counter. + */ + + if (!thread_list) { + return; + } + + if (p->state == TASK_RUNNING) + return; + rsp0 = (unsigned long *)p->thread.rsp0; + if (rsp < (unsigned long *) p->thread_info || rsp > rsp0) + return; + /* include/asm-i386/system.h:switch_to() pushes ebp last. */ + do { + if (*rbp < rsp || *rbp > rsp0) + break; + rbp = (unsigned long **)*rbp; + rsp = (unsigned long *)rbp; + pc = rsp[1]; + + if (pc < first_sched || pc >= last_sched) + break; + gdb_regs[_PC] = (unsigned long)pc; + gdb_regs[_RSP] = (unsigned long)rsp; + gdb_regs[_RBP] = (unsigned long)rbp; + } while (count++ < 16); + return; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* returns nonzero if any memory access fails. */ +int mem2hex( char* mem, char* buf, int count) +{ + int i; + unsigned char ch; + int ret = 0; + + for (i=0;i> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (ret) { + Dearly_printk("mem2hex: fault at accessing %p\n", mem); + } + return(ret); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return nonzero if any memory access fails. */ +int hex2mem( char* buf, char* mem, int count) +{ + int i; + unsigned char ch; + int ret = 0; + + for (i=0;i (OLD_esp - (unsigned int) LOOKASIDE_SIZE))) { + addr = (char *) END_OF_LOOKASIDE - ((char *) OLD_esp - addr); + } + *addr = val; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* return a pointer to the last char put in buf (null) */ +/* If MAY_FAULT is non-zero, then we should set mem_err in response to + a fault; if zero treat a fault like any other fault in the stub. */ +char * +mem2hex(char *mem, char *buf, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + /* printk("%lx = ", mem) ; */ + + ch = get_char(mem++); + + /* printk("%02x\n", ch & 0xFF) ; */ + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault fetching from addr %lx\n", + (long) (mem - 1)); + *buf = 0; /* truncate buffer */ + return (buf); + } + *buf++ = hexchars[ch >> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (may_fault) + mem_err_expected = 0; + return (buf); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return a pointer to the character AFTER the last byte written */ +/* NOTE: We use the may fault flag to also indicate if the write is to + * the registers (0) or "other" memory (!=0) + */ +char * +hex2mem(char *buf, char *mem, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + ch = hex(*buf++) << 4; + ch = ch + hex(*buf++); + set_char(mem++, ch, may_fault); + + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault storing to addr %lx\n", + (long) (mem - 1)); + return (mem); + } + } + if (may_fault) + mem_err_expected = 0; + return (mem); +} +#endif + +/**********************************************/ +/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ +/* RETURN NUMBER OF CHARS PROCESSED */ +/**********************************************/ +int +hexToLong(char **ptr, unsigned long *value) +{ + int numChars = 0; + int hexValue; + + *value = 0; + + while (**ptr) { + hexValue = hex(**ptr); + if (hexValue >= 0) { + *value = (*value << 4) | hexValue; + numChars++; + } else + break; + + (*ptr)++; + } + + return (numChars); +} + +#define stubhex(h) hex(h) +#ifdef old_thread_list + +static int +stub_unpack_int(char *buff, int fieldlength) +{ + int nibble; + int retval = 0; + + while (fieldlength) { + nibble = stubhex(*buff++); + retval |= nibble; + fieldlength--; + if (fieldlength) + retval = retval << 4; + } + return retval; +} +#endif +static char * +pack_hex_byte(char *pkt, int byte) +{ + *pkt++ = hexchars[(byte >> 4) & 0xf]; + *pkt++ = hexchars[(byte & 0xf)]; + return pkt; +} + +#define BUF_THREAD_ID_SIZE 16 + +static char * +pack_threadid(char *pkt, threadref * id) +{ + char *limit; + unsigned char *altid; + + altid = (unsigned char *) id; + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *altid++); + return pkt; +} + +#ifdef old_thread_list +static char * +unpack_byte(char *buf, int *value) +{ + *value = stub_unpack_int(buf, 2); + return buf + 2; +} + +static char * +unpack_threadid(char *inbuf, threadref * id) +{ + char *altref; + char *limit = inbuf + BUF_THREAD_ID_SIZE; + int x, y; + + altref = (char *) id; + + while (inbuf < limit) { + x = stubhex(*inbuf++); + y = stubhex(*inbuf++); + *altref++ = (x << 4) | y; + } + return inbuf; +} +#endif +void +int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; + } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} +int +int_to_hex_v(unsigned char * id, int value) +{ + unsigned char *start = id; + int shift; + int ch; + + for (shift = 28; shift >= 0; shift -= 4) { + if ((ch = (value >> shift) & 0xf) || (id != start)) { + *id = hexchars[ch]; + id++; + } + } + if (id == start) + *id++ = '0'; + return id - start; +} +#ifdef old_thread_list + +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + unsigned char *scan; + + scan = (char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} +#endif +static int +cmp_str(char *s1, char *s2, int count) +{ + while (count--) { + if (*s1++ != *s2++) + return 0; + } + return 1; +} + +#if 1 /* this is a hold over from 2.4 where O(1) was "sometimes" */ +extern struct task_struct *kgdb_get_idle(int cpu); +#define idle_task(cpu) kgdb_get_idle(cpu) +#else +#define idle_task(cpu) init_tasks[cpu] +#endif + +extern int kgdb_pid_init_done; + +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + if (pid >= PID_MAX && pid <= (PID_MAX + MAX_NO_CPUS)) { + if (!cpu_online(pid - PID_MAX)) + return NULL; + + return idle_task(pid - PID_MAX); + } else { + /* + * find_task_by_pid is relatively safe all the time + * Other pid functions require lock downs which imply + * that we may be interrupting them (as we get here + * in the middle of most any lock down). + * Still we don't want to call until the table exists! + */ + if (kgdb_pid_init_done){ + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } + } + } + return NULL; +} +/* *INDENT-OFF* */ +struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned long addr; +} breakinfo[4] = { {enabled:0}, + {enabled:0}, + {enabled:0}, + {enabled:0}}; +/* *INDENT-ON* */ +unsigned long hw_breakpoint_status; +void +correct_hw_break(void) +{ + int breakno; + int correctit; + int breakbit; + unsigned long dr7; + + asm volatile ("movq %%db7, %0\n":"=r" (dr7) + :); + /* *INDENT-OFF* */ + do { + unsigned long addr0, addr1, addr2, addr3; + asm volatile ("movq %%db0, %0\n" + "movq %%db1, %1\n" + "movq %%db2, %2\n" + "movq %%db3, %3\n" + :"=r" (addr0), "=r"(addr1), + "=r"(addr2), "=r"(addr3) + :); + } while (0); + /* *INDENT-ON* */ + correctit = 0; + for (breakno = 0; breakno < 3; breakno++) { + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + dr7 |= (((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << 16) << + (breakno << 2); + switch (breakno) { + case 0: + asm volatile ("movq %0, %%dr0\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 1: + asm volatile ("movq %0, %%dr1\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 2: + asm volatile ("movq %0, %%dr2\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 3: + asm volatile ("movq %0, %%dr3\n"::"r" + (breakinfo[breakno].addr)); + break; + } + } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + } + } + if (correctit) { + asm volatile ("movq %0, %%db7\n"::"r" (dr7)); + } +} + +int +remove_hw_break(unsigned breakno) +{ + if (!breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 0; + return 0; +} + +int +set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr) +{ + if (breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 1; + breakinfo[breakno].type = type; + breakinfo[breakno].len = len; + breakinfo[breakno].addr = addr; + return 0; +} + +#ifdef CONFIG_SMP +static int in_kgdb_console = 0; + +int +in_kgdb(struct pt_regs *regs) +{ + unsigned long flags; + int cpu; + if (!kgdb_enabled) + return 0; + cpu = smp_processor_id(); + in_kgdb_called = 1; + if (!spin_is_locked(&kgdb_spinlock)) { + if (in_kgdb_here_log[cpu] || /* we are holding this cpu */ + in_kgdb_console) { /* or we are doing slow i/o */ + return 1; + } + return 0; + } + + /* As I see it the only reason not to let all cpus spin on + * the same spin_lock is to allow selected ones to proceed. + * This would be a good thing, so we leave it this way. + * Maybe someday.... Done ! + + * in_kgdb() is called from an NMI so we don't pretend + * to have any resources, like printk() for example. + */ + + local_irq_save(flags); /* only local here, to avoid hanging */ + /* + * log arival of this cpu + * The NMI keeps on ticking. Protect against recurring more + * than once, and ignor the cpu that has the kgdb lock + */ + in_kgdb_entry_log[cpu]++; + in_kgdb_here_log[cpu] = regs; + if (cpu == spinlock_cpu || waiting_cpus[cpu].task) + goto exit_in_kgdb; + + /* + * For protection of the initilization of the spin locks by kgdb + * it locks the kgdb spinlock before it gets the wait locks set + * up. We wait here for the wait lock to be taken. If the + * kgdb lock goes away first?? Well, it could be a slow exit + * sequence where the wait lock is removed prior to the kgdb lock + * so if kgdb gets unlocked, we just exit. + */ + + while (spin_is_locked(&kgdb_spinlock) && + !spin_is_locked(waitlocks + cpu)) ; + if (!spin_is_locked(&kgdb_spinlock)) + goto exit_in_kgdb; + + waiting_cpus[cpu].task = current; + waiting_cpus[cpu].pid = (current->pid) ? : (PID_MAX + cpu); + waiting_cpus[cpu].regs = regs; + + spin_unlock_wait(waitlocks + cpu); + + /* + * log departure of this cpu + */ + waiting_cpus[cpu].task = 0; + waiting_cpus[cpu].pid = 0; + waiting_cpus[cpu].regs = 0; + correct_hw_break(); + exit_in_kgdb: + in_kgdb_here_log[cpu] = 0; + local_irq_restore(flags); + return 1; + /* + spin_unlock(continuelocks + smp_processor_id()); + */ +} + +void +smp__in_kgdb(struct pt_regs regs) +{ + ack_APIC_irq(); + in_kgdb(®s); +} +#else +int +in_kgdb(struct pt_regs *regs) +{ + return (kgdb_spinlock); +} +#endif + +void +printexceptioninfo(int exceptionNo, int errorcode, char *buffer) +{ + unsigned long dr6; + int i; + switch (exceptionNo) { + case 1: /* debug exception */ + break; + case 3: /* breakpoint */ + sprintf(buffer, "Software breakpoint"); + return; + default: + sprintf(buffer, "Details not available"); + return; + } + asm volatile ("movq %%db6, %0\n":"=r" (dr6) + :); + if (dr6 & 0x4000) { + sprintf(buffer, "Single step"); + return; + } + for (i = 0; i < 4; ++i) { + if (dr6 & (1 << i)) { + sprintf(buffer, "Hardware breakpoint %d", i); + return; + } + } + sprintf(buffer, "Unknown trap"); + return; +} + +/* + * The ThreadExtraInfo query allows us to pass an arbitrary string + * for display with the "info threads" command. + */ + +void +print_extra_info(task_t *p, char *buf) +{ + if (!p) { + sprintf(buf, "Invalid thread"); + return; + } + sprintf(buf, "0x%p %8d %4d %c %s", + (void *)p, p->parent->pid, + task_cpu(p), + (p->state == 0) ? (task_curr(p)?'R':'r') : + (p->state < 0) ? 'U' : + (p->state & TASK_UNINTERRUPTIBLE) ? 'D' : + (p->state & TASK_STOPPED || p->ptrace & PT_PTRACED) ? 'T' : + (p->state & (TASK_ZOMBIE | TASK_DEAD)) ? 'Z' : + (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?', + p->comm); +} + +/* + * This function does all command procesing for interfacing to gdb. + * + * NOTE: The INT nn instruction leaves the state of the interrupt + * enable flag UNCHANGED. That means that when this routine + * is entered via a breakpoint (INT 3) instruction from code + * that has interrupts enabled, then interrupts will STILL BE + * enabled when this routine is entered. The first thing that + * we do here is disable interrupts so as to prevent recursive + * entries and bothersome serial interrupts while we are + * trying to run the serial port in polled mode. + * + * For kernel version 2.1.xx the kgdb_cli() actually gets a spin lock so + * it is always necessary to do a restore_flags before returning + * so as to let go of that lock. + */ +int +kgdb_handle_exception(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs) +{ + struct task_struct *usethread = NULL; + struct task_struct *thread_list_start = 0, *thread = NULL; + struct task_struct *p; + unsigned long addr, length; + unsigned long breakno, breaktype; + char *ptr; + unsigned long newPC; + threadref thref; + unsigned long threadid, tmpid; + int thread_min = PID_MAX + MAX_NO_CPUS; +#ifdef old_thread_list + int maxthreads; +#endif + int nothreads; + unsigned long flags; + unsigned long gdb_regs[NUMREGS]; + unsigned long dr6; + IF_SMP(int entry_state = 0); /* 0, ok, 1, no nmi, 2 sync failed */ +#define NO_NMI 1 +#define NO_SYNC 2 +#define regs (*linux_regs) + /* + * If the entry is not from the kernel then return to the Linux + * trap handler and let it process the interrupt normally. + */ + if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->cs)) { + printk("ignoring non-kernel exception\n"); + print_regs(®s); + return (0); + } + /* + * If we're using eth mode, set the 'mode' in the netdevice. + */ + + if (kgdboe) + netpoll_set_trap(1); + + local_irq_save(flags); + + /* Get kgdb spinlock */ + + KGDB_SPIN_LOCK(&kgdb_spinlock); + rdtscll(kgdb_info.entry_tsc); + /* + * We depend on this spinlock and the NMI watch dog to control the + * other cpus. They will arrive at "in_kgdb()" as a result of the + * NMI and will wait there for the following spin locks to be + * released. + */ +#ifdef CONFIG_SMP + +#if 0 + if (cpu_callout_map & ~MAX_CPU_MASK) { + printk("kgdb : too many cpus, possibly not mapped" + " in contiguous space, change MAX_NO_CPUS" + " in kgdb_stub and make new kernel.\n" + " cpu_callout_map is %lx\n", cpu_callout_map); + goto exit_just_unlock; + } +#endif + if (spinlock_count == 1) { + int time, end_time, dum; + int i; + int cpu_logged_in[MAX_NO_CPUS] = {[0 ... MAX_NO_CPUS - 1] = (0) + }; + if (remote_debug) { + printk("kgdb : cpu %d entry, syncing others\n", + smp_processor_id()); + } + for (i = 0; i < MAX_NO_CPUS; i++) { + /* + * Use trylock as we may already hold the lock if + * we are holding the cpu. Net result is all + * locked. + */ + spin_trylock(&waitlocks[i]); + } + for (i = 0; i < MAX_NO_CPUS; i++) + cpu_logged_in[i] = 0; + /* + * Wait for their arrival. We know the watch dog is active if + * in_kgdb() has ever been called, as it is always called on a + * watchdog tick. + */ + rdtsc(dum, time); + end_time = time + 2; /* Note: we use the High order bits! */ + i = 1; + if (num_online_cpus() > 1) { + int me_in_kgdb = in_kgdb_entry_log[smp_processor_id()]; + smp_send_nmi_allbutself(); + + while (i < num_online_cpus() && time != end_time) { + int j; + for (j = 0; j < MAX_NO_CPUS; j++) { + if (waiting_cpus[j].task && + waiting_cpus[j].task != NOCPU && + !cpu_logged_in[j]) { + i++; + cpu_logged_in[j] = 1; + if (remote_debug) { + printk + ("kgdb : cpu %d arrived at kgdb\n", + j); + } + break; + } else if (!waiting_cpus[j].task && + !cpu_online(j)) { + waiting_cpus[j].task = NOCPU; + cpu_logged_in[j] = 1; + waiting_cpus[j].hold = 1; + break; + } + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + + int wait = 100000; + while (wait--) ; + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + printk + ("kgdb : cpu %d stall" + " in in_kgdb\n", + j); + i++; + cpu_logged_in[j] = 1; + waiting_cpus[j].task = + (struct task_struct + *) 1; + } + } + } + + if (in_kgdb_entry_log[smp_processor_id()] > + (me_in_kgdb + 10)) { + break; + } + + rdtsc(dum, time); + } + if (i < num_online_cpus()) { + printk + ("kgdb : time out, proceeding without sync\n"); +#if 0 + printk("kgdb : Waiting_cpus: 0 = %d, 1 = %d\n", + waiting_cpus[0].task != 0, + waiting_cpus[1].task != 0); + printk("kgdb : Cpu_logged in: 0 = %d, 1 = %d\n", + cpu_logged_in[0], cpu_logged_in[1]); + printk + ("kgdb : in_kgdb_here_log in: 0 = %d, 1 = %d\n", + in_kgdb_here_log[0] != 0, + in_kgdb_here_log[1] != 0); +#endif + entry_state = NO_SYNC; + } else { +#if 0 + int ent = + in_kgdb_entry_log[smp_processor_id()] - + me_in_kgdb; + printk("kgdb : sync after %d entries\n", ent); +#endif + } + } else { + if (remote_debug) { + printk + ("kgdb : %d cpus, but watchdog not active\n" + "proceeding without locking down other cpus\n", + num_online_cpus()); + entry_state = NO_NMI; + } + } + } +#endif + + if (remote_debug) { + unsigned long *lp = (unsigned long *) &linux_regs; + + printk("handle_exception(exceptionVector=%d, " + "signo=%d, err_code=%d, linux_regs=%p)\n", + exceptionVector, signo, err_code, linux_regs); + if (debug_regs) { + print_regs(®s); + printk("Stk: %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[0], lp[1], lp[2], lp[3], + lp[4], lp[5], lp[6], lp[7]); + printk(" %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[8], lp[9], lp[10], lp[11], + lp[12], lp[13], lp[14], lp[15]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[16], lp[17], lp[18], lp[19], + lp[20], lp[21], lp[22], lp[23]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[24], lp[25], lp[26], lp[27], + lp[28], lp[29], lp[30], lp[31]); + } + } + + /* Disable hardware debugging while we are in kgdb */ + /* Get the debug register status register */ +/* *INDENT-OFF* */ + __asm__("movq %0,%%db7" + : /* no output */ + :"r"(0UL)); + + asm volatile ("movq %%db6, %0\n" + :"=r" (hw_breakpoint_status) + :); + +#if 0 +/* *INDENT-ON* */ + switch (exceptionVector) { + case 0: /* divide error */ + case 1: /* debug exception */ + case 2: /* NMI */ + case 3: /* breakpoint */ + case 4: /* overflow */ + case 5: /* bounds check */ + case 6: /* invalid opcode */ + case 7: /* device not available */ + case 8: /* double fault (errcode) */ + case 10: /* invalid TSS (errcode) */ + case 12: /* stack fault (errcode) */ + case 16: /* floating point error */ + case 17: /* alignment check (errcode) */ + default: /* any undocumented */ + break; + case 11: /* segment not present (errcode) */ + case 13: /* general protection (errcode) */ + case 14: /* page fault (special errcode) */ + case 19: /* cache flush denied */ + if (mem_err_expected) { + /* + * This fault occured because of the + * get_char or set_char routines. These + * two routines use either eax of edx to + * indirectly reference the location in + * memory that they are working with. + * For a page fault, when we return the + * instruction will be retried, so we + * have to make sure that these + * registers point to valid memory. + */ + mem_err = 1; /* set mem error flag */ + mem_err_expected = 0; + mem_err_cnt++; /* helps in debugging */ + /* make valid address */ + regs.eax = (long) &garbage_loc; + /* make valid address */ + regs.edx = (long) &garbage_loc; + if (remote_debug) + printk("Return after memory error: " + "mem_err_cnt=%d\n", mem_err_cnt); + if (debug_regs) + print_regs(®s); + goto exit_kgdb; + } + break; + } +#endif + if (remote_debug) + printk("kgdb : entered kgdb on cpu %d\n", smp_processor_id()); + + gdb_i386vector = exceptionVector; + gdb_i386errcode = err_code; + kgdb_info.called_from = __builtin_return_address(0); +#ifdef CONFIG_SMP + /* + * OK, we can now communicate, lets tell gdb about the sync. + * but only if we had a problem. + */ + switch (entry_state) { + case NO_NMI: + to_gdb("NMI not active, other cpus not stopped\n"); + break; + case NO_SYNC: + to_gdb("Some cpus not stopped, see 'kgdb_info' for details\n"); + default:; + } + +#endif +/* + * Set up the gdb function call area. + */ + trap_cpu = smp_processor_id(); + OLD_esp = NEW_esp = (unsigned long) (&linux_regs->rsp); + + IF_SMP(once_again:) + /* reply to host that an exception has occurred */ + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + + putpacket(remcomOutBuffer); + + while (1 == 1) { + error = 0; + remcomOutBuffer[0] = 0; + getpacket(remcomInBuffer); + switch (remcomInBuffer[0]) { + case '?': + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + break; + case 'd': + remote_debug = !(remote_debug); /* toggle debug flag */ + printk("Remote debug %s\n", + remote_debug ? "on" : "off"); + break; + case 'g': /* return the value of the CPU registers */ + get_gdb_regs(usethread, ®s, gdb_regs); + mem2hex((char *) gdb_regs, + remcomOutBuffer, NUMREGBYTES); + break; + case 'G': /* set the value of the CPU registers - return OK */ + hex2mem(&remcomInBuffer[1], + (char *) gdb_regs, NUMREGBYTES); + if (!usethread || usethread == current) { + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "E00"); + } + break; + + case 'P':{ /* set the value of a single CPU register - + return OK */ + /* + * For some reason, gdb wants to talk about psudo + * registers (greater than 15). + */ + unsigned long regno; + + ptr = &remcomInBuffer[1]; + regs_to_gdb_regs(gdb_regs, ®s); + if ((!usethread || usethread == current) && + hexToLong(&ptr, ®no) && + *ptr++ == '=' && (regno >= 0)) { + if (regno >= NUMREGS) + break; + hex2mem(ptr, (char *) &gdb_regs[regno], + 8); + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + break; + } + strcpy(remcomOutBuffer, "E01"); + break; + } + + /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + case 'm': + /* TRY TO READ %x,%x. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToLong(&ptr, &addr) && + (*(ptr++) == ',') && (hexToLong(&ptr, &length))) { + ptr = 0; + /* + * hex doubles the byte count + */ + if (length > (BUFMAX / 2)) + length = BUFMAX / 2; + if (mem2hex((char *) addr, + remcomOutBuffer, length)) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } + } + + if (ptr) { + strcpy(remcomOutBuffer, "E01"); + debug_error + ("malformed read memory command: %s\n", + remcomInBuffer); + } + break; + + /* MAA..AA,LLLL: + Write LLLL bytes at address AA.AA return OK */ + case 'M': + /* TRY TO READ '%x,%x:'. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToLong(&ptr, &addr) && + (*(ptr++) == ',') && + (hexToLong(&ptr, &length)) && (*(ptr++) == ':')) { + if (hex2mem(ptr, (char *) addr, length)) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } else { + strcpy(remcomOutBuffer, "OK"); + } + + ptr = 0; + } + if (ptr) { + strcpy(remcomOutBuffer, "E02"); + debug_error + ("malformed write memory command: %s\n", + remcomInBuffer); + } + break; + case 'S': + remcomInBuffer[0] = 's'; + case 'C': + /* Csig;AA..AA where ;AA..AA is optional + * continue with signal + * Since signals are meaning less to us, delete that + * part and then fall into the 'c' code. + */ + ptr = &remcomInBuffer[1]; + length = 2; + while (*ptr && *ptr != ';') { + length++; + ptr++; + } + if (*ptr) { + do { + ptr++; + *(ptr - length++) = *ptr; + } while (*ptr); + } else { + remcomInBuffer[1] = 0; + } + + /* cAA..AA Continue at address AA..AA(optional) */ + /* sAA..AA Step one instruction from AA..AA(optional) */ + /* D detach, reply OK and then continue */ + case 'c': + case 's': + case 'D': + + /* try to read optional parameter, + pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (hexToLong(&ptr, &addr)) { + if (remote_debug) + printk("Changing EIP to 0x%lx\n", addr); + + regs.rip = addr; + } + + newPC = regs.rip; + + /* clear the trace bit */ + regs.eflags &= 0xfffffeff; + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') + regs.eflags |= 0x100; + + /* detach is a friendly version of continue. Note that + debugging is still enabled (e.g hit control C) + */ + if (remcomInBuffer[0] == 'D') { + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + } + + if (remote_debug) { + printk("Resuming execution\n"); + print_regs(®s); + } + asm volatile ("movq %%db6, %0\n":"=r" (dr6) + :); + if (!(dr6 & 0x4000)) { + for (breakno = 0; breakno < 4; ++breakno) { + if (dr6 & (1 << breakno) && + (breakinfo[breakno].type == 0)) { + /* Set restore flag */ + regs.eflags |= 0x10000; + break; + } + } + } + + if (kgdboe) + netpoll_set_trap(0); + + correct_hw_break(); + asm volatile ("movq %0, %%db6\n"::"r" (0UL)); + goto exit_kgdb; + + /* kill the program */ + case 'k': /* do nothing */ + break; + + /* query */ + case 'q': + nothreads = 0; + switch (remcomInBuffer[1]) { + case 'f': + threadid = 1; + thread_list = 2; + thread_list_start = (usethread ? : current); + case 's': + if (!cmp_str(&remcomInBuffer[2], + "ThreadInfo", 10)) + break; + + remcomOutBuffer[nothreads++] = 'm'; + for (; threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + nothreads += int_to_hex_v( + &remcomOutBuffer[ + nothreads], + threadid); + if (thread_min > threadid) + thread_min = threadid; + remcomOutBuffer[ + nothreads] = ','; + nothreads++; + if (nothreads > BUFMAX - 10) + break; + } + } + if (remcomOutBuffer[nothreads - 1] == 'm') { + remcomOutBuffer[nothreads - 1] = 'l'; + } else { + nothreads--; + } + remcomOutBuffer[nothreads] = 0; + break; + +#ifdef old_thread_list /* Old thread info request */ + case 'L': + /* List threads */ + thread_list = 2; + thread_list_start = (usethread ? : current); + unpack_byte(remcomInBuffer + 3, &maxthreads); + unpack_threadid(remcomInBuffer + 5, &thref); + do { + int buf_thread_limit = + (BUFMAX - 22) / BUF_THREAD_ID_SIZE; + if (maxthreads > buf_thread_limit) { + maxthreads = buf_thread_limit; + } + } while (0); + remcomOutBuffer[0] = 'q'; + remcomOutBuffer[1] = 'M'; + remcomOutBuffer[4] = '0'; + pack_threadid(remcomOutBuffer + 5, &thref); + + /* If start flag set start at 0. */ + if (remcomInBuffer[2] == '1') + threadid = 0; + else + threadid = threadref_to_int(&thref); + for (nothreads = 0; + nothreads < maxthreads && + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + int_to_threadref(&thref, + threadid); + pack_threadid(remcomOutBuffer + + 21 + + nothreads * 16, + &thref); + nothreads++; + if (thread_min > threadid) + thread_min = threadid; + } + } + + if (threadid == PID_MAX + MAX_NO_CPUS) { + remcomOutBuffer[4] = '1'; + } + pack_hex_byte(remcomOutBuffer + 2, nothreads); + remcomOutBuffer[21 + nothreads * 16] = '\0'; + break; +#endif + case 'C': + /* Current thread id */ + remcomOutBuffer[0] = 'Q'; + remcomOutBuffer[1] = 'C'; + threadid = current->pid; + if (!threadid) { + /* + * idle thread + */ + for (threadid = PID_MAX; + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + if (current == + idle_task(threadid - + PID_MAX)) + break; + } + } + int_to_threadref(&thref, threadid); + pack_threadid(remcomOutBuffer + 2, &thref); + remcomOutBuffer[18] = '\0'; + break; + + case 'E': + /* Print exception info */ + printexceptioninfo(exceptionVector, + err_code, remcomOutBuffer); + break; + case 'T': + ptr = &remcomInBuffer[0]; + if (strncmp(ptr, "qThreadExtraInfo,", + strlen("qThreadExtraInfo,")) == 0) { + ptr += strlen("qThreadExtraInfo,"); + hexToLong(&ptr, &tmpid); + p = getthread(tmpid); + print_extra_info(p, lbuf); + mem2hex(lbuf, remcomOutBuffer, + strlen(lbuf)); + } + break; +#if 0 + case 'T':{ + char * nptr; + /* Thread extra info */ + if (!cmp_str(&remcomInBuffer[2], + "hreadExtraInfo,", 15)) { + break; + } + ptr = &remcomInBuffer[17]; + hexToLong(&ptr, &threadid); + thread = getthread(threadid); + nptr = &thread->comm[0]; + length = 0; + ptr = &remcomOutBuffer[0]; + do { + length++; + ptr = pack_hex_byte(ptr, *nptr++); + } while (*nptr && length < 16); + /* + * would like that 16 to be the size of + * task_struct.comm but don't know the + * syntax.. + */ + *ptr = 0; + } +#endif + } + break; + + /* task related */ + case 'H': + switch (remcomInBuffer[1]) { + case 'g': + ptr = &remcomInBuffer[2]; + hexToLong(&ptr, &threadid); + thread = getthread(threadid); + if (!thread) { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + break; + } + /* + * Just in case I forget what this is all about, + * the "thread info" command to gdb causes it + * to ask for a thread list. It then switches + * to each thread and asks for the registers. + * For this (and only this) usage, we want to + * fudge the registers of tasks not on the run + * list (i.e. waiting) to show the routine that + * called schedule. Also, gdb, is a minimalist + * in that if the current thread is the last + * it will not re-read the info when done. + * This means that in this case we must show + * the real registers. So here is how we do it: + * Each entry we keep track of the min + * thread in the list (the last that gdb will) + * get info for. We also keep track of the + * starting thread. + * "thread_list" is cleared when switching back + * to the min thread if it is was current, or + * if it was not current, thread_list is set + * to 1. When the switch to current comes, + * if thread_list is 1, clear it, else do + * nothing. + */ + usethread = thread; + if ((thread_list == 1) && + (thread == thread_list_start)) { + thread_list = 0; + } + if (thread_list && (threadid == thread_min)) { + if (thread == thread_list_start) { + thread_list = 0; + } else { + thread_list = 1; + } + } + /* follow through */ + case 'c': + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + break; + } + break; + + /* Query thread status */ + case 'T': + ptr = &remcomInBuffer[1]; + hexToLong(&ptr, &threadid); + thread = getthread(threadid); + if (thread) { + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + if (thread_min > threadid) + thread_min = threadid; + } else { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + } + break; + + case 'Y': /* set up a hardware breakpoint */ + ptr = &remcomInBuffer[1]; + hexToLong(&ptr, &breakno); + ptr++; + hexToLong(&ptr, &breaktype); + ptr++; + hexToLong(&ptr, &length); + ptr++; + hexToLong(&ptr, &addr); + if (set_hw_break(breakno & 0x3, + breaktype & 0x3, + length & 0x3, addr) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + /* Remove hardware breakpoint */ + case 'y': + ptr = &remcomInBuffer[1]; + hexToLong(&ptr, &breakno); + if (remove_hw_break(breakno & 0x3) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + case 'r': /* reboot */ + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + /*to_gdb("Rebooting\n"); */ + /* triplefault no return from here */ + { + static long no_idt[2]; + __asm__ __volatile__("lidt %0"::"m"(no_idt[0])); + BREAKPOINT; + } + + } /* switch */ + + /* reply to the request */ + putpacket(remcomOutBuffer); + } /* while(1==1) */ + /* + * reached by goto only. + */ + exit_kgdb: + /* + * Here is where we set up to trap a gdb function call. NEW_esp + * will be changed if we are trying to do this. We handle both + * adding and subtracting, thus allowing gdb to put grung on + * the stack which it removes later. + */ + if (NEW_esp != OLD_esp) { + unsigned long *ptr = END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) + ptr -= (OLD_esp - NEW_esp) / sizeof (unsigned long); + *--ptr = linux_regs->eflags; + *--ptr = linux_regs->cs; + *--ptr = linux_regs->rip; + *--ptr = linux_regs->rcx; + *--ptr = linux_regs->rbx; + *--ptr = linux_regs->rax; + linux_regs->rcx = NEW_esp - (sizeof (unsigned long) * 6); + linux_regs->rbx = (unsigned long) END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) { + linux_regs->rip = (unsigned long) fn_call_stub; + } else { + linux_regs->rip = (unsigned long) fn_rtn_stub; + linux_regs->rax = NEW_esp; + } + linux_regs->eflags &= ~(IF_BIT | TF_BIT); + } +#ifdef CONFIG_SMP + /* + * Release gdb wait locks + * Sanity check time. Must have at least one cpu to run. Also single + * step must not be done if the current cpu is on hold. + */ + if (spinlock_count == 1) { + int ss_hold = (regs.eflags & 0x100) && kgdb_info.hold_on_sstep; + int cpu_avail = 0; + int i; + + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!cpu_online(i)) + break; + if (!hold_cpu(i)) { + cpu_avail = 1; + } + } + /* + * Early in the bring up there will be NO cpus on line... + */ + if (!cpu_avail && !cpus_empty(cpu_online_map)) { + to_gdb("No cpus unblocked, see 'kgdb_info.hold_cpu'\n"); + goto once_again; + } + if (hold_cpu(smp_processor_id()) && (regs.eflags & 0x100)) { + to_gdb + ("Current cpu must be unblocked to single step\n"); + goto once_again; + } + if (!(ss_hold)) { + int i; + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!hold_cpu(i)) { + spin_unlock(&waitlocks[i]); + } + } + } else { + spin_unlock(&waitlocks[smp_processor_id()]); + } + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + /* + * If this cpu is on hold, this is where we + * do it. Note, the NMI will pull us out of here, + * but will return as the above lock is not held. + * We will stay here till another cpu releases the lock for us. + */ + spin_unlock_wait(waitlocks + smp_processor_id()); + local_irq_restore(flags); + return (1); + } +#if 0 +exit_just_unlock: +#endif +#endif + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + local_irq_restore(flags); + return (1); +} + +#undef regs +static int kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) +{ + struct die_args *d = ptr; + + if (!kgdb_enabled || (cmd == DIE_DEBUG && user_mode(d->regs))) + return NOTIFY_DONE; + if (cmd == DIE_NMI_IPI) { + if (in_kgdb(d->regs)) + return NOTIFY_BAD; + } else if (kgdb_handle_exception(d->trapnr, d->signr, d->err, d->regs)) + return NOTIFY_BAD; /* skip */ + + return NOTIFY_DONE; +} + +static struct notifier_block kgdb_notifier = { + .notifier_call = kgdb_notify, + .priority = 0, +}; + +void set_debug_traps(void) +{ + static int initialized = 0; + + if (!initialized) { + initialized = 1; + notifier_chain_register(&die_chain, &kgdb_notifier); + } +} + +/* + * Provide the command line "gdb" initial break + */ +int __init kgdb_initial_break(char * str) +{ + if (*str == '\0'){ + breakpoint(); + return 1; + } + return 0; +} +__setup("gdb",kgdb_initial_break); + +/* This function will generate a breakpoint exception. It is used at the + beginning of a program to sync up with a debugger and can be used + otherwise as a quick means to stop program execution and "break" into + the debugger. */ +/* But really, just use the BREAKPOINT macro. We will handle the int stuff + */ + +void breakpoint(void) +{ + + set_debug_traps(); + kgdb_enabled = 1; +#if 0 + /* + * These calls were not enough to allow breakpoint to be + * called before trap_init(). I moved the argument parsing + * after trap_init() and it seems to work. + */ + set_intr_usr_gate(3,&int3); /* disable ints on trap */ + set_intr_gate(1,&debug); + set_intr_gate(14,&page_fault); +#endif + + BREAKPOINT; +} + +#ifdef later +/* + * possibly we should not go thru the traps.c code at all? Someday. + */ +void +do_kgdb_int3(struct pt_regs *regs, long error_code) +{ + kgdb_handle_exception(3, 5, error_code, regs); + return; +} +#endif +#undef regs +#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS +asmlinkage void +bad_sys_call_exit(int stuff) +{ + struct pt_regs *regs = (struct pt_regs *) &stuff; + printk("Sys call %d return with %x preempt_count\n", + (int) regs->orig_eax, preempt_count()); +} +#endif +#ifdef CONFIG_STACK_OVERFLOW_TEST +#include +asmlinkage void +stack_overflow(void) +{ +#ifdef BREAKPOINT + BREAKPOINT; +#else + printk("Kernel stack overflow, looping forever\n"); +#endif + while (1) { + } +} +#endif + +#if defined(CONFIG_SMP) || defined(CONFIG_KGDB_CONSOLE) +char gdbconbuf[BUFMAX]; + +static void +kgdb_gdb_message(const char *s, unsigned count) +{ + int i; + int wcount; + char *bufptr; + /* + * This takes care of NMI while spining out chars to gdb + */ + IF_SMP(in_kgdb_console = 1); + gdbconbuf[0] = 'O'; + bufptr = gdbconbuf + 1; + while (count > 0) { + if ((count << 1) > (BUFMAX - 2)) { + wcount = (BUFMAX - 2) >> 1; + } else { + wcount = count; + } + count -= wcount; + for (i = 0; i < wcount; i++) { + bufptr = pack_hex_byte(bufptr, s[i]); + } + *bufptr = '\0'; + s += wcount; + + putpacket(gdbconbuf); + + } + IF_SMP(in_kgdb_console = 0); +} +#endif +#ifdef CONFIG_SMP +static void +to_gdb(const char *s) +{ + int count = 0; + while (s[count] && (count++ < BUFMAX)) ; + kgdb_gdb_message(s, count); +} +#endif +#ifdef CONFIG_KGDB_CONSOLE +#include +#include +#include +#include + +void +kgdb_console_write(struct console *co, const char *s, unsigned count) +{ + + if (gdb_i386vector == -1) { + /* + * We have not yet talked to gdb. What to do... + * lets break, on continue we can do the write. + * But first tell him whats up. Uh, well no can do, + * as this IS the console. Oh well... + * We do need to wait or the messages will be lost. + * Other option would be to tell the above code to + * ignore this breakpoint and do an auto return, + * but that might confuse gdb. Also this happens + * early enough in boot up that we don't have the traps + * set up yet, so... + */ + breakpoint(); + } + kgdb_gdb_message(s, count); +} + +/* + * ------------------------------------------------------------ + * Serial KGDB driver + * ------------------------------------------------------------ + */ + +static struct console kgdbcons = { + name:"kgdb", + write:kgdb_console_write, +#ifdef CONFIG_KGDB_USER_CONSOLE + device:kgdb_console_device, +#endif + flags:CON_PRINTBUFFER | CON_ENABLED, + index:-1, +}; + +/* + * The trick here is that this file gets linked before printk.o + * That means we get to peer at the console info in the command + * line before it does. If we are up, we register, otherwise, + * do nothing. By returning 0, we allow printk to look also. + */ +static int kgdb_console_enabled; + +int __init +kgdb_console_init(char *str) +{ + if ((strncmp(str, "kgdb", 4) == 0) || (strncmp(str, "gdb", 3) == 0)) { + register_console(&kgdbcons); + kgdb_console_enabled = 1; + } + return 0; /* let others look at the string */ +} + +__setup("console=", kgdb_console_init); + +#ifdef CONFIG_KGDB_USER_CONSOLE +static kdev_t kgdb_console_device(struct console *c); +/* This stuff sort of works, but it knocks out telnet devices + * we are leaving it here in case we (or you) find time to figure it out + * better.. + */ + +/* + * We need a real char device as well for when the console is opened for user + * space activities. + */ + +static int +kgdb_consdev_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static ssize_t +kgdb_consdev_write(struct file *file, const char *buf, + size_t count, loff_t * ppos) +{ + int size, ret = 0; + static char kbuf[128]; + static DECLARE_MUTEX(sem); + + /* We are not reentrant... */ + if (down_interruptible(&sem)) + return -ERESTARTSYS; + + while (count > 0) { + /* need to copy the data from user space */ + size = count; + if (size > sizeof (kbuf)) + size = sizeof (kbuf); + if (copy_from_user(kbuf, buf, size)) { + ret = -EFAULT; + break;; + } + kgdb_console_write(&kgdbcons, kbuf, size); + count -= size; + ret += size; + buf += size; + } + + up(&sem); + + return ret; +} + +struct file_operations kgdb_consdev_fops = { + open:kgdb_consdev_open, + write:kgdb_consdev_write +}; +static kdev_t +kgdb_console_device(struct console *c) +{ + return MKDEV(TTYAUX_MAJOR, 1); +} + +/* + * This routine gets called from the serial stub in the i386/lib + * This is so it is done late in bring up (just before the console open). + */ +void +kgdb_console_finit(void) +{ + if (kgdb_console_enabled) { + char *cptr = cdevname(MKDEV(TTYAUX_MAJOR, 1)); + char *cp = cptr; + while (*cptr && *cptr != '(') + cptr++; + *cptr = 0; + unregister_chrdev(TTYAUX_MAJOR, cp); + register_chrdev(TTYAUX_MAJOR, "kgdb", &kgdb_consdev_fops); + } +} +#endif +#endif +#ifdef CONFIG_KGDB_TS +#include /* time stamp code */ +#include /* in_interrupt */ +#ifdef CONFIG_KGDB_TS_64 +#define DATA_POINTS 64 +#endif +#ifdef CONFIG_KGDB_TS_128 +#define DATA_POINTS 128 +#endif +#ifdef CONFIG_KGDB_TS_256 +#define DATA_POINTS 256 +#endif +#ifdef CONFIG_KGDB_TS_512 +#define DATA_POINTS 512 +#endif +#ifdef CONFIG_KGDB_TS_1024 +#define DATA_POINTS 1024 +#endif +#ifndef DATA_POINTS +#define DATA_POINTS 128 /* must be a power of two */ +#endif +#define INDEX_MASK (DATA_POINTS - 1) +#if (INDEX_MASK & DATA_POINTS) +#error "CONFIG_KGDB_TS_COUNT must be a power of 2" +#endif +struct kgdb_and_then_struct { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + int data0; + int data1; +}; +struct kgdb_and_then_struct2 { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + struct task_struct *t1; + struct task_struct *t2; +}; +struct kgdb_and_then_struct kgdb_data[DATA_POINTS]; + +struct kgdb_and_then_struct *kgdb_and_then = &kgdb_data[0]; +int kgdb_and_then_count; + +void +kgdb_tstamp(int line, char *source, int data0, int data1) +{ + static spinlock_t ts_spin = SPIN_LOCK_UNLOCKED; + int flags; + local_irq_save(flags); + spin_lock(&ts_spin); + rdtscll(kgdb_and_then->at_time); +#ifdef CONFIG_SMP + kgdb_and_then->on_cpu = smp_processor_id(); +#endif + kgdb_and_then->task = current; + kgdb_and_then->from_ln = line; + kgdb_and_then->in_src = source; + kgdb_and_then->from = __builtin_return_address(0); + kgdb_and_then->with_shpf = (int *) (((flags & IF_BIT) >> 9) | + (preempt_count() << 8)); + kgdb_and_then->data0 = data0; + kgdb_and_then->data1 = data1; + kgdb_and_then = &kgdb_data[++kgdb_and_then_count & INDEX_MASK]; + spin_unlock(&ts_spin); + local_irq_restore(flags); +#ifdef CONFIG_PREEMPT + +#endif + return; +} +#endif +typedef int gdb_debug_hook(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs); +gdb_debug_hook *linux_debug_hook = &kgdb_handle_exception; /* histerical reasons... */ + +static int kgdb_need_breakpoint[NR_CPUS]; + +void kgdb_schedule_breakpoint(void) +{ + kgdb_need_breakpoint[smp_processor_id()] = 1; +} + +void kgdb_process_breakpoint(void) +{ + /* + * Handle a breakpoint queued from inside network driver code + * to avoid reentrancy issues + */ + if (kgdb_need_breakpoint[smp_processor_id()]) { + kgdb_need_breakpoint[smp_processor_id()] = 0; + kgdb_enabled = 1; + BREAKPOINT; + } +} + --- linux-2.6.5-rc3/arch/x86_64/kernel/Makefile 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/kernel/Makefile 2004-03-30 23:39:44.000000000 -0800 @@ -27,6 +27,7 @@ obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_KGDB) += kgdb_stub.o obj-y += topology.o --- linux-2.6.5-rc3/arch/x86_64/kernel/mpparse.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/kernel/mpparse.c 2004-03-31 00:56:18.875803384 -0800 @@ -83,6 +83,9 @@ extern int acpi_parse_ioapic (acpi_table #endif /*CONFIG_X86_IO_APIC*/ #endif /*CONFIG_ACPI_BOOT*/ +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + + /* * Intel MP BIOS table parsing routines: */ @@ -147,6 +150,7 @@ static void __init MP_processor_info (st ver = 0x10; } apic_version[m->mpc_apicid] = ver; + bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; } static void __init MP_bus_info (struct mpc_config_bus *m) @@ -972,3 +976,10 @@ void __init mp_parse_prt (void) #endif /*CONFIG_X86_IO_APIC*/ #endif /*CONFIG_ACPI_BOOT*/ + +static __init int early_maxcpus(char *s) +{ + maxcpus = simple_strtoul(s, NULL, 0); + return 0; +} +__early_param("maxcpus=", early_maxcpus); --- linux-2.6.5-rc3/arch/x86_64/kernel/mtrr/Makefile 2003-06-14 12:18:21.000000000 -0700 +++ /dev/null 2003-09-15 06:40:47.000000000 -0700 @@ -1,28 +0,0 @@ -# -# Reuse the i386 MTRR driver. -# - -obj-y := main.o if.o generic.o state.o -obj-y += amd.o -obj-y += cyrix.o -obj-y += centaur.o - -$(obj)/main.c: $(obj)/mtrr.h - @ln -sf ../../../i386/kernel/cpu/mtrr/main.c $(obj)/main.c -$(obj)/if.c: $(obj)/mtrr.h - @ln -sf ../../../i386/kernel/cpu/mtrr/if.c $(obj)/if.c -$(obj)/generic.c: $(obj)/mtrr.h - @ln -sf ../../../i386/kernel/cpu/mtrr/generic.c $(obj)/generic.c -$(obj)/state.c: $(obj)/mtrr.h - @ln -sf ../../../i386/kernel/cpu/mtrr/state.c $(obj)/state.c -$(obj)/amd.c: $(obj)/mtrr.h - @ln -sf ../../../i386/kernel/cpu/mtrr/amd.c $(obj)/amd.c -$(obj)/cyrix.c: $(obj)/mtrr.h - @ln -sf ../../../i386/kernel/cpu/mtrr/cyrix.c $(obj)/cyrix.c -$(obj)/centaur.c: $(obj)/mtrr.h - @ln -sf ../../../i386/kernel/cpu/mtrr/centaur.c $(obj)/centaur.c -$(obj)/mtrr.h: - @ln -sf ../../../i386/kernel/cpu/mtrr/mtrr.h $(obj)/mtrr.h - -clean-files += main.c if.c generic.c state.c amd.c cyrix.c centaur.c mtrr.h - --- linux-2.6.5-rc3/arch/x86_64/kernel/pci-gart.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/kernel/pci-gart.c 2004-03-31 00:56:18.876803232 -0800 @@ -194,6 +194,7 @@ void *pci_alloc_consistent(struct pci_de /* Kludge to make it bug-to-bug compatible with i386. i386 uses the normal dma_mask for alloc_consistent. */ + if (hwdev) dma_mask &= hwdev->dma_mask; again: @@ -849,9 +850,10 @@ fs_initcall(pci_iommu_init); forcesac For SAC mode for masks <40bits (experimental) fullflush Flush IOMMU on each allocation (default) nofullflush Don't use IOMMU fullflush + allowed overwrite iommu off workarounds for specific chipsets. soft Use software bounce buffering (default for Intel machines) */ -__init int iommu_setup(char *opt) +static __init int iommu_setup(char *opt) { int arg; char *p = opt; @@ -861,8 +863,12 @@ __init int iommu_setup(char *opt) no_agp = 1; if (!memcmp(p,"off", 3)) no_iommu = 1; - if (!memcmp(p,"force", 5)) + if (!memcmp(p,"force", 5)) { force_iommu = 1; + iommu_aperture_allowed = 1; + } + if (!memcmp(p,"allowed",7)) + iommu_aperture_allowed = 1; if (!memcmp(p,"noforce", 7)) { iommu_merge = 0; force_iommu = 0; @@ -907,5 +913,6 @@ __init int iommu_setup(char *opt) return 0; } while (*p++ != ','); } - return 1; + return 0; } +__early_param("iommu=",iommu_setup); --- linux-2.6.5-rc3/arch/x86_64/kernel/ptrace.c 2003-06-14 12:18:06.000000000 -0700 +++ 25/arch/x86_64/kernel/ptrace.c 2004-03-31 00:54:31.295158128 -0800 @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -486,7 +487,7 @@ out: return ret; } -asmlinkage void syscall_trace(struct pt_regs *regs) +static void syscall_trace(struct pt_regs *regs) { #if 0 @@ -496,11 +497,6 @@ asmlinkage void syscall_trace(struct pt_ current_thread_info()->flags, current->ptrace); #endif - if (!test_thread_flag(TIF_SYSCALL_TRACE)) - return; - if (!(current->ptrace & PT_PTRACED)) - return; - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80 : 0)); /* @@ -513,3 +509,25 @@ asmlinkage void syscall_trace(struct pt_ current->exit_code = 0; } } + +asmlinkage void syscall_trace_enter(struct pt_regs *regs) +{ + if (unlikely(current->audit_context)) + audit_syscall_entry(current, regs->orig_rax, + regs->rdi, regs->rsi, + regs->rdx, regs->r10); + + if (test_thread_flag(TIF_SYSCALL_TRACE) + && (current->ptrace & PT_PTRACED)) + syscall_trace(regs); +} + +asmlinkage void syscall_trace_leave(struct pt_regs *regs) +{ + if (unlikely(current->audit_context)) + audit_syscall_exit(current, regs->rax); + + if (test_thread_flag(TIF_SYSCALL_TRACE) + && (current->ptrace & PT_PTRACED)) + syscall_trace(regs); +} --- linux-2.6.5-rc3/arch/x86_64/kernel/setup.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/kernel/setup.c 2004-03-31 00:56:18.878802928 -0800 @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,7 @@ #include #include #include +#include /* For COMMAND_LINE_SIZE */ /* * Machine setup.. @@ -99,7 +101,6 @@ extern int root_mountflags; extern char _text, _etext, _edata, _end; char command_line[COMMAND_LINE_SIZE]; -char saved_command_line[COMMAND_LINE_SIZE]; struct resource standard_io_resources[] = { { "dma1", 0x00, 0x1f, IORESOURCE_BUSY }, @@ -190,104 +191,99 @@ static void __init probe_roms(void) } } -static __init void parse_cmdline_early (char ** cmdline_p) +#ifdef CONFIG_ACPI_BOOT +/* should be moved elsewhere */ +static __init int early_acpi(char *s) { - char c = ' ', *to = command_line, *from = COMMAND_LINE; - int len = 0; - - /* Save unparsed command line copy for /proc/cmdline */ - memcpy(saved_command_line, COMMAND_LINE, COMMAND_LINE_SIZE); - saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; - - for (;;) { - if (c != ' ') - goto next_char; - -#ifdef CONFIG_SMP - /* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. - */ - else if (!memcmp(from, "maxcpus=", 8)) { - extern unsigned int maxcpus; + /* "acpi=off" disables both ACPI table parsing and interpreter init */ + if (!strcmp(s, "off")) + acpi_disabled = 1; + + else if (!strcmp(s, "force")) { + /* add later when we do DMI horrors: */ + /* acpi_force = 1; */ + acpi_disabled = 0; + } + + /* acpi=ht just means: do ACPI MADT parsing + at bootup, but don't enable the full ACPI interpreter */ + else if (!strcmp(s, "ht")) + acpi_ht = 1; + + /* acpi=strict disables out-of-spec workarounds */ + else if (!strcmp(s, "strict")) + acpi_strict = 1; + else + return -1; - maxcpus = simple_strtoul(from + 8, NULL, 0); - } -#endif -#ifdef CONFIG_ACPI_BOOT - /* "acpi=off" disables both ACPI table parsing and interpreter init */ - if (!memcmp(from, "acpi=off", 8)) - acpi_disabled = 1; - - if (!memcmp(from, "acpi=force", 10)) { - /* add later when we do DMI horrors: */ - /* acpi_force = 1; */ - acpi_disabled = 0; - } + return 0; +} +__early_param("acpi=", early_acpi); - /* acpi=ht just means: do ACPI MADT parsing - at bootup, but don't enable the full ACPI interpreter */ - if (!memcmp(from, "acpi=ht", 7)) { - acpi_ht = 1; - } - else if (!memcmp(from, "pci=noacpi", 10)) - acpi_noirq_set(); +static __init int early_acpi_sci(char *s) +{ + if (!strcmp(s, "edge")) + acpi_sci_flags.trigger = 1; + else if (!strcmp(s, "level")) + acpi_sci_flags.trigger = 3; + else if (!strcmp(s, "high")) + acpi_sci_flags.polarity = 1; + else if (!strcmp(s, "low")) + acpi_sci_flags.polarity = 3; + else + return -1; + return 0; +} +__early_param("acpi_sci=", early_acpi_sci); - else if (!memcmp(from, "acpi_sci=edge", 13)) - acpi_sci_flags.trigger = 1; - else if (!memcmp(from, "acpi_sci=level", 14)) - acpi_sci_flags.trigger = 3; - else if (!memcmp(from, "acpi_sci=high", 13)) - acpi_sci_flags.polarity = 1; - else if (!memcmp(from, "acpi_sci=low", 12)) - acpi_sci_flags.polarity = 3; - - /* acpi=strict disables out-of-spec workarounds */ - else if (!memcmp(from, "acpi=strict", 11)) { - acpi_strict = 1; - } +static __init int early_pci(char *s) +{ + if (!strcmp(s, "noacpi")) + acpi_noirq_set(); + else + return -1; + return 0; +} +__early_param("pci=", early_pci); #endif - if (!memcmp(from, "nolapic", 7) || - !memcmp(from, "disableapic", 11)) - disable_apic = 1; - - if (!memcmp(from, "noapic", 6)) - skip_ioapic_setup = 1; - - if (!memcmp(from, "apic", 4)) { - skip_ioapic_setup = 0; - ioapic_force = 1; - } - - if (!memcmp(from, "mem=", 4)) - parse_memopt(from+4, &from); +static __init int early_nolapic(char *s) +{ + disable_apic = 1; + return 0; +} +__early_param("nolapic", early_nolapic); -#ifdef CONFIG_DISCONTIGMEM - if (!memcmp(from, "numa=", 5)) - numa_setup(from+5); -#endif +/* alias for compatibility */ +static __init int early_disableapic(char *s) +{ + disable_apic = 1; + return 0; +} +__early_param("disableapic", early_disableapic); -#ifdef CONFIG_GART_IOMMU - if (!memcmp(from,"iommu=",6)) { - iommu_setup(from+6); - } -#endif +static __init int early_no_ioapic(char *s) +{ + skip_ioapic_setup = 1; + return 0; +} +__early_param("noapic", early_no_ioapic); - if (!memcmp(from,"oops=panic", 10)) - panic_on_oops = 1; +static __init int early_apic(char *s) +{ + skip_ioapic_setup = 0; + ioapic_force = 1; + return 0; +} +__early_param("apic", early_apic); - next_char: - c = *(from++); - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *(to++) = c; - } - *to = '\0'; - *cmdline_p = command_line; +static __init int early_oopspanic(char *s) +{ + panic_on_oops = 1; + return 0; } +__early_param("oops=panic", early_oopspanic); + #ifndef CONFIG_DISCONTIGMEM static void __init contig_initmem_init(void) @@ -369,6 +365,30 @@ static int __init noreplacement_setup(ch __setup("noreplacement", noreplacement_setup); +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) +unsigned char eddnr; +struct edd_info edd[EDDMAXNR]; +unsigned int edd_disk80_sig; +#ifdef CONFIG_EDD_MODULE +EXPORT_SYMBOL(eddnr); +EXPORT_SYMBOL(edd); +EXPORT_SYMBOL(edd_disk80_sig); +#endif +/** + * copy_edd() - Copy the BIOS EDD information + * from empty_zero_page into a safe place. + * + */ +static inline void copy_edd(void) +{ + eddnr = EDD_NR; + memcpy(edd, EDD_BUF, sizeof(edd)); + edd_disk80_sig = DISK80_SIGNATURE; +} +#else +#define copy_edd() do {} while (0) +#endif + void __init setup_arch(char **cmdline_p) { unsigned long low_mem_size; @@ -387,6 +407,7 @@ void __init setup_arch(char **cmdline_p) rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); #endif setup_memory_region(); + copy_edd(); if (!MOUNT_ROOT_RDONLY) root_mountflags &= ~MS_RDONLY; @@ -400,7 +421,8 @@ void __init setup_arch(char **cmdline_p) data_resource.start = virt_to_phys(&_etext); data_resource.end = virt_to_phys(&_edata)-1; - parse_cmdline_early(cmdline_p); + *cmdline_p = early_command_line; + parse_early_options(cmdline_p); /* * partially used pages are not usable - thus @@ -475,10 +497,7 @@ void __init setup_arch(char **cmdline_p) #endif paging_init(); -#ifndef CONFIG_SMP - /* Temporary hack: disable the IO-APIC for UP Nvidia and VIA. */ check_ioapic(); -#endif #ifdef CONFIG_ACPI_BOOT /* * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). @@ -797,6 +816,9 @@ static void __init init_intel(struct cpu c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; } + + if (c->x86 == 15) + c->x86_cache_alignment = c->x86_clflush_size * 2; } void __init get_cpu_vendor(struct cpuinfo_x86 *c) @@ -831,6 +853,7 @@ void __init early_identify_cpu(struct cp c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ c->x86_clflush_size = 64; + c->x86_cache_alignment = c->x86_clflush_size; memset(&c->x86_capability, 0, sizeof c->x86_capability); /* Get vendor name */ @@ -1058,6 +1081,7 @@ static int show_cpuinfo(struct seq_file if (c->x86_tlbsize > 0) seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); + seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); --- linux-2.6.5-rc3/arch/x86_64/kernel/signal.c 2004-01-09 00:04:31.000000000 -0800 +++ 25/arch/x86_64/kernel/signal.c 2004-03-31 00:54:55.198524264 -0800 @@ -236,7 +236,8 @@ get_stack(struct k_sigaction *ka, struct return (void *)round_down(rsp - size, 16); } -static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +static void setup_rt_frame(int sig, struct k_sigaction *ka_copy, + siginfo_t *info, sigset_t *set, struct pt_regs * regs) { struct rt_sigframe *frame; @@ -245,7 +246,7 @@ static void setup_rt_frame(int sig, stru struct task_struct *me = current; if (me->used_math) { - fp = get_stack(ka, regs, sizeof(struct _fpstate)); + fp = get_stack(ka_copy, regs, sizeof(struct _fpstate)); frame = (void *)round_down((u64)fp - sizeof(struct rt_sigframe), 16) - 8; if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) { @@ -255,14 +256,14 @@ static void setup_rt_frame(int sig, stru if (save_i387(fp) < 0) err |= -1; } else { - frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; + frame = get_stack(ka_copy, regs, sizeof(struct rt_sigframe)) - 8; } if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) { goto give_sigsegv; } - if (ka->sa.sa_flags & SA_SIGINFO) { + if (ka_copy->sa.sa_flags & SA_SIGINFO) { err |= copy_siginfo_to_user(&frame->info, info); if (err) { goto give_sigsegv; @@ -288,10 +289,10 @@ static void setup_rt_frame(int sig, stru /* Set up to return from userspace. If provided, use a stub already in userspace. */ /* x86-64 should always use SA_RESTORER. */ - if (ka->sa.sa_flags & SA_RESTORER) { - err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); + if (ka_copy->sa.sa_flags & SA_RESTORER) { + err |= __put_user(ka_copy->sa.sa_restorer, &frame->pretcode); } else { - printk("%s forgot to set SA_RESTORER for signal %d.\n", me->comm, sig); + /* could use a vstub here */ goto give_sigsegv; } @@ -317,7 +318,7 @@ static void setup_rt_frame(int sig, stru next argument after the signal number on the stack. */ regs->rsi = (unsigned long)&frame->info; regs->rdx = (unsigned long)&frame->uc; - regs->rip = (unsigned long) ka->sa.sa_handler; + regs->rip = (unsigned long) ka_copy->sa.sa_handler; regs->rsp = (unsigned long)frame; @@ -333,7 +334,7 @@ static void setup_rt_frame(int sig, stru give_sigsegv: if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; + current->sighand->action[SIGSEGV-1].sa.sa_handler = SIG_DFL; signal_fault(regs,frame,"signal deliver"); } @@ -342,11 +343,10 @@ give_sigsegv: */ static void -handle_signal(unsigned long sig, siginfo_t *info, sigset_t *oldset, - struct pt_regs * regs) +handle_signal(unsigned long sig, siginfo_t *info, + struct k_sigaction *ka_copy, sigset_t *oldset, + struct pt_regs * regs) { - struct k_sigaction *ka = ¤t->sighand->action[sig-1]; - #if DEBUG_SIG printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", current->pid, sig, regs->rip, regs->rsp, regs); @@ -362,7 +362,7 @@ handle_signal(unsigned long sig, siginfo break; case -ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { + if (!(ka_copy->sa.sa_flags & SA_RESTART)) { regs->rax = -EINTR; break; } @@ -375,20 +375,18 @@ handle_signal(unsigned long sig, siginfo #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) { - if (ka->sa.sa_flags & SA_SIGINFO) - ia32_setup_rt_frame(sig, ka, info, oldset, regs); + if (ka_copy->sa.sa_flags & SA_SIGINFO) + ia32_setup_rt_frame(sig, ka_copy, info, oldset, regs); else - ia32_setup_frame(sig, ka, oldset, regs); + ia32_setup_frame(sig, ka_copy, oldset, regs); } else #endif - setup_rt_frame(sig, ka, info, oldset, regs); - - if (ka->sa.sa_flags & SA_ONESHOT) - ka->sa.sa_handler = SIG_DFL; + setup_rt_frame(sig, ka_copy, info, oldset, regs); - if (!(ka->sa.sa_flags & SA_NODEFER)) { + if (!(ka_copy->sa.sa_flags & SA_NODEFER)) { spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); + sigorsets(¤t->blocked,¤t->blocked, + &ka_copy->sa.sa_mask); sigaddset(¤t->blocked,sig); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); @@ -402,6 +400,7 @@ handle_signal(unsigned long sig, siginfo */ int do_signal(struct pt_regs *regs, sigset_t *oldset) { + struct k_sigaction ka_copy; siginfo_t info; int signr; @@ -423,7 +422,7 @@ int do_signal(struct pt_regs *regs, sigs if (!oldset) oldset = ¤t->blocked; - signr = get_signal_to_deliver(&info, regs, NULL); + signr = get_signal_to_deliver(&info, &ka_copy, regs, NULL); if (signr > 0) { /* Reenable any watchpoints before delivering the * signal to user space. The processor register will @@ -434,7 +433,7 @@ int do_signal(struct pt_regs *regs, sigs asm volatile("movq %0,%%db7" : : "r" (current->thread.debugreg7)); /* Whee! Actually deliver the signal. */ - handle_signal(signr, &info, oldset, regs); + handle_signal(signr, &info, &ka_copy, oldset, regs); return 1; } --- linux-2.6.5-rc3/arch/x86_64/kernel/smpboot.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/kernel/smpboot.c 2004-03-30 23:39:44.000000000 -0800 @@ -60,8 +60,6 @@ char phys_proc_id[NR_CPUS]; /* Package I /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map; -/* which CPU (physical APIC ID) maps to which logical CPU number */ -volatile char x86_apicid_to_cpu[NR_CPUS]; /* which logical CPU number maps to which CPU (physical APIC ID) */ volatile char x86_cpu_to_apicid[NR_CPUS]; @@ -580,8 +578,6 @@ static void __init do_boot_cpu (int apic panic("failed fork for CPU %d", cpu); wake_up_forked_process(idle); x86_cpu_to_apicid[cpu] = apicid; - x86_apicid_to_cpu[apicid] = cpu; - /* * We remove it from the pidhash and the runqueue @@ -735,7 +731,7 @@ static void smp_tune_scheduling (void) static void __init smp_boot_cpus(unsigned int max_cpus) { - unsigned apicid, cpu; + unsigned apicid, cpu, bit, kicked; nmi_watchdog_default(); @@ -820,11 +816,13 @@ static void __init smp_boot_cpus(unsigne */ Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map)); - for (apicid = 0; apicid < NR_CPUS; apicid++) { + kicked = 1; + for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) { + apicid = cpu_present_to_apicid(bit); /* * Don't even attempt to start the boot CPU! */ - if (apicid == boot_cpu_id) + if (apicid == boot_cpu_id || (apicid == BAD_APICID)) continue; if (!cpu_isset(apicid, phys_cpu_present_map)) @@ -833,6 +831,7 @@ static void __init smp_boot_cpus(unsigne continue; do_boot_cpu(apicid); + ++kicked; } /* --- linux-2.6.5-rc3/arch/x86_64/kernel/smp.c 2003-11-23 19:03:00.000000000 -0800 +++ 25/arch/x86_64/kernel/smp.c 2004-03-30 23:39:44.000000000 -0800 @@ -362,6 +362,18 @@ void smp_send_reschedule(int cpu) send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } +#ifdef CONFIG_KGDB +/* + * By using the NMI code instead of a vector we just sneak thru the + * word generator coming out with just what we want. AND it does + * not matter if clustered_apic_mode is set or not. + */ +void smp_send_nmi_allbutself(void) +{ + send_IPI_allbutself(APIC_DM_NMI); +} +#endif + /* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. --- linux-2.6.5-rc3/arch/x86_64/kernel/traps.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/kernel/traps.c 2004-03-30 23:39:44.000000000 -0800 @@ -45,6 +45,9 @@ #include #include +#ifdef CONFIG_KGDB +#include +#endif extern struct gate_struct idt_table[256]; --- linux-2.6.5-rc3/arch/x86_64/kernel/vmlinux.lds.S 2004-02-03 20:42:35.000000000 -0800 +++ 25/arch/x86_64/kernel/vmlinux.lds.S 2004-03-31 00:55:05.720924616 -0800 @@ -89,6 +89,9 @@ SECTIONS __setup_start = .; .init.setup : { *(.init.setup) } __setup_end = .; + __early_begin = .; + __early_param : { *(__early_param) } + __early_end = .; __start___param = .; __param : { *(__param) } __stop___param = .; --- linux-2.6.5-rc3/arch/x86_64/kernel/x8664_ksyms.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/kernel/x8664_ksyms.c 2004-03-30 23:39:44.000000000 -0800 @@ -98,8 +98,10 @@ EXPORT_SYMBOL(copy_to_user); EXPORT_SYMBOL(copy_in_user); EXPORT_SYMBOL(strnlen_user); +#ifdef CONFIG_PCI EXPORT_SYMBOL(pci_alloc_consistent); EXPORT_SYMBOL(pci_free_consistent); +#endif #ifdef CONFIG_PCI EXPORT_SYMBOL(pcibios_penalize_isa_irq); --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/arch/x86_64/lib/kgdb_serial.c 2004-03-30 23:39:44.000000000 -0800 @@ -0,0 +1,490 @@ +/* + * Serial interface GDB stub + * + * Written (hacked together) by David Grothe (dave@gcom.com) + * Modified to allow invokation early in boot see also + * kgdb.h for instructions by George Anzinger(george@mvista.com) + * Modified to handle debugging over ethernet by Robert Walsh + * and wangdi , based on + * code by San Mehat. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_KGDB_USER_CONSOLE +extern void kgdb_console_finit(void); +#endif +#define PRNT_off +#define TEST_EXISTANCE +#ifdef PRNT +#define dbprintk(s) printk s +#else +#define dbprintk(s) +#endif +#define TEST_INTERRUPT_off +#ifdef TEST_INTERRUPT +#define intprintk(s) printk s +#else +#define intprintk(s) +#endif + +#define IRQ_T(info) ((info->flags & ASYNC_SHARE_IRQ) ? SA_SHIRQ : SA_INTERRUPT) + +#define GDB_BUF_SIZE 512 /* power of 2, please */ + +static char gdb_buf[GDB_BUF_SIZE]; +static int gdb_buf_in_inx; +static atomic_t gdb_buf_in_cnt; +static int gdb_buf_out_inx; + +struct async_struct *gdb_async_info; +static int gdb_async_irq; + +#define outb_px(a,b) outb_p(b,a) + +static void program_uart(struct async_struct *info); +static void write_char(struct async_struct *info, int chr); +/* + * Get a byte from the hardware data buffer and return it + */ +static int +read_data_bfr(struct async_struct *info) +{ + char it = inb_p(info->port + UART_LSR); + + if (it & UART_LSR_DR) + return (inb_p(info->port + UART_RX)); + /* + * If we have a framing error assume somebody messed with + * our uart. Reprogram it and send '-' both ways... + */ + if (it & 0xc) { + program_uart(info); + write_char(info, '-'); + return ('-'); + } + return (-1); + +} /* read_data_bfr */ + +/* + * Get a char if available, return -1 if nothing available. + * Empty the receive buffer first, then look at the interface hardware. + + * Locking here is a bit of a problem. We MUST not lock out communication + * if we are trying to talk to gdb about a kgdb entry. ON the other hand + * we can loose chars in the console pass thru if we don't lock. It is also + * possible that we could hold the lock or be waiting for it when kgdb + * NEEDS to talk. Since kgdb locks down the world, it does not need locks. + * We do, of course have possible issues with interrupting a uart operation, + * but we will just depend on the uart status to help keep that straight. + + */ +static spinlock_t uart_interrupt_lock = SPIN_LOCK_UNLOCKED; +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +#endif + +static int +read_char(struct async_struct *info) +{ + int chr; + unsigned long flags; + local_irq_save(flags); +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_lock(&uart_interrupt_lock); + } +#endif + if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ + chr = gdb_buf[gdb_buf_out_inx++]; + gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); + atomic_dec(&gdb_buf_in_cnt); + } else { + chr = read_data_bfr(info); + } +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_unlock(&uart_interrupt_lock); + } +#endif + local_irq_restore(flags); + return (chr); +} + +/* + * Wait until the interface can accept a char, then write it. + */ +static void +write_char(struct async_struct *info, int chr) +{ + while (!(inb_p(info->port + UART_LSR) & UART_LSR_THRE)) ; + + outb_p(chr, info->port + UART_TX); + +} /* write_char */ + +/* + * Mostly we don't need a spinlock, but since the console goes + * thru here with interrutps on, well, we need to catch those + * chars. + */ +/* + * This is the receiver interrupt routine for the GDB stub. + * It will receive a limited number of characters of input + * from the gdb host machine and save them up in a buffer. + * + * When the gdb stub routine tty_getDebugChar() is called it + * draws characters out of the buffer until it is empty and + * then reads directly from the serial port. + * + * We do not attempt to write chars from the interrupt routine + * since the stubs do all of that via tty_putDebugChar() which + * writes one byte after waiting for the interface to become + * ready. + * + * The debug stubs like to run with interrupts disabled since, + * after all, they run as a consequence of a breakpoint in + * the kernel. + * + * Perhaps someone who knows more about the tty driver than I + * care to learn can make this work for any low level serial + * driver. + */ +static irqreturn_t +gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct async_struct *info; + unsigned long flags; + + info = gdb_async_info; + if (!info || !info->tty || irq != gdb_async_irq) + return IRQ_NONE; + + local_irq_save(flags); + spin_lock(&uart_interrupt_lock); + do { + int chr = read_data_bfr(info); + intprintk(("Debug char on int: %x hex\n", chr)); + if (chr < 0) + continue; + + if (chr == 3) { /* Ctrl-C means remote interrupt */ + BREAKPOINT; + continue; + } + + if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { + /* buffer overflow tosses early char */ + read_char(info); + } + gdb_buf[gdb_buf_in_inx++] = chr; + gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); + } while (inb_p(info->port + UART_IIR) & UART_IIR_RDI); + spin_unlock(&uart_interrupt_lock); + local_irq_restore(flags); + return IRQ_HANDLED; +} /* gdb_interrupt */ + +/* + * Just a NULL routine for testing. + */ +void +gdb_null(void) +{ +} /* gdb_null */ + +/* These structure are filled in with values defined in asm/kgdb_local.h + */ +static struct serial_state state = SB_STATE; +static struct async_struct local_info = SB_INFO; +static int ok_to_enable_ints = 0; +static void kgdb_enable_ints_now(void); + +extern char *kgdb_version; +/* + * Hook an IRQ for KGDB. + * + * This routine is called from tty_putDebugChar, below. + */ +static int ints_disabled = 1; +int +gdb_hook_interrupt(struct async_struct *info, int verb) +{ + struct serial_state *state = info->state; + unsigned long flags; + int port; +#ifdef TEST_EXISTANCE + int scratch, scratch2; +#endif + + /* The above fails if memory managment is not set up yet. + * Rather than fail the set up, just keep track of the fact + * and pick up the interrupt thing later. + */ + gdb_async_info = info; + port = gdb_async_info->port; + gdb_async_irq = state->irq; + if (verb) { + printk("kgdb %s : port =%x, IRQ=%d, divisor =%d\n", + kgdb_version, + port, + gdb_async_irq, gdb_async_info->state->custom_divisor); + } + local_irq_save(flags); +#ifdef TEST_EXISTANCE + /* Existance test */ + /* Should not need all this, but just in case.... */ + + scratch = inb_p(port + UART_IER); + outb_px(port + UART_IER, 0); + outb_px(0xff, 0x080); + scratch2 = inb_p(port + UART_IER); + outb_px(port + UART_IER, scratch); + if (scratch2) { + printk + ("gdb_hook_interrupt: Could not clear IER, not a UART!\n"); + local_irq_restore(flags); + return 1; /* We failed; there's nothing here */ + } + scratch2 = inb_p(port + UART_LCR); + outb_px(port + UART_LCR, 0xBF); /* set up for StarTech test */ + outb_px(port + UART_EFR, 0); /* EFR is the same as FCR */ + outb_px(port + UART_LCR, 0); + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO); + scratch = inb_p(port + UART_IIR) >> 6; + if (scratch == 1) { + printk("gdb_hook_interrupt: Undefined UART type!" + " Not a UART! \n"); + local_irq_restore(flags); + return 1; + } else { + dbprintk(("gdb_hook_interrupt: UART type " + "is %d where 0=16450, 2=16550 3=16550A\n", scratch)); + } + scratch = inb_p(port + UART_MCR); + outb_px(port + UART_MCR, UART_MCR_LOOP | scratch); + outb_px(port + UART_MCR, UART_MCR_LOOP | 0x0A); + scratch2 = inb_p(port + UART_MSR) & 0xF0; + outb_px(port + UART_MCR, scratch); + if (scratch2 != 0x90) { + printk("gdb_hook_interrupt: " + "Loop back test failed! Not a UART!\n"); + local_irq_restore(flags); + return scratch2 + 1000; /* force 0 to fail */ + } +#endif /* test existance */ + program_uart(info); + local_irq_restore(flags); + + return (0); + +} /* gdb_hook_interrupt */ + +static void +program_uart(struct async_struct *info) +{ + int port = info->port; + + (void) inb_p(port + UART_RX); + outb_px(port + UART_IER, 0); + + (void) inb_p(port + UART_RX); /* serial driver comments say */ + (void) inb_p(port + UART_IIR); /* this clears the interrupt regs */ + (void) inb_p(port + UART_MSR); + outb_px(port + UART_LCR, UART_LCR_WLEN8 | UART_LCR_DLAB); + outb_px(port + UART_DLL, info->state->custom_divisor & 0xff); /* LS */ + outb_px(port + UART_DLM, info->state->custom_divisor >> 8); /* MS */ + outb_px(port + UART_MCR, info->MCR); + + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1 | UART_FCR_CLEAR_XMIT | UART_FCR_CLEAR_RCVR); /* set fcr */ + outb_px(port + UART_LCR, UART_LCR_WLEN8); /* reset DLAB */ + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1); /* set fcr */ + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + return; +} + +/* + * tty_getDebugChar + * + * This is a GDB stub routine. It waits for a character from the + * serial interface and then returns it. If there is no serial + * interface connection then it returns a bogus value which will + * almost certainly cause the system to hang. In the + */ +int kgdb_in_isr = 0; +int kgdb_in_lsr = 0; +extern spinlock_t kgdb_spinlock; + +/* Caller takes needed protections */ + +int +tty_getDebugChar(void) +{ + volatile int chr, dum, time, end_time; + + dbprintk(("tty_getDebugChar(port %x): ", gdb_async_info->port)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + /* + * This trick says if we wait a very long time and get + * no char, return the -1 and let the upper level deal + * with it. + */ + rdtsc(dum, time); + end_time = time + 2; + while (((chr = read_char(gdb_async_info)) == -1) && + (end_time - time) > 0) { + rdtsc(dum, time); + }; + /* + * This covers our butts if some other code messes with + * our uart, hay, it happens :o) + */ + if (chr == -1) + program_uart(gdb_async_info); + + dbprintk(("%c\n", chr > ' ' && chr < 0x7F ? chr : ' ')); + return (chr); + +} /* tty_getDebugChar */ + +static int count = 3; +static spinlock_t one_at_atime = SPIN_LOCK_UNLOCKED; + +static int __init +kgdb_enable_ints(void) +{ + set_debug_traps(); + if (kgdboe) { + return 0; + } + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 1); + } + ok_to_enable_ints = 1; + kgdb_enable_ints_now(); +#ifdef CONFIG_KGDB_USER_CONSOLE + kgdb_console_finit(); +#endif + return 0; +} + +#ifdef CONFIG_SERIAL_8250 +void shutdown_for_kgdb(struct async_struct *gdb_async_info); +#endif + +#define kgdb_mem_init_done() (1) + +static void +kgdb_enable_ints_now(void) +{ + if (!spin_trylock(&one_at_atime)) + return; + if (!ints_disabled) + goto exit; + if (kgdb_mem_init_done() && + ints_disabled) { /* don't try till mem init */ +#ifdef CONFIG_SERIAL_8250 + /* + * The ifdef here allows the system to be configured + * without the serial driver. + * Don't make it a module, however, it will steal the port + */ + shutdown_for_kgdb(gdb_async_info); +#endif + ints_disabled = request_irq(gdb_async_info->state->irq, + gdb_interrupt, + IRQ_T(gdb_async_info), + "KGDB-stub", NULL); + intprintk(("KGDB: request_irq returned %d\n", ints_disabled)); + } + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + exit: + spin_unlock(&one_at_atime); +} + +/* + * tty_putDebugChar + * + * This is a GDB stub routine. It waits until the interface is ready + * to transmit a char and then sends it. If there is no serial + * interface connection then it simply returns to its caller, having + * pretended to send the char. Caller takes needed protections. + */ +void +tty_putDebugChar(int chr) +{ + dbprintk(("tty_putDebugChar(port %x): chr=%02x '%c', ints_on=%d\n", + gdb_async_info->port, + chr, + chr > ' ' && chr < 0x7F ? chr : ' ', ints_disabled ? 0 : 1)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + + write_char(gdb_async_info, chr); /* this routine will wait */ + count = (chr == '#') ? 0 : count + 1; + if ((count == 2)) { /* try to enable after */ + if (ints_disabled & ok_to_enable_ints) + kgdb_enable_ints_now(); /* try to enable after */ + + /* We do this a lot because, well we really want to get these + * interrupts. The serial driver will clear these bits when it + * initializes the chip. Every thing else it does is ok, + * but this. + */ + if (!ints_disabled) { + outb_px(gdb_async_info->port + UART_IER, + gdb_async_info->IER); + } + } + +} /* tty_putDebugChar */ + +/* + * This does nothing for the serial port, since it doesn't buffer. + */ + +void tty_flushDebugChar(void) +{ +} + +module_init(kgdb_enable_ints); --- linux-2.6.5-rc3/arch/x86_64/lib/Makefile 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/lib/Makefile 2004-03-30 23:39:44.000000000 -0800 @@ -10,3 +10,4 @@ lib-y := csum-partial.o csum-copy.o csum lib-y += memcpy.o memmove.o memset.o copy_user.o lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o +lib-$(CONFIG_KGDB) += kgdb_serial.o --- linux-2.6.5-rc3/arch/x86_64/Makefile 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/Makefile 2004-03-30 23:39:44.000000000 -0800 @@ -38,7 +38,7 @@ OBJCOPYFLAGS := -O binary -R .note -R .c LDFLAGS_vmlinux := -e stext cflags-$(CONFIG_MK8) += $(call check_gcc,-march=k8,) -cflags-$(CONFIG_MPSC) += $(call check_gcc,-march=prescott,) +cflags-$(CONFIG_MPSC) += $(call check_gcc,-march=nocona,) CFLAGS += $(cflags-y) CFLAGS += -mno-red-zone --- linux-2.6.5-rc3/arch/x86_64/mm/init.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/mm/init.c 2004-03-30 23:39:44.000000000 -0800 @@ -603,8 +603,6 @@ struct vm_area_struct *get_gate_vma(stru int in_gate_area(struct task_struct *task, unsigned long addr) { - struct vm_area_struct *vma = &gate_vma; - if (test_tsk_thread_flag(task, TIF_IA32)) - vma = &gate32_vma; + struct vm_area_struct *vma = get_gate_vma(task); return (addr >= vma->vm_start) && (addr < vma->vm_end); } --- linux-2.6.5-rc3/arch/x86_64/mm/numa.c 2004-03-29 22:08:28.000000000 -0800 +++ 25/arch/x86_64/mm/numa.c 2004-03-31 00:56:18.878802928 -0800 @@ -178,11 +178,13 @@ void __init paging_init(void) } /* [numa=off] */ -__init int numa_setup(char *opt) +static __init int numa_setup(char *opt) { - if (!strncmp(opt,"off",3)) + if (!strcmp(opt,"off")) numa_off = 1; - return 1; + else + return -1; + return 0; } - +__early_param("numa=", numa_setup); --- linux-2.6.5-rc3/CREDITS 2004-03-29 22:08:27.000000000 -0800 +++ 25/CREDITS 2004-03-31 00:54:53.691753328 -0800 @@ -289,6 +289,15 @@ S: Via Delle Palme, 9 S: Terni 05100 S: Italy +N: Krzysztof Benedyczak +E: golbi@mat.uni.torun.pl +W: http://www.mat.uni.torun.pl/~golbi +D: POSIX message queues fs (with M. Wronski) +S: ul. Podmiejska 52 +S: Radunica +S: 83-000 Pruszcz Gdanski +S: Poland + N: Randolph Bentson E: bentson@grieg.seaslug.org W: http://www.aa.net/~bentson/ @@ -2258,7 +2267,7 @@ S: D-37083 Goettingen S: Germany N: Thomas Molina -E: tmolina@cox.net +E: tmolina@cablespeed.com D: bug fixes, documentation, minor hackery N: James Morris @@ -3485,6 +3494,14 @@ S: 12725 SW Millikan Way, Suite 400 S: Beaverton, OR 97005 S: USA +N: Michal Wronski +E: wrona@mat.uni.torun.pl +W: http://www.mat.uni.torun.pl/~wrona +D: POSIX message queues fs (with K. Benedyczak) +S: ul. Teczowa 23/12 +S: 80-680 Gdansk-Sobieszewo +S: Poland + N: Frank Xia E: qx@math.columbia.edu D: Xiafs filesystem [defunct] --- linux-2.6.5-rc3/Documentation/binfmt_misc.txt 2003-10-08 15:07:08.000000000 -0700 +++ 25/Documentation/binfmt_misc.txt 2004-03-31 00:54:30.442287784 -0800 @@ -15,7 +15,7 @@ First you must mount binfmt_misc: mount binfmt_misc -t binfmt_misc /proc/sys/fs/binfmt_misc To actually register a new binary type, you have to set up a string looking like -:name:type:offset:magic:mask:interpreter: (where you can choose the ':' upon +:name:type:offset:magic:mask:interpreter:flags (where you can choose the ':' upon your needs) and echo it to /proc/sys/fs/binfmt_misc/register. Here is what the fields mean: - 'name' is an identifier string. A new /proc file will be created with this @@ -34,6 +34,28 @@ Here is what the fields mean: The mask is anded with the byte sequence of the file. - 'interpreter' is the program that should be invoked with the binary as first argument (specify the full path) + - 'flags' is an optional field that controls several aspects of the invocation + of the interpreter. It is a string of capital letters, each controls a certain + aspect. The following flags are supported - + 'P' - preserve-argv[0]. Legacy behavior of binfmt_misc is to overwrite the + original argv[0] with the full path to the binary. When this flag is + included, binfmt_misc will add an argument to the argument vector for + this purpose, thus preserving the original argv[0]. + 'O' - open-binary. Legacy behavior of binfmt_misc is to pass the full path + of the binary to the interpreter as an argument. When this flag is + included, binfmt_misc will open the file for reading and pass its + descriptor as an argument, instead of the full path, thus allowing + the interpreter to execute non-readable binaries. This feature should + be used with care - the interpreter has to be trusted not to emit + the contents of the non-readable binary. + 'C' - credentials. Currently, the behavior of binfmt_misc is to calculate + the credentials and security token of the new process according to + the interpreter. When this flag is included, these attributes are + calculated according to the binary. It also implies the 'O' flag. + This feature should be used with care as the interpreter + will run with root permissions when a setuid binary owned by root + is run with binfmt_misc. + There are some restrictions: - the whole register string may not exceed 255 characters @@ -83,9 +105,9 @@ If you want to pass special arguments to write a wrapper script for it. See Documentation/java.txt for an example. -Your interpreter should NOT look in the PATH for the filename; the -kernel passes it the full filename to use. Using the PATH can cause -unexpected behaviour and be a security hazard. +Your interpreter should NOT look in the PATH for the filename; the kernel +passes it the full filename (or the file descriptor) to use. Using $PATH can +cause unexpected behaviour and can be a security hazard. There is a web page about binfmt_misc at --- linux-2.6.5-rc3/Documentation/Changes 2004-03-10 20:41:24.000000000 -0800 +++ 25/Documentation/Changes 2004-03-31 00:54:54.566620328 -0800 @@ -112,8 +112,8 @@ System utilities Architectural changes --------------------- -DevFS is now in the kernel. See Documentation/filesystems/devfs/* in -the kernel source tree for all the gory details. +DevFS has been obsoleted in favour of udev +(http://www.kernel.org/pub/linux/utils/kernel/hotplug/) 32-bit UID support is now in place. Have fun! @@ -400,8 +400,3 @@ NFS-Utils o -Suggestions and corrections -=========================== - -Please feel free to submit changes, corrections, gripes, flames, -money, etc. to me . Happy Linuxing! --- linux-2.6.5-rc3/Documentation/CodingStyle 2004-03-10 20:41:24.000000000 -0800 +++ 25/Documentation/CodingStyle 2004-03-31 00:54:52.125991360 -0800 @@ -254,6 +254,8 @@ values. To do the latter, you can stick (interactive) (c-mode) (c-set-style "K&R") + (setq tab-width 8) + (setq indent-tabs-mode t) (setq c-basic-offset 8)) This will define the M-x linux-c-mode command. When hacking on a --- linux-2.6.5-rc3/Documentation/DocBook/kernel-hacking.tmpl 2003-09-27 18:57:43.000000000 -0700 +++ 25/Documentation/DocBook/kernel-hacking.tmpl 2004-03-31 00:54:52.450941960 -0800 @@ -204,11 +204,11 @@ - include/linux/interrupt.h lists the + include/linux/interrupt.h lists the different BH's. No matter how many CPUs you have, no two BHs will run at the same time. This made the transition to SMP simpler, but sucks hard for scalable performance. A very important bottom half is the timer - BH (include/linux/timer.h): you + BH (include/linux/timer.h): you can register to have it call functions for you in a given length of time. @@ -224,7 +224,7 @@ - tasklets (include/linux/interrupt.h) + tasklets (include/linux/interrupt.h) are like softirqs, except they are dynamically-registrable (meaning you can have as many as you want), and they also guarantee that any tasklet will only run on one CPU at any time, although different tasklets can @@ -241,7 +241,7 @@ You can tell you are in a softirq (or bottom half, or tasklet) using the in_softirq() macro - (include/asm/hardirq.h). + (include/asm/hardirq.h). @@ -330,7 +330,7 @@ asmlinkage long sys_mycall(int arg) You create a character device and implement an appropriate ioctl for it. This is much more flexible than system calls, doesn't have to be entered in every architecture's - include/asm/unistd.h and + include/asm/unistd.h and arch/kernel/entry.S file, and is much more likely to be accepted by Linus. @@ -343,7 +343,7 @@ asmlinkage long sys_mycall(int arg) Inside the ioctl you're in user context to a process. When a error occurs you return a negated errno (see - include/linux/errno.h), + include/linux/errno.h), otherwise you return 0. @@ -429,7 +429,7 @@ cond_resched(); /* Will sleep */ <function>printk()</function> - <filename class=headerfile>include/linux/kernel.h</filename> + <filename class="headerfile">include/linux/kernel.h</filename> @@ -447,7 +447,7 @@ printk(KERN_INFO "i = %u\n", i); - See include/linux/kernel.h; + See include/linux/kernel.h; for other KERN_ values; these are interpreted by syslog as the level. Special case: for printing an IP address use @@ -487,7 +487,7 @@ printk(KERN_INFO "my ip: %d.%d.%d.%d\n", get_user() / put_user() - include/asm/uaccess.h + include/asm/uaccess.h @@ -525,7 +525,7 @@ printk(KERN_INFO "my ip: %d.%d.%d.%d\n", <function>kmalloc()</function>/<function>kfree()</function> - <filename class=headerfile>include/linux/slab.h</filename> + include/linux/slab.h [MAY SLEEP: SEE BELOW] @@ -593,10 +593,10 @@ printk(KERN_INFO "my ip: %d.%d.%d.%d\n", If you are allocating at least PAGE_SIZE - (include/asm/page.h) bytes, + (include/asm/page.h) bytes, consider using __get_free_pages() - (include/linux/mm.h). It + (include/linux/mm.h). It takes an order argument (0 for page sized, 1 for double page, 2 for four pages etc.) and the same memory priority flag word as above. @@ -619,13 +619,13 @@ printk(KERN_INFO "my ip: %d.%d.%d.%d\n", Before inventing your own cache of often-used objects consider using a slab cache in - include/linux/slab.h + include/linux/slab.h <function>current</function> - <filename class=headerfile>include/asm/current.h</filename> + include/asm/current.h This global variable (really a macro) contains a pointer to @@ -638,8 +638,8 @@ printk(KERN_INFO "my ip: %d.%d.%d.%d\n", <function>udelay()</function>/<function>mdelay()</function> - <filename class=headerfile>include/asm/delay.h</filename> - <filename class=headerfile>include/linux/delay.h</filename> + <filename class="headerfile">include/asm/delay.h</filename> + <filename class="headerfile">include/linux/delay.h</filename> @@ -652,7 +652,7 @@ printk(KERN_INFO "my ip: %d.%d.%d.%d\n", <function>cpu_to_be32()</function>/<function>be32_to_cpu()</function>/<function>cpu_to_le32()</function>/<function>le32_to_cpu()</function> - <filename class=headerfile>include/asm/byteorder.h</filename> + <filename class="headerfile">include/asm/byteorder.h</filename> @@ -675,7 +675,7 @@ printk(KERN_INFO "my ip: %d.%d.%d.%d\n", <function>local_irq_save()</function>/<function>local_irq_restore()</function> - <filename class=headerfile>include/asm/system.h</filename> + <filename class="headerfile">include/asm/system.h</filename> @@ -690,7 +690,7 @@ printk(KERN_INFO "my ip: %d.%d.%d.%d\n", <function>local_bh_disable()</function>/<function>local_bh_enable()</function> - <filename class=headerfile>include/linux/interrupt.h</filename> + include/linux/interrupt.h These routines disable soft interrupts on the local CPU, and @@ -703,7 +703,7 @@ printk(KERN_INFO "my ip: %d.%d.%d.%d\n", <function>smp_processor_id</function>() - <filename class=headerfile>include/asm/smp.h</filename> + include/asm/smp.h smp_processor_id() returns the current @@ -715,7 +715,7 @@ printk(KERN_INFO "my ip: %d.%d.%d.%d\n", <type>__init</type>/<type>__exit</type>/<type>__initdata</type> - <filename class=headerfile>include/linux/init.h</filename> + include/linux/init.h After boot, the kernel frees up a special section; functions @@ -738,7 +738,7 @@ printk(KERN_INFO "my ip: %d.%d.%d.%d\n", <function>__initcall()</function>/<function>module_init()</function> - <filename class=headerfile>include/linux/init.h</filename> + include/linux/init.h Many parts of the kernel are well served as a module (dynamically-loadable parts of the kernel). Using the @@ -768,7 +768,7 @@ printk(KERN_INFO "my ip: %d.%d.%d.%d\n", <function>module_exit()</function> - <filename class=headerfile>include/linux/init.h</filename> + include/linux/init.h This macro defines the function to be called at module removal @@ -781,7 +781,7 @@ printk(KERN_INFO "my ip: %d.%d.%d.%d\n", <function>MOD_INC_USE_COUNT</function>/<function>MOD_DEC_USE_COUNT</function> - <filename class=headerfile>include/linux/module.h</filename> + include/linux/module.h These manipulate the module usage count, to protect against @@ -839,7 +839,7 @@ foo_open (...) Wait Queues - <filename class=headerfile>include/linux/wait.h</filename> + <filename class="headerfile">include/linux/wait.h</filename> [SLEEPS] @@ -874,7 +874,7 @@ foo_open (...) There is a macro to do this: wait_event_interruptible() - include/linux/sched.h The + include/linux/sched.h The first argument is the wait queue head, and the second is an expression which is evaluated; the macro returns 0 when this expression is true, or @@ -900,7 +900,7 @@ foo_open (...) Call wake_up() - include/linux/sched.h;, + include/linux/sched.h;, which will wake up every process in the queue. The exception is if one has TASK_EXCLUSIVE set, in which case the remainder of the queue will not be woken. @@ -915,7 +915,7 @@ foo_open (...) Certain operations are guaranteed atomic on all platforms. The first class of operations work on atomic_t - include/asm/atomic.h; this + include/asm/atomic.h; this contains a signed integer (at least 24 bits long), and you must use these functions to manipulate or read atomic_t variables. atomic_read() and @@ -943,7 +943,7 @@ foo_open (...) The second class of atomic operations is atomic bit operations on a long, defined in - include/asm/bitops.h. These + include/asm/bitops.h. These operations generally take a pointer to the bit pattern, and a bit number: 0 is the least significant bit. set_bit(), clear_bit() @@ -982,7 +982,7 @@ foo_open (...) <function>EXPORT_SYMBOL()</function> - <filename class=headerfile>include/linux/module.h</filename> + include/linux/module.h This is the classic method of exporting a symbol, and it works @@ -995,7 +995,7 @@ foo_open (...) <function>EXPORT_SYMBOL_GPL()</function> - <filename class=headerfile>include/linux/module.h</filename> + include/linux/module.h Similar to EXPORT_SYMBOL() except that the @@ -1012,7 +1012,7 @@ foo_open (...) Double-linked lists - <filename class=headerfile>include/linux/list.h</filename> + include/linux/list.h There are three sets of linked-list routines in the kernel @@ -1039,7 +1039,7 @@ foo_open (...) The filesystem code uses ERR_PTR() - include/linux/fs.h; to + include/linux/fs.h; to encode a negative error number into a pointer, and IS_ERR() and PTR_ERR() to get it back out again: avoids a separate pointer parameter for --- linux-2.6.5-rc3/Documentation/DocBook/kernel-locking.tmpl 2004-02-17 20:48:41.000000000 -0800 +++ 25/Documentation/DocBook/kernel-locking.tmpl 2004-03-31 00:54:52.454941352 -0800 @@ -87,7 +87,7 @@ Expected Results - + @@ -133,7 +133,7 @@
Possible Results - + Instance 1 @@ -222,14 +222,14 @@ There are two main types of kernel locks. The fundamental type is the spinlock - (include/asm/spinlock.h), + (include/asm/spinlock.h), which is a very simple single-holder lock: if you can't get the spinlock, you keep trying (spinning) until you can. Spinlocks are very small and fast, and can be used anywhere. The second type is a semaphore - (include/asm/semaphore.h): it + (include/asm/semaphore.h): it can have more than one holder at any time (the number decided at initialization time), although it is most commonly used as a single-holder lock (a mutex). If you can't get a semaphore, @@ -315,7 +315,7 @@ user context can be interrupted by a softirq, and secondly, the critical region could be entered from another CPU. This is where spin_lock_bh() - (include/linux/spinlock.h) is + (include/linux/spinlock.h) is used. It disables softirqs on that CPU, then grabs the lock. spin_unlock_bh() does the reverse. (The '_bh' suffix is a historical reference to "Bottom Halves", the @@ -333,7 +333,7 @@ This works perfectly for UP as well: the spin lock vanishes, and this macro simply becomes local_bh_disable() - (include/linux/interrupt.h), which + (include/linux/interrupt.h), which protects you from the softirq being run. @@ -463,7 +463,7 @@ This works perfectly for UP as well: the spin lock vanishes, and this macro simply becomes local_irq_disable() - (include/asm/smp.h), which + (include/asm/smp.h), which protects you from the softirq/tasklet/BH being run. @@ -545,7 +545,7 @@
Table of Locking Requirements - + @@ -1026,7 +1026,7 @@ In practice, atomic_t would refcnt. There are a number of atomic operations defined in -include/asm/atomic.h: these are +include/asm/atomic.h: these are guaranteed to be seen atomically from all CPUs in the system, so no lock is required. In this case, it is simpler than using spinlocks, although for anything non-trivial using spinlocks is clearer. The @@ -1296,7 +1296,7 @@ as Alan Cox says, Lock data, not
Consequences - + @@ -1437,7 +1437,7 @@ as Alan Cox says, Lock data, not themselves (by calling add_timer() at the end of their timer function). Because this is a fairly common case which is prone to races, you should use del_timer_sync() - (include/linux/timer.h) + (include/linux/timer.h) to handle this case. It returns the number of times the timer had to be deleted before we finally stopped it from adding itself back in. @@ -1749,7 +1749,7 @@ machines due to caching. an exclusive lock. See DEFINE_PER_CPU(), get_cpu_var() and put_cpu_var() - (include/linux/percpu.h). + (include/linux/percpu.h). @@ -1757,7 +1757,7 @@ machines due to caching. local_t type, and the cpu_local_inc() and related functions, which are more efficient than simple code on some architectures - (include/asm/local.h). + (include/asm/local.h). --- linux-2.6.5-rc3/Documentation/DocBook/lsm.tmpl 2003-06-14 12:18:28.000000000 -0700 +++ 25/Documentation/DocBook/lsm.tmpl 2004-03-31 00:54:52.636913688 -0800 @@ -27,7 +27,7 @@
cvance@nai.com
- Introduction --- linux-2.6.5-rc3/Documentation/DocBook/mousedrivers.tmpl 2003-06-14 12:18:29.000000000 -0700 +++ 25/Documentation/DocBook/mousedrivers.tmpl 2004-03-31 00:54:52.455941200 -0800 @@ -88,9 +88,9 @@ Each read from a bus mouse interface device returns a block of data. The first three bytes of each read are defined as follows: -
+
Mouse Data Encoding - + Byte 0 --- linux-2.6.5-rc3/Documentation/DocBook/parportbook.tmpl 2004-03-29 22:08:27.000000000 -0800 +++ 25/Documentation/DocBook/parportbook.tmpl 2004-03-31 00:54:52.457940896 -0800 @@ -969,7 +969,7 @@ port->ops->write_data (port, d); To recap, then: - + @@ -1387,7 +1387,7 @@ struct parport_operations { /dev/parport0) allows you to: - + @@ -1614,7 +1614,7 @@ struct parport_operations { incluce/linux/parport.h and include: - + IEEE1284_MODE_COMPAT @@ -1713,7 +1713,7 @@ struct parport_operations { affect future I/O operations. Available flags are: - + PP_FASTWRITE @@ -1759,7 +1759,7 @@ struct parport_operations { include/linux/parport.h: - + PARPORT_CONTROL_STROBE --- linux-2.6.5-rc3/Documentation/DocBook/sis900.tmpl 2003-08-08 22:55:10.000000000 -0700 +++ 25/Documentation/DocBook/sis900.tmpl 2004-03-31 00:54:52.458940744 -0800 @@ -336,7 +336,7 @@ Those packages are listed in Document/Ch distribution. If you have to install the driver other than those bundled in kernel release, you should have your driver file sis900.c and sis900.h -copied into /usr/src/linux/drivers/net/ first. +copied into /usr/src/linux/drivers/net/ first. There are two alternative ways to install the driver --- linux-2.6.5-rc3/Documentation/DocBook/via-audio.tmpl 2003-06-14 12:18:52.000000000 -0700 +++ 25/Documentation/DocBook/via-audio.tmpl 2004-03-31 00:54:52.459940592 -0800 @@ -227,7 +227,7 @@ Version 1.9.1 - + DSP read/write bugfixes from Thomas Sailer. @@ -252,7 +252,7 @@ Version 1.9.1 Version 1.1.15 - + Support for variable fragment size and variable fragment number (Rui @@ -325,7 +325,7 @@ Version 1.1.15 Version 1.1.14 - + Use VM_RESERVE when available, to eliminate unnecessary page faults. @@ -337,7 +337,7 @@ Version 1.1.14 Version 1.1.12 - + mmap bug fixes from Linus. @@ -349,7 +349,7 @@ Version 1.1.12 Version 1.1.11 - + Many more bug fixes. mmap enabled by default, but may still be buggy. @@ -368,7 +368,7 @@ Version 1.1.11 Version 1.1.10 - + Many bug fixes. mmap enabled by default, but may still be buggy. @@ -380,7 +380,7 @@ Version 1.1.10 Version 1.1.9 - + Redesign and rewrite audio playback implementation. (faster and smaller, hopefully) @@ -478,7 +478,7 @@ Version 1.1.9 Version 1.1.8 - + Clean up interrupt handler output. Fixes the following kernel error message: @@ -501,7 +501,7 @@ Version 1.1.8 Version 1.1.7 - + Fix module unload bug where mixer device left registered @@ -514,7 +514,7 @@ Version 1.1.7 Version 1.1.6 - + Rewrite via_set_rate to mimic ALSA basic AC97 rate setting @@ -546,7 +546,7 @@ Version 1.1.6 Version 1.1.5 - + Disable some overly-verbose debugging code @@ -573,7 +573,7 @@ Version 1.1.5 Version 1.1.4 - + Completed rewrite of driver. Eliminated SoundBlaster compatibility --- linux-2.6.5-rc3/Documentation/DocBook/videobook.tmpl 2003-06-14 12:18:51.000000000 -0700 +++ 25/Documentation/DocBook/videobook.tmpl 2004-03-31 00:54:52.461940288 -0800 @@ -176,8 +176,8 @@ int __init myradio_init(struct video_ini The types available are -
Device Types - +
Device Types + VFL_TYPE_RADIO/dev/radio{n} @@ -299,8 +299,8 @@ static int radio_ioctl(struct video_devi allows the applications to find out what sort of a card they have found and to figure out what they want to do about it. The fields in the structure are -
struct video_capability fields - +
struct video_capability fields + nameThe device text name. This is intended for the user. @@ -373,8 +373,8 @@ static int radio_ioctl(struct video_devi The video_tuner structure has the following fields -
struct video_tuner fields - +
struct video_tuner fields + int tunerThe number of the tuner in question @@ -406,8 +406,8 @@ static int radio_ioctl(struct video_devi
- struct video_tuner flags - +
struct video_tuner flags + VIDEO_TUNER_PALA PAL TV tuner @@ -429,8 +429,8 @@ static int radio_ioctl(struct video_devi
- struct video_tuner modes - +
struct video_tuner modes + VIDEO_MODE_PALPAL Format @@ -580,8 +580,8 @@ static int current_volume=0; Then we fill in the video_audio structure. This has the following format -
struct video_audio fields - +
struct video_audio fields + audioThe input the user wishes to query @@ -615,8 +615,8 @@ static int current_volume=0;
- struct video_audio flags - +
struct video_audio flags + VIDEO_AUDIO_MUTEThe audio is currently muted. We @@ -633,8 +633,8 @@ static int current_volume=0;
- struct video_audio modes - +
struct video_audio modes + VIDEO_SOUND_MONOMono sound @@ -862,8 +862,8 @@ static struct video_device my_camera We use the extra video capability flags that did not apply to the radio interface. The video related flags are -
Capture Capabilities - +
Capture Capabilities + VID_TYPE_CAPTUREWe support image capture @@ -1204,8 +1204,8 @@ static int camera_ioctl(struct video_dev inputs to the video card). Our example card has a single camera input. The fields in the structure are -
struct video_channel fields - +
struct video_channel fields + @@ -1227,8 +1227,8 @@ static int camera_ioctl(struct video_dev
- struct video_channel flags - +
struct video_channel flags + VIDEO_VC_TUNERChannel has a tuner. @@ -1238,8 +1238,8 @@ static int camera_ioctl(struct video_dev
- struct video_channel types - +
struct video_channel types + VIDEO_TYPE_TVTelevision input. @@ -1251,8 +1251,8 @@ static int camera_ioctl(struct video_dev
- struct video_channel norms - +
struct video_channel norms + VIDEO_MODE_PALPAL encoded Television @@ -1337,8 +1337,8 @@ static int camera_ioctl(struct video_dev for every other pixel in the image. The other common formats the interface defines are -
Framebuffer Encodings - +
Framebuffer Encodings + GREYLinear greyscale. This is for simple cameras and the @@ -1475,8 +1475,8 @@ static struct video_buffer capture_fb; display. The video_window structure is used to describe the way the image should be displayed. -
struct video_window fields - +
struct video_window fields + widthThe width in pixels of the desired image. The card @@ -1512,8 +1512,8 @@ static struct video_buffer capture_fb; Each clip is a struct video_clip which has the following fields -
video_clip fields - +
video_clip fields + x, yCo-ordinates relative to the display --- linux-2.6.5-rc3/Documentation/early-userspace/README 2003-08-22 19:23:39.000000000 -0700 +++ 25/Documentation/early-userspace/README 2004-03-31 00:54:19.063017696 -0800 @@ -71,5 +71,31 @@ custom initramfs images that meet your n For questions and help, you can sign up for the early userspace mailing list at http://www.zytor.com/mailman/listinfo/klibc +How does it work? +================= + +The kernel has currently 3 ways to mount the root filesystem: + +a) all required device and filesystem drivers compiled into the kernel, no + initrd. init/main.c:init() will call prepare_namespace() to mount the + final root filesystem, based on the root= option and optional init= to run + some other init binary than listed at the end of init/main.c:init(). + +b) some device and filesystem drivers built as modules and stored in an + initrd. The initrd must contain a binary '/linuxrc' which is supposed to + load these driver modules. It is also possible to mount the final root + filesystem via linuxrc and use the pivot_root syscall. The initrd is + mounted and executed via prepare_namespace(). + +c) using initramfs. The call to prepare_namespace() must be skipped. + This means that a binary must do all the work. Said binary can be stored + into initramfs either via modifying usr/gen_init_cpio.c or via the new + initrd format, an cpio archive. It must be called "/init". This binary + is responsible to do all the things prepare_namespace() would do. + + To remain backwards compatibility, the /init binary will only run if it + comes via an initramfs cpio archive. If this is not the case, + init/main.c:init() will run prepare_namespace() to mount the final root + and exec one of the predefined init binaries. Bryan O'Sullivan --- linux-2.6.5-rc3/Documentation/filesystems/proc.txt 2004-03-10 20:41:24.000000000 -0800 +++ 25/Documentation/filesystems/proc.txt 2004-03-31 00:54:34.273705320 -0800 @@ -38,6 +38,7 @@ Table of Contents 2.8 /proc/sys/net/ipv4 - IPV4 settings 2.9 Appletalk 2.10 IPX + 2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem ------------------------------------------------------------------------------ Preface @@ -1814,6 +1815,30 @@ The /proc/net/ipx_route table holds a gives the destination network, the router node (or Directly) and the network address of the router (or Connected) for internal networks. +2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem +---------------------------------------------------------- + +The "mqueue" filesystem provides the necessary kernel features to enable the +creation of a user space library that implements the POSIX message queues +API (as noted by the MSG tag in the POSIX 1003.1-2001 version of the System +Interfaces specification.) + +The "mqueue" filesystem contains values for determining/setting the amount of +resources used by the file system. + +/proc/sys/fs/mqueue/queues_max is a read/write file for setting/getting the +maximum number of message queues allowed on the system. + +/proc/sys/fs/mqueue/msg_max is a read/write file for setting/getting the +maximum number of messages in a queue value. In fact it is the limiting value +for another (user) limit which is set in mq_open invocation. This attribute of +a queue must be less or equal then msg_max. + +/proc/sys/fs/mqueue/msgsize_max is a read/write file for setting/getting the +maximum message size value (it is every message queue's attribute set during +its creation). + + ------------------------------------------------------------------------------ Summary ------------------------------------------------------------------------------ --- linux-2.6.5-rc3/Documentation/firmware_class/hotplug-script 2003-06-22 12:04:43.000000000 -0700 +++ 25/Documentation/firmware_class/hotplug-script 2004-03-31 00:54:53.093844224 -0800 @@ -6,9 +6,9 @@ HOTPLUG_FW_DIR=/usr/lib/hotplug/firmware/ -echo 1 > /sysfs/$DEVPATH/loading +echo 1 > /sys/$DEVPATH/loading cat $HOTPLUG_FW_DIR/$FIRMWARE > /sysfs/$DEVPATH/data -echo 0 > /sysfs/$DEVPATH/loading +echo 0 > /sys/$DEVPATH/loading # To cancel the load in case of error: # --- linux-2.6.5-rc3/Documentation/firmware_class/README 2003-08-22 19:23:39.000000000 -0700 +++ 25/Documentation/firmware_class/README 2004-03-31 00:54:53.093844224 -0800 @@ -60,9 +60,9 @@ HOTPLUG_FW_DIR=/usr/lib/hotplug/firmware/ - echo 1 > /sysfs/$DEVPATH/loading + echo 1 > /sys/$DEVPATH/loading cat $HOTPLUG_FW_DIR/$FIRMWARE > /sysfs/$DEVPATH/data - echo 0 > /sysfs/$DEVPATH/loading + echo 0 > /sys/$DEVPATH/loading Random notes: ============ --- linux-2.6.5-rc3/Documentation/i2c/porting-clients 2004-01-09 00:04:30.000000000 -0800 +++ 25/Documentation/i2c/porting-clients 2004-03-30 20:21:45.000000000 -0800 @@ -1,4 +1,4 @@ -Revision 3, 2003-10-04 +Revision 4, 2004-03-30 Jean Delvare Greg KH @@ -24,9 +24,10 @@ Technical changes: #include #include #include - #include /* if you need VRM support */ - #include /* if you have I/O operations */ - Some extra headers may be required for a given driver. + #include /* if you need VRM support */ + #include /* if you have I/O operations */ + Please respect this inclusion order. Some extra headers may be + required for a given driver (e.g. "lm75.h"). * [Addresses] SENSORS_I2C_END becomes I2C_CLIENT_END, SENSORS_ISA_END becomes I2C_CLIENT_ISA_END. @@ -37,9 +38,9 @@ Technical changes: names for sysfs files (see the Sysctl section below). * [Function prototypes] The detect functions loses its flags - parameter. Sysctl (e.g. lm75_temp) and miscellaneous (e.g. - swap_bytes) functions are off the list of prototypes. This - usually leaves five prototypes: + parameter. Sysctl (e.g. lm75_temp) and miscellaneous functions + are off the list of prototypes. This usually leaves five + prototypes: static int lm75_attach_adapter(struct i2c_adapter *adapter); static int lm75_detect(struct i2c_adapter *adapter, int address, int kind); @@ -70,13 +71,14 @@ Technical changes: * [Detect] As mentioned earlier, the flags parameter is gone. The type_name and client_name strings are replaced by a single name string, which will be filled with a lowercase, short string - (typically the driver name, e.g. "lm75"). The errorN labels are - reduced to the number needed. If that number is 2 (i2c-only - drivers), it is advised that the labels are named exit and - exit_free. For i2c+isa drivers, labels should be named ERROR0, - ERROR1 and ERROR2. Don't forget to properly set err before - jumping to error labels. By the way, labels should be - left-aligned. + (typically the driver name, e.g. "lm75"). + In i2c-only drivers, drop the i2c_is_isa_adapter check, it's + useless. + The errorN labels are reduced to the number needed. If that number + is 2 (i2c-only drivers), it is advised that the labels are named + exit and exit_free. For i2c+isa drivers, labels should be named + ERROR0, ERROR1 and ERROR2. Don't forget to properly set err before + jumping to error labels. By the way, labels should be left-aligned. Use memset to fill the client and data area with 0x00. Use i2c_set_clientdata to set the client data (as opposed to a direct access to client->data). @@ -85,6 +87,11 @@ Technical changes: device_create_file. Move the driver initialization before any sysfs file creation. +* [Init] Limits must not be set by the driver (can be done later in + user-space). Chip should not be reset default (although a module + parameter may be used to force is), and initialization should be + limited to the strictly necessary steps. + * [Detach] Get rid of data, remove the call to i2c_deregister_entry. @@ -92,7 +99,8 @@ Technical changes: i2c_get_clientdata(client) instead. * [Interface] Init function should not print anything. Make sure - there is a MODULE_LICENSE() line. + there is a MODULE_LICENSE() line, at the bottom of the file + (after MODULE_AUTHOR() and MODULE_DESCRIPTION(), in this order). Coding policy: @@ -102,8 +110,7 @@ Coding policy: can. Calls to printk/pr_debug for debugging purposes are replaced by calls to dev_dbg. Here is an example on how to call it (taken from lm75_detect): - dev_dbg(&adapter->dev, - "lm75_detect called for an ISA bus adapter?!?\n"); + dev_dbg(&client->dev, "Starting lm75 update\n"); Replace other printk calls with the dev_info, dev_err or dev_warn function, as appropriate. @@ -119,3 +126,8 @@ Coding policy: * [Layout] Avoid extra empty lines between comments and what they comment. Respect the coding style (see Documentation/CodingStyle), in particular when it comes to placing curly braces. + +* [Comments] Make sure that no comment refers to a file that isn't + part of the Linux source tree (typically doc/chips/), + and that remaining comments still match the code. Merging comment + lines when possible is encouraged. --- linux-2.6.5-rc3/Documentation/i2c/sysfs-interface 2004-03-29 22:08:27.000000000 -0800 +++ 25/Documentation/i2c/sysfs-interface 2004-03-30 20:21:45.000000000 -0800 @@ -74,18 +74,15 @@ to cause an alarm) is chip-dependent. ************ in[0-8]_min Voltage min value. - Fixed point value in form XXXX. Divide by 1000 to get - Volts. + Unit: millivolt Read/Write in[0-8]_max Voltage max value. - Fixed point value in form XXXX. Divide by 1000 to get - Volts. + Unit: millivolt Read/Write in[0-8]_input Voltage input value. - Fixed point value in form XXXX. Divide by 1000 to get - Volts. + Unit: millivolt Read only Actual voltage depends on the scaling resistors on the motherboard, as recommended in the chip datasheet. @@ -95,10 +92,10 @@ in[0-8]_input Voltage input value. However, some drivers (notably lm87 and via686a) do scale, with various degrees of success. These drivers will output the actual voltage. - First two values are read/write and third is read only. + Typical usage: in0_* CPU #1 voltage (not scaled) - in1_* CPU #1 voltage (not scaled) + in1_* CPU #2 voltage (not scaled) in2_* 3.3V nominal (not scaled) in3_* 5.0V nominal (scaled) in4_* 12.0V nominal (scaled) @@ -108,17 +105,16 @@ in[0-8]_input Voltage input value. in8_* varies in0_ref CPU core reference voltage. + Unit: millivolt Read only. - Fixed point value in form XXXX corresponding to CPU core - voltage as told to the sensor chip. Divide by 1000 to - get Volts. Not always correct. + Not always correct. vrm Voltage Regulator Module version number. Read only. - Two digit number (XX), first is major version, second is + Two digit number, first is major version, second is minor version. - Affects the way the driver calculates the core voltage from - the vid pins. See doc/vid for details. + Affects the way the driver calculates the CPU core reference + voltage from the vid pins. ******** @@ -126,23 +122,23 @@ vrm Voltage Regulator Module version nu ******** fan[1-3]_min Fan minimum value - Integer value indicating RPM + Unit: revolution/min (RPM) Read/Write. fan[1-3]_input Fan input value. - Integer value indicating RPM + Unit: revolution/min (RPM) Read only. fan[1-3]_div Fan divisor. - Integers in powers of two (1,2,4,8,16,32,64,128). - Some chips only support values 1,2,4,8. - See doc/fan-divisors for details. + Integer value in powers of two (1, 2, 4, 8, 16, 32, 64, 128). + Some chips only support values 1, 2, 4 and 8. + Note that this is actually an internal clock divisor, which + affects the measurable speed range, not the read value. fan[1-3]_pwm Pulse width modulation fan control. - Integer 0 - 255 + Integer value in the range 0 to 255 Read/Write 255 is max or 100%. - Corresponds to the fans 1-3. fan[1-3]_pwm_enable Switch PWM on and off. @@ -157,46 +153,46 @@ fan[1-3]_pwm_enable **************** temp[1-3]_type Sensor type selection. - Integers 1,2,3, or thermistor Beta value (3435) + Integers 1, 2, 3 or thermistor Beta value (3435) Read/Write. + 1: PII/Celeron Diode + 2: 3904 transistor + 3: thermal diode + Not all types are supported by all chips temp[1-4]_max Temperature max value. - Fixed point value in form XXXXX and should be divided by - 1000 to get degrees Celsius. + Unit: millidegree Celcius Read/Write value. temp[1-3]_min Temperature min value. - Fixed point value in form XXXXX and should be divided by - 1000 to get degrees Celsius. + Unit: millidegree Celcius Read/Write value. temp[1-3]_max_hyst Temperature hysteresis value for max limit. - Fixed point value in form XXXXX and should be divided by - 1000 to get degrees Celsius. Must be reported as an - absolute temperature, NOT a delta from the max value. + Unit: millidegree Celcius + Must be reported as an absolute temperature, NOT a delta + from the max value. Read/Write value. temp[1-4]_input Temperature input value. - Fixed point value in form XXXXX and should be divided by - 1000 to get degrees Celsius. + Unit: millidegree Celcius Read only value. temp[1-4]_crit Temperature critical value, typically greater than corresponding temp_max values. - Fixed point value in form XXXXX and should be divided by - 1000 to get degrees Celsius. + Unit: millidegree Celcius Read/Write value. temp[1-2]_crit_hyst Temperature hysteresis value for critical limit. - Fixed point value in form XXXXX and should be divided by - 1000 to get degrees Celsius. Must be reported as an - absolute temperature, NOT a delta from the critical value. + Unit: millidegree Celcius + Must be reported as an absolute temperature, NOT a delta + from the critical value. Read/Write value. If there are multiple temperature sensors, temp1_* is - generally the sensor inside the chip itself, generally + generally the sensor inside the chip itself, reported as "motherboard temperature". temp2_* to temp4_* are generally sensors external to the chip itself, for example the thermal diode inside the CPU or @@ -211,15 +207,15 @@ Note that no known chip provides current so this part is theoretical, so to say. curr[1-n]_max Current max value - Fixed point XXXXX, divide by 1000 to get Amps. + Unit: milliampere Read/Write. curr[1-n]_min Current min value. - Fixed point XXXXX, divide by 1000 to get Amps. + Unit: milliampere Read/Write. curr[1-n]_input Current input value - Fixed point XXXXX, divide by 1000 to get Amps. + Unit: milliampere Read only. @@ -246,7 +242,7 @@ beep_enable Beep/interrupt enable beep_mask Bitmask for beep. Same format as 'alarms' with the same bit locations. - Read only. + Read/Write eeprom Raw EEPROM data in binary form. Read only. --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/Documentation/i386/kgdb/andthen 2004-03-30 20:43:37.000000000 -0800 @@ -0,0 +1,100 @@ + +define set_andthen + set var $thp=0 + set var $thp=(struct kgdb_and_then_struct *)&kgdb_data[0] + set var $at_size = (sizeof kgdb_data)/(sizeof *$thp) + set var $at_oc=kgdb_and_then_count + set var $at_cc=$at_oc +end + +define andthen_next + set var $at_cc=$arg0 +end + +define andthen + andthen_set_edge + if ($at_cc >= $at_oc) + printf "Outside window. Window size is %d\n",($at_oc-$at_low) + else + printf "%d: ",$at_cc + output *($thp+($at_cc++ % $at_size )) + printf "\n" + end +end +define andthen_set_edge + set var $at_oc=kgdb_and_then_count + set var $at_low = $at_oc - $at_size + if ($at_low < 0 ) + set var $at_low = 0 + end + if (( $at_cc > $at_oc) || ($at_cc < $at_low)) + printf "Count outside of window, setting count to " + if ($at_cc >= $at_oc) + set var $at_cc = $at_oc + else + set var $at_cc = $at_low + end + printf "%d\n",$at_cc + end +end + +define beforethat + andthen_set_edge + if ($at_cc <= $at_low) + printf "Outside window. Window size is %d\n",($at_oc-$at_low) + else + printf "%d: ",$at_cc-1 + output *($thp+(--$at_cc % $at_size )) + printf "\n" + end +end + +document andthen_next + andthen_next + . sets the number of the event to display next. If this event + . is not in the event pool, either andthen or beforethat will + . correct it to the nearest event pool edge. The event pool + . ends at the last event recorded and begins + . prior to that. If beforethat is used next, it will display + . event -1. +. + andthen commands are: set_andthen, andthen_next, andthen and beforethat +end + + +document andthen + andthen +. displays the next event in the list. sets up to display +. the oldest saved event first. +. (optional) count of the event to display. +. note the number of events saved is specified at configure time. +. if events are saved between calls to andthen the index will change +. but the displayed event will be the next one (unless the event buffer +. is overrun). +. +. andthen commands are: set_andthen, andthen_next, andthen and beforethat +end + +document set_andthen + set_andthen +. sets up to use the and commands. +. if you have defined your own struct, use the above and +. then enter the following: +. p $thp=(struct kgdb_and_then_structX *)&kgdb_data[0] +. where is the name of your structure. +. +. andthen commands are: set_andthen, andthen_next, andthen and beforethat +end + +document beforethat + beforethat +. displays the next prior event in the list. sets up to +. display the last occuring event first. +. +. note the number of events saved is specified at configure time. +. if events are saved between calls to beforethat the index will change +. but the displayed event will be the next one (unless the event buffer +. is overrun). +. +. andthen commands are: set_andthen, andthen_next, andthen and beforethat +end --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/Documentation/i386/kgdb/debug-nmi.txt 2004-03-30 20:43:37.000000000 -0800 @@ -0,0 +1,37 @@ +Subject: Debugging with NMI +Date: Mon, 12 Jul 1999 11:28:31 -0500 +From: David Grothe +Organization: Gcom, Inc +To: David Grothe + +Kernel hackers: + +Maybe this is old hat, but it is new to me -- + +On an ISA bus machine, if you short out the A1 and B1 pins of an ISA +slot you will generate an NMI to the CPU. This interrupts even a +machine that is hung in a loop with interrupts disabled. Used in +conjunction with kgdb < +ftp://ftp.gcom.com/pub/linux/src/kgdb-2.3.35/kgdb-2.3.35.tgz > you can +gain debugger control of a machine that is hung in the kernel! Even +without kgdb the kernel will print a stack trace so you can find out +where it was hung. + +The A1/B1 pins are directly opposite one another and the farthest pins +towards the bracket end of the ISA bus socket. You can stick a paper +clip or multi-meter probe between them to short them out. + +I had a spare ISA bus to PC104 bus adapter around. The PC104 end of the +board consists of two rows of wire wrap pins. So I wired a push button +between the A1/B1 pins and now have an ISA board that I can stick into +any ISA bus slot for debugger entry. + +Microsoft has a circuit diagram of a PCI card at +http://www.microsoft.com/hwdev/DEBUGGING/DMPSW.HTM. If you want to +build one you will have to mail them and ask for the PAL equations. +Nobody makes one comercially. + +[THIS TIP COMES WITH NO WARRANTY WHATSOEVER. It works for me, but if +your machine catches fire, it is your problem, not mine.] + +-- Dave (the kgdb guy) --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/Documentation/i386/kgdb/gdb-globals.txt 2004-03-30 20:43:37.000000000 -0800 @@ -0,0 +1,71 @@ +Sender: akale@veritas.com +Date: Fri, 23 Jun 2000 19:26:35 +0530 +From: "Amit S. Kale" +Organization: Veritas Software (India) +To: Dave Grothe , linux-kernel@vger.rutgers.edu +CC: David Milburn , + "Edouard G. Parmelan" , + ezannoni@cygnus.com, Keith Owens +Subject: Re: Module debugging using kgdb + +Dave Grothe wrote: +> +> Amit: +> +> There is a 2.4.0 version of kgdb on our ftp site: +> ftp://ftp.gcom.com/pub/linux/src/kgdb. I mirrored your version of gdb +> and loadmodule.sh there. +> +> Have a look at the README file and see if I go it right. If not, send +> me some corrections and I will update it. +> +> Does your version of gdb solve the global variable problem? + +Yes. +Thanks to Elena Zanoni, gdb (developement version) can now calculate +correctly addresses of dynamically loaded object files. I have not been +following gdb developement for sometime and am not sure when symbol +address calculation fix is going to appear in a gdb stable version. + +Elena, any idea when the fix will make it to a prebuilt gdb from a +redhat release? + +For the time being I have built a gdb developement version. It can be +used for module debugging with loadmodule.sh script. + +The problem with calculating of module addresses with previous versions +of gdb was as follows: +gdb did not use base address of a section while calculating address of +a symbol in the section in an object file loaded via 'add-symbol-file'. +It used address of .text segment instead. Due to this addresses of +symbols in .data, .bss etc. (e.g. global variables) were calculated incorrectly. + +Above mentioned fix allow gdb to use base address of a segment while +calculating address of a symbol in it. It adds a parameter '-s' to +'add-symbol-file' command for specifying base address of a segment. + +loadmodule.sh script works as follows. + +1. Copy a module file to target machine. +2. Load the module on the target machine using insmod with -m parameter. +insmod produces a module load map which contains base addresses of all +sections in the module and addresses of symbols in the module file. +3. Find all sections and their base addresses in the module from +the module map. +4. Generate a script that loads the module file. The script uses +'add-symbol-file' and specifies address of text segment followed by +addresses of all segments in the module. + +Here is an example gdb script produced by loadmodule.sh script. + +add-symbol-file foo 0xd082c060 -s .text.lock 0xd08cbfb5 +-s .fixup 0xd08cfbdf -s .rodata 0xd08cfde0 -s __ex_table 0xd08e3b38 +-s .data 0xd08e3d00 -s .bss 0xd08ec8c0 -s __ksymtab 0xd08ee838 + +With this command gdb can calculate addresses of symbols in ANY segment +in a module file. + +Regards. +-- +Amit Kale +Veritas Software ( http://www.veritas.com ) --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/Documentation/i386/kgdb/gdbinit 2004-03-30 20:43:37.000000000 -0800 @@ -0,0 +1,14 @@ +shell echo -e "\003" >/dev/ttyS0 +set remotebaud 38400 +target remote /dev/ttyS0 +define si +stepi +printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx +printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp +x/i $eip +end +define ni +nexti +printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx +printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp +x/i $eip --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/Documentation/i386/kgdb/gdbinit.hw 2004-03-30 20:43:37.000000000 -0800 @@ -0,0 +1,117 @@ + +#Using ia-32 hardware breakpoints. +# +#4 hardware breakpoints are available in ia-32 processors. These breakpoints +#do not need code modification. They are set using debug registers. +# +#Each hardware breakpoint can be of one of the +#three types: execution, write, access. +#1. An Execution breakpoint is triggered when code at the breakpoint address is +#executed. +#2. A write breakpoint ( aka watchpoints ) is triggered when memory location +#at the breakpoint address is written. +#3. An access breakpoint is triggered when memory location at the breakpoint +#address is either read or written. +# +#As hardware breakpoints are available in limited number, use software +#breakpoints ( br command in gdb ) instead of execution hardware breakpoints. +# +#Length of an access or a write breakpoint defines length of the datatype to +#be watched. Length is 1 for char, 2 short , 3 int. +# +#For placing execution, write and access breakpoints, use commands +#hwebrk, hwwbrk, hwabrk +#To remove a breakpoint use hwrmbrk command. +# +#These commands take following types of arguments. For arguments associated +#with each command, use help command. +#1. breakpointno: 0 to 3 +#2. length: 1 to 3 +#3. address: Memory location in hex ( without 0x ) e.g c015e9bc +# +#Use the command exinfo to find which hardware breakpoint occured. + +#hwebrk breakpointno address +define hwebrk + maintenance packet Y$arg0,0,0,$arg1 +end +document hwebrk + hwebrk
+ Places a hardware execution breakpoint + = 0 - 3 +
= Hex digits without leading "0x". +end + +#hwwbrk breakpointno length address +define hwwbrk + maintenance packet Y$arg0,1,$arg1,$arg2 +end +document hwwbrk + hwwbrk
+ Places a hardware write breakpoint + = 0 - 3 + = 1 (1 byte), 2 (2 byte), 3 (4 byte) +
= Hex digits without leading "0x". +end + +#hwabrk breakpointno length address +define hwabrk + maintenance packet Y$arg0,1,$arg1,$arg2 +end +document hwabrk + hwabrk
+ Places a hardware access breakpoint + = 0 - 3 + = 1 (1 byte), 2 (2 byte), 3 (4 byte) +
= Hex digits without leading "0x". +end + +#hwrmbrk breakpointno +define hwrmbrk + maintenance packet y$arg0 +end +document hwrmbrk + hwrmbrk + = 0 - 3 + Removes a hardware breakpoint +end + +define reboot + maintenance packet r +end +#exinfo +define exinfo + maintenance packet qE +end +document exinfo + exinfo + Gives information about a breakpoint. +end +define get_th + p $th=(struct thread_info *)((int)$esp & ~8191) +end +document get_th + get_tu + Gets and prints the current thread_info pointer, Defines th to be it. +end +define get_cu + p $cu=((struct thread_info *)((int)$esp & ~8191))->task +end +document get_cu + get_cu + Gets and print the "current" value. Defines $cu to be it. +end +define int_off + set var $flags=$eflags + set $eflags=$eflags&~0x200 + end +define int_on + set var $eflags|=$flags&0x200 + end +document int_off + saves the current interrupt state and clears the processor interrupt + flag. Use int_on to restore the saved flag. +end +document int_on + Restores the interrupt flag saved by int_off. +end --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/Documentation/i386/kgdb/gdbinit-modules 2004-03-30 20:43:37.000000000 -0800 @@ -0,0 +1,146 @@ +# +# Usefull GDB user-command to debug Linux Kernel Modules with gdbstub. +# +# This don't work for Linux-2.0 or older. +# +# Author Edouard G. Parmelan +# +# +# Fri Apr 30 20:33:29 CEST 1999 +# First public release. +# +# Major cleanup after experiment Linux-2.0 kernel without success. +# Symbols of a module are not in the correct order, I can't explain +# why :( +# +# Fri Mar 19 15:41:40 CET 1999 +# Initial version. +# +# Thu Jan 6 16:29:03 CST 2000 +# A little fixing by Dave Grothe +# +# Mon Jun 19 09:33:13 CDT 2000 +# Alignment changes from Edouard Parmelan +# +# The basic idea is to find where insmod load the module and inform +# GDB to load the symbol table of the module with the GDB command +# ``add-symbol-file
''. +# +# The Linux kernel holds the list of all loaded modules in module_list, +# this list end with &kernel_module (exactly with module->next == NULL, +# but the last module is not a real module). +# +# Insmod allocates the struct module before the object file. Since +# Linux-2.1, this structure contain his size. The real address of +# the object file is then (char*)module + module->size_of_struct. +# +# You can use three user functions ``mod-list'', ``mod-print-symbols'' +# and ``add-module-symbols''. +# +# mod-list list all loaded modules with the format: +# +# +# As soon as you have found the address of your module, you can +# print its exported symbols (mod-print-symbols) or inform GDB to add +# symbols from your module file (mod-add-symbols). +# +# The argument that you give to mod-print-symbols or mod-add-symbols +# is the from the mod-list command. +# +# When using the mod-add-symbols command you must also give the full +# pathname of the modules object code file. +# +# The command mod-add-lis is an example of how to make this easier. +# You can edit this macro to contain the path name of your own +# favorite module and then use it as a shorthand to load it. You +# still need the module-address, however. +# +# The internal function ``mod-validate'' set the GDB variable $mod +# as a ``struct module*'' if the kernel known the module otherwise +# $mod is set to NULL. This ensure to not add symbols for a wrong +# address. +# +# Have a nice hacking day ! +# +# +define mod-list + set $mod = (struct module*)module_list + # the last module is the kernel, ignore it + while $mod != &kernel_module + printf "%p\t%s\n", (long)$mod, ($mod)->name + set $mod = $mod->next + end +end +document mod-list +List all modules in the form: +Use the as the argument for the other +mod-commands: mod-print-symbols, mod-add-symbols. +end + +define mod-validate + set $mod = (struct module*)module_list + while ($mod != $arg0) && ($mod != &kernel_module) + set $mod = $mod->next + end + if $mod == &kernel_module + set $mod = 0 + printf "%p is not a module\n", $arg0 + end +end +document mod-validate +mod-validate +Internal user-command used to validate the module parameter. +If is a real loaded module, set $mod to it otherwise set $mod to 0. +end + + +define mod-print-symbols + mod-validate $arg0 + if $mod != 0 + set $i = 0 + while $i < $mod->nsyms + set $sym = $mod->syms[$i] + printf "%p\t%s\n", $sym->value, $sym->name + set $i = $i + 1 + end + end +end +document mod-print-symbols +mod-print-symbols +Print all exported symbols of the module. see mod-list +end + + +define mod-add-symbols-align + mod-validate $arg0 + if $mod != 0 + set $mod_base = ($mod->size_of_struct + (long)$mod) + if ($arg2 != 0) && (($mod_base & ($arg2 - 1)) != 0) + set $mod_base = ($mod_base | ($arg2 - 1)) + 1 + end + add-symbol-file $arg1 $mod_base + end +end +document mod-add-symbols-align +mod-add-symbols-align +Load the symbols table of the module from the object file where +first section aligment is . +To retreive alignment, use `objdump -h '. +end + +define mod-add-symbols + mod-add-symbols-align $arg0 $arg1 sizeof(long) +end +document mod-add-symbols +mod-add-symbols +Load the symbols table of the module from the object file. +Default alignment is 4. See mod-add-symbols-align. +end + +define mod-add-lis + mod-add-symbols-align $arg0 /usr/src/LiS/streams.o 16 +end +document mod-add-lis +mod-add-lis +Does mod-add-symbols /usr/src/LiS/streams.o +end --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/Documentation/i386/kgdb/kgdbeth.txt 2004-03-30 23:39:43.580394472 -0800 @@ -0,0 +1,92 @@ +KGDB over ethernet +================== + +Authors +------- + +Robert Walsh (2.6 port) +wangdi (2.6 port) +Matt Mackall (netpoll api) +San Mehat (original 2.4 code) + + +Introduction +------------ + +KGDB supports debugging over ethernet (kgdboe) via polling of a given +network interface. Most cards should be supported automatically. +Debugging facilities are available as soon as the network driver and +kgdboe have initialized. Unfortunately, this is too late in the boot +process for debugging some issues, but works quite well for many +others. This should not interfere with normal network usage and +doesn't require a dedicated NIC. + +Terminology +----------- + +This document uses the following terms: + + TARGET: the machine being debugged. + HOST: the machine running gdb. + + +Usage +----- + +You need to use the following command-line option on the TARGET kernel: + + kgdboe=[tgt-port]@/[dev],[host-port]@/[host-macaddr] + + where + tgt-port source for UDP packets (defaults to 6443) + tgt-ip source IP to use (interface address) + dev network interface (eth0) + host-port HOST UDP port (6442) (not really used) + host-ip IP address for HOST machine + host-macaddr ethernet MAC address for HOST (ff:ff:ff:ff:ff:ff) + + examples: + + kgdboe=7000@192.168.0.1/eth1,7001@192.168.0.2/00:05:3C:04:47:5D + this machine is 192.168.0.1 on eth1 + remote machine is 192.168.0.2 with MAC address 00:05:3C:04:47:5D + listen for gdb packets on port 7000 + send unsolicited gdb packets to port 7001 + + kgdboe=@192.168.0.1/,@192.168.0.2/ + this machine is 192.168.0.1 on default interface eth0 + remote machine is 192.168.0.2, use default broadcast MAC address + listen for gdb packets on default port 6443 + send unsolicited gdb packets to port 6442 + +Only packets originating from the configured HOST IP address will be +accepted by the debugger. + +On the HOST side, run gdb as normal and use a remote UDP host as the +target: + + % gdb ./vmlinux + GNU gdb Red Hat Linux (5.3post-0.20021129.18rh) + Copyright 2003 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux-gnu"... + (gdb) target remote udp:HOSTNAME:6443 + +You can now continue as if you were debugging over a serial line. + +Limitations +----------- + +The current release of this code is exclusive of using kgdb on a +serial interface, so you must boot without the kgdboe option to use +serial debugging. Trying to debug the network driver while using it +will prove interesting. + +Bug reports +----------- + +Send bug reports to Robert Walsh and Matt +Mackall . --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/Documentation/i386/kgdb/kgdb.txt 2004-03-30 20:43:37.000000000 -0800 @@ -0,0 +1,775 @@ +Last edit: <20030806.1637.12> +This file has information specific to the i386 kgdb option. Other +platforms with the kgdb option may behave in a similar fashion. + +New features: +============ +20030806.1557.37 +This version was made against the 2.6.0-test2 kernel. We have made the +following changes: + +- The getthread() code in the stub calls find_task_by_pid(). It fails + if we are early in the bring up such that the pid arrays have yet to + be allocated. We have added a line to kernel/pid.c to make + "kgdb_pid_init_done" true once the arrays are allocated. This way the + getthread() code knows not to call. This is only used by the thread + debugging stuff and threads will not yet exist at this point in the + boot. + +- For some reason, gdb was not asking for a new thread list when the + "info thread" command was given. We changed to the newer version of + the thread info command and gdb now seems to ask when needed. Result, + we now get all threads in the thread list. + +- We now respond to the ThreadExtraInfo request from gdb with the thread + name from task_struct .comm. This then appears in the thread list. + Thoughts on additional options for this are welcome. Things such as + "has BKL" and "Preempted" come to mind. I think we could have a flag + word that could enable different bits of info here. + +- We now honor, sort of, the C and S commands. These are continue and + single set after delivering a signal. We ignore the signal and do the + requested action. This only happens when we told gdb that a signal + was the reason for entry, which is only done on memory faults. The + result is that you can now continue into the Oops. + +- We changed the -g to -gdwarf-2. This seems to be the same as -ggdb, + but it is more exact on what language to use. + +- We added two dwarf2 include files and a bit of code at the end of + entry.S. This does not yet work, so it is disabled. Still we want to + keep track of the code and "maybe" someone out there can fix it. + +- Randy Dunlap sent some fix ups for this file which are now merged. + +- Hugh Dickins sent a fix to a bit of code in traps.c that prevents a + compiler warning if CONFIG_KGDB is off (now who would do that :). + +- Andrew Morton sent a fix for the serial driver which is now merged. + +- Andrew also sent a change to the stub around the cpu managment code + which is also merged. + +- Andrew also sent a patch to make "f" as well as "g" work as SysRq + commands to enter kgdb, merged. + +- If CONFIG_KGDB and CONFIG_DEBUG_SPINLOCKS are both set we added a + "who" field to the spinlock data struct. This is filled with + "current" when ever the spinlock suceeds. Useful if you want to know + who has the lock. + +_ And last, but not least, we fixed the "get_cu" macro to properly get + the current value of "current". + +New features: +============ +20030505.1827.27 +We are starting to align with the sourceforge version, at least in +commands. To this end, the boot command string to start kgdb at +boot time has been changed from "kgdb" to "gdb". + +Andrew Morton sent a couple of patches which are now included as follows: +1.) We now return a flag to the interrupt handler. +2.) We no longer use smp_num_cpus (a conflict with the lock meter). +3.) And from William Lee Irwin III code to make + sure high-mem is set up before we attempt to register our interrupt + handler. +We now include asm/kgdb.h from config.h so you will most likely never +have to include it. It also 'NULLS' the kgdb macros you might have in +your code when CONFIG_KGDB is not defined. This allows you to just +turn off CONFIG_KGDB to turn off all the kgdb_ts() calls and such. +This include is conditioned on the machine being an x86 so as to not +mess with other archs. + +20020801.1129.03 +This is currently the version for the 2.4.18 (and beyond?) kernel. + +We have several new "features" beginning with this version: + +1.) Kgdb now syncs the "other" CPUs with a cross-CPU NMI. No more + waiting and it will pull that guy out of an IRQ off spin lock :) + +2.) We doctored up the code that tells where a task is waiting and + included it so that the "info thread" command will show a bit more + than "schedule()". Try it... + +3.) Added the ability to call a function from gdb. All the standard gdb + issues apply, i.e. if you hit a breakpoint in the function, you are + not allowed to call another (gdb limitation, not kgdb). To help + this capability we added a memory allocation function. Gdb does not + return this memory (it is used for strings that you pass to that function + you are calling from gdb) so we fixed up a way to allow you to + manually return the memory (see below). + +4.) Kgdb time stamps (kgdb_ts()) are enhanced to expand what was the + interrupt flag to now also include the preemption count and the + "in_interrupt" info. The flag is now called "with_pif" to indicate + the order, preempt_count, in_interrupt, flag. The preempt_count is + shifted left by 4 bits so you can read the count in hex by dropping + the low order digit. In_interrupt is in bit 1, and the flag is in + bit 0. + +5.) The command: "p kgdb_info" is now expanded and prints something + like: +(gdb) p kgdb_info +$2 = {used_malloc = 0, called_from = 0xc0107506, entry_tsc = 67468627259, + errcode = 0, vector = 3, print_debug_info = 0, hold_on_sstep = 1, + cpus_waiting = {{task = 0xc027a000, pid = 32768, hold = 0, + regs = 0xc027bf84}, {task = 0x0, pid = 0, hold = 0, regs = 0x0}}} + + Things to note here: a.) used_malloc is the amount of memory that + has been malloc'ed to do calls from gdb. You can reclaim this + memory like this: "p kgdb_info.used_malloc=0" Cool, huh? b.) + cpus_waiting is now "sized" by the number of CPUs you enter at + configure time in the kgdb configure section. This is NOT used + anywhere else in the system, but it is "nice" here. c.) The task's + "pid" is now in the structure. This is the pid you will need to use + to decode to the thread id to get gdb to look at that thread. + Remember that the "info thread" command prints a list of threads + wherein it numbers each thread with its reference number followed + by the thread's pid. Note that the per-CPU idle threads actually + have pids of 0 (yes, there is more than one pid 0 in an SMP system). + To avoid confusion, kgdb numbers these threads with numbers beyond + the MAX_PID. That is why you see 32768 and above. + +6.) A subtle change, we now provide the complete register set for tasks + that are active on the other CPUs. This allows better trace back on + those tasks. + + And, let's mention what we could not fix. Back-trace from all but the + thread that we trapped will, most likely, have a bogus entry in it. + The problem is that gdb does not recognize the entry code for + functions that use "current" near (at all?) the entry. The compiler + is putting the "current" decode as the first two instructions of the + function where gdb expects to find %ebp changing code. Back trace + also has trouble with interrupt frames. I am talking with Daniel + Jacobowitz about some way to fix this, but don't hold your breath. + +20011220.0050.35 +Major enhancement with this version is the ability to hold one or more +CPUs in an SMP system while allowing the others to continue. Also, by +default only the current CPU is enabled on single-step commands (please +note that gdb issues single-step commands at times other than when you +use the si command). + +Another change is to collect some useful information in +a global structure called "kgdb_info". You should be able to just: + +p kgdb_info + +although I have seen cases where the first time this is done gdb just +prints the first member but prints the whole structure if you then enter +CR (carriage return or enter). This also works: + +p *&kgdb_info + +Here is a sample: +(gdb) p kgdb_info +$4 = {called_from = 0xc010732c, entry_tsc = 32804123790856, errcode = 0, + vector = 3, print_debug_info = 0} + +"Called_from" is the return address from the current entry into kgdb. +Sometimes it is useful to know why you are in kgdb, for example, was +it an NMI or a real breakpoint? The simple way to interrogate this +return address is: + +l *0xc010732c + +which will print the surrounding few lines of source code. + +"Entry_tsc" is the CPU TSC on entry to kgdb (useful to compare to the +kgdb_ts entries). + +"errcode" and "vector" are other entry parameters which may be helpful on +some traps. + +"print_debug_info" is the internal debugging kgdb print enable flag. Yes, +you can modify it. + +In SMP systems kgdb_info also includes the "cpus_waiting" structure and +"hold_on_step": + +(gdb) p kgdb_info +$7 = {called_from = 0xc0112739, entry_tsc = 1034936624074, errcode = 0, + vector = 2, print_debug_info = 0, hold_on_sstep = 1, cpus_waiting = {{ + task = 0x0, hold = 0, regs = 0x0}, {task = 0xc71b8000, hold = 0, + regs = 0xc71b9f70}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, + hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, + hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, + hold = 0, regs = 0x0}}} + +"Cpus_waiting" has an entry for each CPU other than the current one that +has been stopped. Each entry contains the task_struct address for that +CPU, the address of the regs for that task and a hold flag. All these +have the proper typing so that, for example: + +p *kgdb_info.cpus_waiting[1].regs + +will print the registers for CPU 1. + +"Hold_on_sstep" is a new feature with this version and comes up set or +true. What this means is that whenever kgdb is asked to single-step all +other CPUs are held (i.e. not allowed to execute). The flag applies to +all but the current CPU and, again, can be changed: + +p kgdb_info.hold_on_sstep=0 + +restores the old behavior of letting all CPUs run during single-stepping. + +Likewise, each CPU has a "hold" flag, which if set, locks that CPU out +of execution. Note that this has some risk in cases where the CPUs need +to communicate with each other. If kgdb finds no CPU available on exit, +it will push a message thru gdb and stay in kgdb. Note that it is legal +to hold the current CPU as long as at least one CPU can execute. + +20010621.1117.09 +This version implements an event queue. Events are signaled by calling +a function in the kgdb stub and may be examined from gdb. See EVENTS +below for details. This version also tightens up the interrupt and SMP +handling to not allow interrupts on the way to kgdb from a breakpoint +trap. It is fine to allow these interrupts for user code, but not +system debugging. + +Version +======= + +This version of the kgdb package was developed and tested on +kernel version 2.4.16. It will not install on any earlier kernels. +It is possible that it will continue to work on later versions +of 2.4 and then versions of 2.5 (I hope). + + +Debugging Setup +=============== + +Designate one machine as the "development" machine. This is the +machine on which you run your compiles and which has your source +code for the kernel. Designate a second machine as the "target" +machine. This is the machine that will run your experimental +kernel. + +The two machines will be connected together via a serial line out +one or the other of the COM ports of the PC. You will need the +appropriate modem eliminator (null modem) cable(s) for this. + +Decide on which tty port you want the machines to communicate, then +connect them up back-to-back using the null modem cable. COM1 is +/dev/ttyS0 and COM2 is /dev/ttyS1. You should test this connection +with the two machines prior to trying to debug a kernel. Once you +have it working, on the TARGET machine, enter: + +setserial /dev/ttyS0 (or what ever tty you are using) + +and record the port address and the IRQ number. + +On the DEVELOPMENT machine you need to apply the patch for the kgdb +hooks. You have probably already done that if you are reading this +file. + +On your DEVELOPMENT machine, go to your kernel source directory and do +"make Xconfig" where X is one of "x", "menu", or "". If you are +configuring in the standard serial driver, it must not be a module. +Either yes or no is ok, but making the serial driver a module means it +will initialize after kgdb has set up the UART interrupt code and may +cause a failure of the control-C option discussed below. The configure +question for the serial driver is under the "Character devices" heading +and is: + +"Standard/generic (8250/16550 and compatible UARTs) serial support" + +Go down to the kernel debugging menu item and open it up. Enable the +kernel kgdb stub code by selecting that item. You can also choose to +turn on the "-ggdb -O1" compile options. The -ggdb causes the compiler +to put more debug info (like local symbols) in the object file. On the +i386 -g and -ggdb are the same so this option just reduces to "O1". The +-O1 reduces the optimization level. This may be helpful in some cases, +be aware, however, that this may also mask the problem you are looking +for. + +The baud rate. Default is 115200. What ever you choose be sure that +the host machine is set to the same speed. I recommend the default. + +The port. This is the I/O address of the serial UART that you should +have gotten using setserial as described above. The standard COM1 port +(3f8) using IRQ 4 is default. COM2 is 2f8 which by convention uses IRQ +3. + +The port IRQ (see above). + +Stack overflow test. This option makes a minor change in the trap, +system call and interrupt code to detect stack overflow and transfer +control to kgdb if it happens. (Some platforms have this in the +baseline code, but the i386 does not.) + +You can also configure the system to recognize the boot option +"console=kgdb" which if given will cause all console output during +booting to be put thru gdb as well as other consoles. This option +requires that gdb and kgdb be connected prior to sending console output +so, if they are not, a breakpoint is executed to force the connection. +This will happen before any kernel output (it is going thru gdb, right), +and will stall the boot until the connection is made. + +You can also configure in a patch to SysRq to enable the kGdb SysRq. +This request generates a breakpoint. Since the serial port IRQ line is +set up after any serial drivers, it is possible that this command will +work when the control-C will not. + +Save and exit the Xconfig program. Then do "make clean" , "make dep" +and "make bzImage" (or whatever target you want to make). This gets the +kernel compiled with the "-g" option set -- necessary for debugging. + +You have just built the kernel on your DEVELOPMENT machine that you +intend to run on your TARGET machine. + +To install this new kernel, use the following installation procedure. +Remember, you are on the DEVELOPMENT machine patching the kernel source +for the kernel that you intend to run on the TARGET machine. + +Copy this kernel to your target machine using your usual procedures. I +usually arrange to copy development: +/usr/src/linux/arch/i386/boot/bzImage to /vmlinuz on the TARGET machine +via a LAN based NFS access. That is, I run the cp command on the target +and copy from the development machine via the LAN. Run Lilo (see "man +lilo" for details on how to set this up) on the new kernel on the target +machine so that it will boot! Then boot the kernel on the target +machine. + +On the DEVELOPMENT machine, create a file called .gdbinit in the +directory /usr/src/linux. An example .gdbinit file looks like this: + +shell echo -e "\003" >/dev/ttyS0 +set remotebaud 38400 (or what ever speed you have chosen) +target remote /dev/ttyS0 + + +Change the "echo" and "target" definition so that it specifies the tty +port that you intend to use. Change the "remotebaud" definition to +match the data rate that you are going to use for the com line. + +You are now ready to try it out. + +Boot your target machine with "kgdb" in the boot command i.e. something +like: + +lilo> test kgdb + +or if you also want console output thru gdb: + +lilo> test kgdb console=kgdb + +You should see the lilo message saying it has loaded the kernel and then +all output stops. The kgdb stub is trying to connect with gdb. Start +gdb something like this: + + +On your DEVELOPMENT machine, cd /usr/src/linux and enter "gdb vmlinux". +When gdb gets the symbols loaded it will read your .gdbinit file and, if +everything is working correctly, you should see gdb print out a few +lines indicating that a breakpoint has been taken. It will actually +show a line of code in the target kernel inside the kgdb activation +code. + +The gdb interaction should look something like this: + + linux-dev:/usr/src/linux# gdb vmlinux + GDB is free software and you are welcome to distribute copies of it + under certain conditions; type "show copying" to see the conditions. + There is absolutely no warranty for GDB; type "show warranty" for details. + GDB 4.15.1 (i486-slackware-linux), + Copyright 1995 Free Software Foundation, Inc... + breakpoint () at i386-stub.c:750 + 750 } + (gdb) + +You can now use whatever gdb commands you like to set breakpoints. +Enter "continue" to start your target machine executing again. At this +point the target system will run at full speed until it encounters +your breakpoint or gets a segment violation in the kernel, or whatever. + +If you have the kgdb console enabled when you continue, gdb will print +out all the console messages. + +The above example caused a breakpoint relatively early in the boot +process. For the i386 kgdb it is possible to code a break instruction +as the first C-language point in init/main.c, i.e. as the first instruction +in start_kernel(). This could be done as follows: + +#include + breakpoint(); + +This breakpoint() is really a function that sets up the breakpoint and +single-step hardware trap cells and then executes a breakpoint. Any +early hard coded breakpoint will need to use this function. Once the +trap cells are set up they need not be set again, but doing it again +does not hurt anything, so you don't need to be concerned about which +breakpoint is hit first. Once the trap cells are set up (and the kernel +sets them up in due course even if breakpoint() is never called) the +macro: + +BREAKPOINT; + +will generate an inline breakpoint. This may be more useful as it stops +the processor at the instruction instead of in a function a step removed +from the location of interest. In either case must be +included to define both breakpoint() and BREAKPOINT. + +Triggering kgdbstub at other times +================================== + +Often you don't need to enter the debugger until much later in the boot +or even after the machine has been running for some time. Once the +kernel is booted and interrupts are on, you can force the system to +enter the debugger by sending a control-C to the debug port. This is +what the first line of the recommended .gdbinit file does. This allows +you to start gdb any time after the system is up as well as when the +system is already at a breakpoint. (In the case where the system is +already at a breakpoint the control-C is not needed, however, it will +be ignored by the target so no harm is done. Also note the the echo +command assumes that the port speed is already set. This will be true +once gdb has connected, but it is best to set the port speed before you +run gdb.) + +Another simple way to do this is to put the following file in you ~/bin +directory: + +#!/bin/bash +echo -e "\003" > /dev/ttyS0 + +Here, the ttyS0 should be replaced with what ever port you are using. +The "\003" is control-C. Once you are connected with gdb, you can enter +control-C at the command prompt. + +An alternative way to get control to the debugger is to enable the kGdb +SysRq command. Then you would enter Alt-SysRq-g (all three keys at the +same time, but push them down in the order given). To refresh your +memory of the available SysRq commands try Alt-SysRq-=. Actually any +undefined command could replace the "=", but I like to KNOW that what I +am pushing will never be defined. + +Debugging hints +=============== + +You can break into the target machine at any time from the development +machine by typing ^C (see above paragraph). If the target machine has +interrupts enabled this will stop it in the kernel and enter the +debugger. + +There is unfortunately no way of breaking into the kernel if it is +in a loop with interrupts disabled, so if this happens to you then +you need to place exploratory breakpoints or printk's into the kernel +to find out where it is looping. The exploratory breakpoints can be +entered either thru gdb or hard coded into the source. This is very +handy if you do something like: + +if () BREAKPOINT; + + +There is a copy of an e-mail in the Documentation/i386/kgdb/ directory +(debug-nmi.txt) which describes how to create an NMI on an ISA bus +machine using a paper clip. I have a sophisticated version of this made +by wiring a push button switch into a PC104/ISA bus adapter card. The +adapter card nicely furnishes wire wrap pins for all the ISA bus +signals. + +When you are done debugging the kernel on the target machine it is a +good idea to leave it in a running state. This makes reboots faster, +bypassing the fsck. So do a gdb "continue" as the last gdb command if +this is possible. To terminate gdb itself on the development machine +and leave the target machine running, first clear all breakpoints and +continue, then type ^Z to suspend gdb and then kill it with "kill %1" or +something similar. + +If gdbstub Does Not Work +======================== + +If it doesn't work, you will have to troubleshoot it. Do the easy +things first like double checking your cabling and data rates. You +might try some non-kernel based programs to see if the back-to-back +connection works properly. Just something simple like cat /etc/hosts +>/dev/ttyS0 on one machine and cat /dev/ttyS0 on the other will tell you +if you can send data from one machine to the other. Make sure it works +in both directions. There is no point in tearing out your hair in the +kernel if the line doesn't work. + +All of the real action takes place in the file +/usr/src/linux/arch/i386/kernel/kgdb_stub.c. That is the code on the target +machine that interacts with gdb on the development machine. In gdb you can +turn on a debug switch with the following command: + + set remotedebug + +This will print out the protocol messages that gdb is exchanging with +the target machine. + +Another place to look is /usr/src/arch/i386/lib/kgdb_serial.c. This is +the code that talks to the serial port on the target side. There might +be a problem there. In particular there is a section of this code that +tests the UART which will tell you what UART you have if you define +"PRNT" (just remove "_off" from the #define PRNT_off). To view this +report you will need to boot the system without any beakpoints. This +allows the kernel to run to the point where it calls kgdb to set up +interrupts. At this time kgdb will test the UART and print out the type +it finds. (You need to wait so that the printks are actually being +printed. Early in the boot they are cached, waiting for the console to +be enabled. Also, if kgdb is entered thru a breakpoint it is possible +to cause a dead lock by calling printk when the console is locked. The +stub thus avoids doing printks from breakpoints, especially in the +serial code.) At this time, if the UART fails to do the expected thing, +kgdb will print out (using printk) information on what failed. (These +messages will be buried in all the other boot up messages. Look for +lines that start with "gdb_hook_interrupt:". You may want to use dmesg +once the system is up to view the log. If this fails or if you still +don't connect, review your answers for the port address. Use: + +setserial /dev/ttyS0 + +to get the current port and IRQ information. This command will also +tell you what the system found for the UART type. The stub recognizes +the following UART types: + +16450, 16550, and 16550A + +If you are really desperate you can use printk debugging in the +kgdbstub code in the target kernel until you get it working. In particular, +there is a global variable in /usr/src/linux/arch/i386/kernel/kgdb_stub.c +named "remote_debug". Compile your kernel with this set to 1, rather +than 0 and the debug stub will print out lots of stuff as it does +what it does. Likewise there are debug printks in the kgdb_serial.c +code that can be turned on with simple changes in the macro defines. + + +Debugging Loadable Modules +========================== + +This technique comes courtesy of Edouard Parmelan + + +When you run gdb, enter the command + +source gdbinit-modules + +This will read in a file of gdb macros that was installed in your +kernel source directory when kgdb was installed. This file implements +the following commands: + +mod-list + Lists the loaded modules in the form + +mod-print-symbols + Prints all the symbols in the indicated module. + +mod-add-symbols + Loads the symbols from the object file and associates them + with the indicated module. + +After you have loaded the module that you want to debug, use the command +mod-list to find the of your module. Then use that +address in the mod-add-symbols command to load your module's symbols. +From that point onward you can debug your module as if it were a part +of the kernel. + +The file gdbinit-modules also contains a command named mod-add-lis as +an example of how to construct a command of your own to load your +favorite module. The idea is to "can" the pathname of the module +in the command so you don't have to type so much. + +Threads +======= + +Each process in a target machine is seen as a gdb thread. gdb thread +related commands (info threads, thread n) can be used. + +ia-32 hardware breakpoints +========================== + +kgdb stub contains support for hardware breakpoints using debugging features +of ia-32(x86) processors. These breakpoints do not need code modification. +They use debugging registers. 4 hardware breakpoints are available in ia-32 +processors. + +Each hardware breakpoint can be of one of the following three types. + +1. Execution breakpoint - An Execution breakpoint is triggered when code + at the breakpoint address is executed. + + As limited number of hardware breakpoints are available, it is + advisable to use software breakpoints ( break command ) instead + of execution hardware breakpoints, unless modification of code + is to be avoided. + +2. Write breakpoint - A write breakpoint is triggered when memory + location at the breakpoint address is written. + + A write or can be placed for data of variable length. Length of + a write breakpoint indicates length of the datatype to be + watched. Length is 1 for 1 byte data , 2 for 2 byte data, 3 for + 4 byte data. + +3. Access breakpoint - An access breakpoint is triggered when memory + location at the breakpoint address is either read or written. + + Access breakpoints also have lengths similar to write breakpoints. + +IO breakpoints in ia-32 are not supported. + +Since gdb stub at present does not use the protocol used by gdb for hardware +breakpoints, hardware breakpoints are accessed through gdb macros. gdb macros +for hardware breakpoints are described below. + +hwebrk - Places an execution breakpoint + hwebrk breakpointno address +hwwbrk - Places a write breakpoint + hwwbrk breakpointno length address +hwabrk - Places an access breakpoint + hwabrk breakpointno length address +hwrmbrk - Removes a breakpoint + hwrmbrk breakpointno +exinfo - Tells whether a software or hardware breakpoint has occurred. + Prints number of the hardware breakpoint if a hardware breakpoint has + occurred. + +Arguments required by these commands are as follows +breakpointno - 0 to 3 +length - 1 to 3 +address - Memory location in hex digits ( without 0x ) e.g c015e9bc + +SMP support +========== + +When a breakpoint occurs or user issues a break ( Ctrl + C ) to gdb +client, all the processors are forced to enter the debugger. Current +thread corresponds to the thread running on the processor where +breakpoint occurred. Threads running on other processor(s) appear +similar to other non-running threads in the 'info threads' output. +Within the kgdb stub there is a structure "waiting_cpus" in which kgdb +records the values of "current" and "regs" for each CPU other than the +one that hit the breakpoint. "current" is a pointer to the task +structure for the task that CPU is running, while "regs" points to the +saved registers for the task. This structure can be examined with the +gdb "p" command. + +ia-32 hardware debugging registers on all processors are set to same +values. Hence any hardware breakpoints may occur on any processor. + +gdb troubleshooting +=================== + +1. gdb hangs +Kill it. restart gdb. Connect to target machine. + +2. gdb cannot connect to target machine (after killing a gdb and +restarting another) If the target machine was not inside debugger when +you killed gdb, gdb cannot connect because the target machine won't +respond. In this case echo "Ctrl+C"(ASCII 3) to the serial line. +e.g. echo -e "\003" > /dev/ttyS1 +This forces that target machine into the debugger, after which you +can connect. + +3. gdb cannot connect even after echoing Ctrl+C into serial line +Try changing serial line settings min to 1 and time to 0 +e.g. stty min 1 time 0 < /dev/ttyS1 +Try echoing again + +Check serial line speed and set it to correct value if required +e.g. stty ispeed 115200 ospeed 115200 < /dev/ttyS1 + +EVENTS +====== + +Ever want to know the order of things happening? Which CPU did what and +when? How did the spinlock get the way it is? Then events are for +you. Events are defined by calls to an event collection interface and +saved for later examination. In this case, kgdb events are saved by a +very fast bit of code in kgdb which is fully SMP and interrupt protected +and they are examined by using gdb to display them. Kgdb keeps only +the last N events, where N must be a power of two and is defined at +configure time. + + +Events are signaled to kgdb by calling: + +kgdb_ts(data0,data1) + +For each call kgdb records each call in an array along with other info. +Here is the array definition: + +struct kgdb_and_then_struct { +#ifdef CONFIG_SMP + int on_cpu; +#endif + long long at_time; + int from_ln; + char * in_src; + void *from; + int with_if; + int data0; + int data1; +}; + +For SMP machines the CPU is recorded, for all machines the TSC is +recorded (gets a time stamp) as well as the line number and source file +the call was made from. The address of the (from), the "if" (interrupt +flag) and the two data items are also recorded. The macro kgdb_ts casts +the types to int, so you can put any 32-bit values here. There is a +configure option to select the number of events you want to keep. A +nice number might be 128, but you can keep up to 1024 if you want. The +number must be a power of two. An "andthen" macro library is provided +for gdb to help you look at these events. It is also possible to define +a different structure for the event storage and cast the data to this +structure. For example the following structure is defined in kgdb: + +struct kgdb_and_then_struct2 { +#ifdef CONFIG_SMP + int on_cpu; +#endif + long long at_time; + int from_ln; + char * in_src; + void *from; + int with_if; + struct task_struct *t1; + struct task_struct *t2; +}; + +If you use this for display, the data elements will be displayed as +pointers to task_struct entries. You may want to define your own +structure to use in casting. You should only change the last two items +and you must keep the structure size the same. Kgdb will handle these +as 32-bit ints, but within that constraint you can define a structure to +cast to any 32-bit quantity. This need only be available to gdb and is +only used for casting in the display code. + +Final Items +=========== + +I picked up this code from Amit S. Kale and enhanced it. + +If you make some really cool modification to this stuff, or if you +fix a bug, please let me know. + +George Anzinger + + +Amit S. Kale + + +(First kgdb by David Grothe ) + +(modified by Tigran Aivazian ) + Putting gdbstub into the kernel config menu. + +(modified by Scott Foehner ) + Hooks for entering gdbstub at boot time. + +(modified by Amit S. Kale ) + Threads, ia-32 hw debugging, mp support, console support, + nmi watchdog handling. + +(modified by George Anzinger ) + Extended threads to include the idle threads. + Enhancements to allow breakpoint() at first C code. + Use of module_init() and __setup() to automate the configure. + Enhanced the cpu "collection" code to work in early bring-up. + Added ability to call functions from gdb + Print info thread stuff without going back to schedule() + Now collect the "other" cpus with an IPI/ NMI. --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25/Documentation/i386/kgdb/loadmodule.sh 2004-03-30 20:43:37.000000000 -0800 @@ -0,0 +1,78 @@ +#/bin/sh +# This script loads a module on a target machine and generates a gdb script. +# source generated gdb script to load the module file at appropriate addresses +# in gdb. +# +# Usage: +# Loading the module on target machine and generating gdb script) +# [foo]$ loadmodule.sh +# +# Loading the module file into gdb +# (gdb) source +# +# Modify following variables according to your setup. +# TESTMACHINE - Name of the target machine +# GDBSCRIPTS - The directory where a gdb script will be generated +# +# Author: Amit S. Kale (akale@veritas.com). +# +# If you run into problems, please check files pointed to by following +# variables. +# ERRFILE - /tmp/.errs contains stderr output of insmod +# MAPFILE - /tmp/.map contains stdout output of insmod +# GDBSCRIPT - $GDBSCRIPTS/load gdb script. + +TESTMACHINE=foo +GDBSCRIPTS=/home/bar + +if [ $# -lt 1 ] ; then { + echo Usage: $0 modulefile + exit +} ; fi + +MODULEFILE=$1 +MODULEFILEBASENAME=`basename $1` + +if [ $MODULEFILE = $MODULEFILEBASENAME ] ; then { + MODULEFILE=`pwd`/$MODULEFILE +} fi + +ERRFILE=/tmp/$MODULEFILEBASENAME.errs +MAPFILE=/tmp/$MODULEFILEBASENAME.map +GDBSCRIPT=$GDBSCRIPTS/load$MODULEFILEBASENAME + +function findaddr() { + local ADDR=0x$(echo "$SEGMENTS" | \ + grep "$1" | sed 's/^[^ ]*[ ]*[^ ]*[ ]*//' | \ + sed 's/[ ]*[^ ]*$//') + echo $ADDR +} + +function checkerrs() { + if [ "`cat $ERRFILE`" != "" ] ; then { + cat $ERRFILE + exit + } fi +} + +#load the module +echo Copying $MODULEFILE to $TESTMACHINE +rcp $MODULEFILE root@${TESTMACHINE}: + +echo Loading module $MODULEFILE +rsh -l root $TESTMACHINE /sbin/insmod -m ./`basename $MODULEFILE` \ + > $MAPFILE 2> $ERRFILE +checkerrs + +SEGMENTS=`head -n 11 $MAPFILE | tail -n 10` +TEXTADDR=$(findaddr "\\.text[^.]") +LOADSTRING="add-symbol-file $MODULEFILE $TEXTADDR" +SEGADDRS=`echo "$SEGMENTS" | awk '//{ + if ($1 != ".text" && $1 != ".this" && + $1 != ".kstrtab" && $1 != ".kmodtab") { + print " -s " $1 " 0x" $3 " " + } +}'` +LOADSTRING="$LOADSTRING $SEGADDRS" +echo Generating script $GDBSCRIPT +echo $LOADSTRING > $GDBSCRIPT --- linux-2.6.5-rc3/Documentation/input/ff.txt 2003-06-14 12:18:24.000000000 -0700 +++ 25/Documentation/input/ff.txt 2004-03-30 20:21:49.000000000 -0800 @@ -20,7 +20,7 @@ devices. Please read joystick.txt before 2. Instructions to the user ~~~~~~~~~~~~~~~~~~~~~~~~~~~ Here are instructions on how to compile and use the driver. In fact, this -driver is the normal iforce.o, input.o and evdev.o drivers written by Vojtech +driver is the normal iforce, input and evdev drivers written by Vojtech Pavlik, plus additions to support force feedback. Before you start, let me WARN you that some devices shake violently during the --- linux-2.6.5-rc3/Documentation/input/input.txt 2004-01-09 00:04:30.000000000 -0800 +++ 25/Documentation/input/input.txt 2004-03-30 20:21:49.000000000 -0800 @@ -35,18 +35,18 @@ devices, future use (say 2.5/2.6) is exp most of the existing input system, which is why it lives in drivers/input/ instead of drivers/usb/. - The centre of the input drivers is the input.o module, which must be + The centre of the input drivers is the input module, which must be loaded before any other of the input modules - it serves as a way of communication between two groups of modules: 1.1 Device drivers ~~~~~~~~~~~~~~~~~~ These modules talk to the hardware (for example via USB), and provide -events (keystrokes, mouse movements) to the input.o module. +events (keystrokes, mouse movements) to the input module. 1.2 Event handlers ~~~~~~~~~~~~~~~~~~ - These modules get events from input.o and pass them where needed via + These modules get events from input and pass them where needed via various interfaces - keystrokes to the kernel, mouse movements via a simulated PS/2 interface to GPM and X and so on. @@ -56,12 +56,12 @@ simulated PS/2 interface to GPM and X an you'll have to load the following modules (or have them built in to the kernel): - input.o - mousedev.o - keybdev.o - usbcore.o - usb-uhci.o or usb-ohci.o or uhci.o - hid.o + input + mousedev + keybdev + usbcore + uhci_hcd or ohci_hcd or ehci_hcd + usbhid After this, the USB keyboard will work straight away, and the USB mouse will be available as a character device on major 13, minor 63: @@ -98,9 +98,9 @@ XFree to this device to use it - GPM sho however not useful without being handled, so you also will need to use some of the modules from section 3.2. -3.1.1 hid.o -~~~~~~~~~~~ - hid.o is the largest and most complex driver of the whole suite. It +3.1.1 usbhid +~~~~~~~~~~~~ + usbhid is the largest and most complex driver of the whole suite. It handles all HID devices, and because there is a very wide variety of them, and because the USB HID specification isn't simple, it needs to be this big. @@ -115,7 +115,7 @@ interface, but for the UPSs and LCDs it the hiddev interface was designed. See Documentation/usb/hiddev.txt for more information about it. - The usage of the hid.o module is very simple, it takes no parameters, + The usage of the usbhid module is very simple, it takes no parameters, detects everything automatically and when a HID device is inserted, it detects it appropriately. @@ -123,30 +123,30 @@ detects it appropriately. device that doesn't work well. In that case #define DEBUG at the beginning of hid-core.c and send me the syslog traces. -3.1.2 usbmouse.o -~~~~~~~~~~~~~~~~ +3.1.2 usbmouse +~~~~~~~~~~~~~~ For embedded systems, for mice with broken HID descriptors and just any -other use when the big hid.o wouldn't be a good choice, there is the -usbmouse.c driver. It handles USB mice only. It uses a simpler HIDBP +other use when the big usbhid wouldn't be a good choice, there is the +usbmouse driver. It handles USB mice only. It uses a simpler HIDBP protocol. This also means the mice must support this simpler protocol. Not -all do. If you don't have any strong reason to use this module, use hid.o +all do. If you don't have any strong reason to use this module, use usbhid instead. -3.1.3 usbkbd.o -~~~~~~~~~~~~~~ - Much like usbmouse.c, this module talks to keyboards with a simplified +3.1.3 usbkbd +~~~~~~~~~~~~ + Much like usbmouse, this module talks to keyboards with a simplified HIDBP protocol. It's smaller, but doesn't support any extra special keys. -Use hid.o instead if there isn't any special reason to use this. +Use usbhid instead if there isn't any special reason to use this. -3.1.4 wacom.o -~~~~~~~~~~~~~ +3.1.4 wacom +~~~~~~~~~~~ This is a driver for Wacom Graphire and Intuos tablets. Not for Wacom PenPartner, that one is handled by the HID driver. Although the Intuos and Graphire tablets claim that they are HID tablets as well, they are not and thus need this specific driver. -3.1.5 iforce.o -~~~~~~~~~~~~~~~ +3.1.5 iforce +~~~~~~~~~~~~ A driver for I-Force joysticks and wheels, both over USB and RS232. It includes ForceFeedback support now, even though Immersion Corp. considers the protocol a trade secret and won't disclose a word @@ -157,8 +157,8 @@ about it. Event handlers distrubite the events from the devices to userland and kernel, as needed. -3.2.1 keybdev.o -~~~~~~~~~~~~~~~ +3.2.1 keybdev +~~~~~~~~~~~~~ keybdev is currently a rather ugly hack that translates the input events into architecture-specific keyboard raw mode (Xlated AT Set2 on x86), and passes them into the handle_scancode function of the @@ -170,13 +170,13 @@ it. best if keyboard.c would itself be an event handler. This is done in the input patch, available on the webpage mentioned below. -3.2.2 mousedev.o -~~~~~~~~~~~~~~~~ +3.2.2 mousedev +~~~~~~~~~~~~~~ mousedev is also a hack to make programs that use mouse input work. It takes events from either mice or digitizers/tablets and makes a PS/2-style (a la /dev/psaux) mouse device available to the userland. Ideally, the programs could use a more reasonable interface, -for example evdev.o +for example evdev Mousedev devices in /dev/input (as shown above) are: @@ -207,8 +207,8 @@ program reading the data wishes. You can these. You'll need ImPS/2 if you want to make use of a wheel on a USB mouse and ExplorerPS/2 if you want to use extra (up to 5) buttons. -3.2.3 joydev.o -~~~~~~~~~~~~~~ +3.2.3 joydev +~~~~~~~~~~~~ Joydev implements v0.x and v1.x Linux joystick api, much like drivers/char/joystick/joystick.c used to in earlier versions. See joystick-api.txt in the Documentation subdirectory for details. As @@ -223,8 +223,8 @@ on: And so on up to js31. -3.2.4 evdev.o -~~~~~~~~~~~~~ +3.2.4 evdev +~~~~~~~~~~~ evdev is the generic input event interface. It passes the events generated in the kernel straight to the program, with timestamps. The API is still evolving, but should be useable now. It's described in --- linux-2.6.5-rc3/Documentation/input/joystick.txt 2004-03-29 22:08:27.000000000 -0800 +++ 25/Documentation/input/joystick.txt 2004-03-30 20:21:49.000000000 -0800 @@ -182,7 +182,7 @@ they're basically souped up CHF sticks. For other joystick types (more/less axes, hats, and buttons) support you'll need to specify the types either on the kernel command line or on the -module command line, when inserting analog.o into the kernel. The +module command line, when inserting analog into the kernel. The parameters are: analog.map=,,,.... @@ -503,7 +503,7 @@ and the /dev/input/jsX device should bec 3.21 I-Force devices ~~~~~~~~~~~~~~~~~~~~ - All I-Force devices are supported by the iforce.o module. This includes: + All I-Force devices are supported by the iforce module. This includes: * AVB Mag Turbo Force * AVB Top Shot Pegasus --- linux-2.6.5-rc3/Documentation/IPMI.txt 2003-06-14 12:17:56.000000000 -0700 +++ 25/Documentation/IPMI.txt 2004-03-31 00:54:37.070280176 -0800 @@ -22,6 +22,58 @@ are not familiar with IPMI itself, see t http://www.intel.com/design/servers/ipmi/index.htm. IPMI is a big subject and I can't cover it all here! +Configuration +------------- + +The LinuxIPMI driver is modular, which means you have to pick several +things to have it work right depending on your hardware. Most of +these are available in the 'Character Devices' menu. + +No matter what, you must pick 'IPMI top-level message handler' to use +IPMI. What you do beyond that depends on your needs and hardware. + +The message handler does not provide any user-level interfaces. +Kernel code (like the watchdog) can still use it. If you need access +from userland, you need to select 'Device interface for IPMI' if you +want access through a device driver. Another interface is also +available, you may select 'IPMI sockets' in the 'Networking Support' +main menu. This provides a socket interface to IPMI. You may select +both of these at the same time, they will both work together. + +The driver interface depends on your hardware. If you have a board +with a standard interface (These will generally be either "KCS", +"SMIC", or "BT", consult your hardware manual), choose the 'IPMI SI +handler' option. A driver also exists for direct I2C access to the +IPMI management controller. Some boards support this, but it is +unknown if it will work on every board. For this, choose 'IPMI SMBus +handler', but be ready to try to do some figuring to see if it will +work. + +There is also a KCS-only driver interface supplied, but it is +depracated in favor of the SI interface. + +You should generally enable ACPI on your system, as systems with IPMI +should have ACPI tables describing them. + +If you have a standard interface and the board manufacturer has done +their job correctly, the IPMI controller should be automatically +detect (via ACPI or SMBIOS tables) and should just work. Sadly, many +boards do not have this information. The driver attempts standard +defaults, but they may not work. If you fall into this situation, you +need to read the section below named 'The SI Driver' on how to +hand-configure your system. + +IPMI defines a standard watchdog timer. You can enable this with the +'IPMI Watchdog Timer' config option. If you compile the driver into +the kernel, then via a kernel command-line option you can have the +watchdog timer start as soon as it intitializes. It also have a lot +of other options, see the 'Watchdog' section below for more details. +Note that you can also have the watchdog continue to run if it is +closed (by default it is disabled on close). Go into the 'Watchdog +Cards' menu, enable 'Watchdog Timer Support', and enable the option +'Disable watchdog shutdown on close'. + + Basic Design ------------ @@ -41,18 +93,30 @@ ipmi_devintf - This provides a userland driver, each open file for this device ties in to the message handler as an IPMI user. -ipmi_kcs_drv - A driver for the KCS SMI. Most system have a KCS -interface for IPMI. +ipmi_si - A driver for various system interfaces. This supports +KCS, SMIC, and may support BT in the future. Unless you have your own +custom interface, you probably need to use this. + +ipmi_smb - A driver for accessing BMCs on the SMBus. It uses the +I2C kernel driver's SMBus interfaces to send and receive IPMI messages +over the SMBus. +af_ipmi - A network socket interface to IPMI. This doesn't take up +a character device in your system. + +Note that the KCS-only interface ahs been removed. Much documentation for the interface is in the include files. The IPMI include files are: -ipmi.h - Contains the user interface and IOCTL interface for IPMI. +net/af_ipmi.h - Contains the socket interface. + +linux/ipmi.h - Contains the user interface and IOCTL interface for IPMI. -ipmi_smi.h - Contains the interface for SMI drivers to use. +linux/ipmi_smi.h - Contains the interface for system management interfaces +(things that interface to IPMI controllers) to use. -ipmi_msgdefs.h - General definitions for base IPMI messaging. +linux/ipmi_msgdefs.h - General definitions for base IPMI messaging. Addressing @@ -260,70 +324,131 @@ they register with the message handler. in the order they register, although if an SMI unregisters and then another one registers, all bets are off. -The ipmi_smi.h defines the interface for SMIs, see that for more -details. +The ipmi_smi.h defines the interface for management interfaces, see +that for more details. -The KCS Driver --------------- +The SI Driver +------------- -The KCS driver allows up to 4 KCS interfaces to be configured in the -system. By default, the driver will register one KCS interface at the -spec-specified I/O port 0xca2 without interrupts. You can change this -at module load time (for a module) with: +The SI driver allows up to 4 KCS or SMIC interfaces to be configured +in the system. By default, scan the ACPI tables for interfaces, and +if it doesn't find any the driver will attempt to register one KCS +interface at the spec-specified I/O port 0xca2 without interrupts. +You can change this at module load time (for a module) with: - insmod ipmi_kcs_drv.o kcs_ports=,... kcs_addrs=, - kcs_irqs=,... kcs_trydefaults=[0|1] + modprobe ipmi_si.o type=,.... + ports=,... addrs=,... + irqs=,... trydefaults=[0|1] -The KCS driver supports two types of interfaces, ports (for I/O port -based KCS interfaces) and memory addresses (for KCS interfaces in -memory). The driver will support both of them simultaneously, setting -the port to zero (or just not specifying it) will allow the memory -address to be used. The port will override the memory address if it -is specified and non-zero. kcs_trydefaults sets whether the standard -IPMI interface at 0xca2 and any interfaces specified by ACPE are -tried. By default, the driver tries it, set this value to zero to -turn this off. +Each of these except si_trydefaults is a list, the first item for the +first interface, second item for the second interface, etc. + +The si_type may be either "kcs", "smic", or "bt". If you leave it blank, it +defaults to "kcs". + +If you specify si_addrs as non-zero for an interface, the driver will +use the memory address given as the address of the device. This +overrides si_ports. + +If you specify si_ports as non-zero for an interface, the driver will +use the I/O port given as the device address. + +If you specify si_irqs as non-zero for an interface, the driver will +attempt to use the given interrupt for the device. + +si_trydefaults sets whether the standard IPMI interface at 0xca2 and +any interfaces specified by ACPE are tried. By default, the driver +tries it, set this value to zero to turn this off. When compiled into the kernel, the addresses can be specified on the kernel command line as: - ipmi_kcs=:,:....,[nodefault] - -The values is either "p" or "m" for port or memory -addresses. So for instance, a KCS interface at port 0xca2 using -interrupt 9 and a memory interface at address 0xf9827341 with no -interrupt would be specified "ipmi_kcs=p0xca2:9,m0xf9827341". -If you specify zero for in irq or don't specify it, the driver will -run polled unless the software can detect the interrupt to use in the -ACPI tables. - -By default, the driver will attempt to detect a KCS device at the -spec-specified 0xca2 address and any address specified by ACPI. If -you want to turn this off, use the "nodefault" option. + ipmi_si.type=,... + ipmi_si.ports=,... ipmi_si.addrs=,... + ipmi_si.irqs=,... ipmi_si.trydefaults=[0|1] + +It works the same as the module parameters of the same names. + +By default, the driver will attempt to detect any device specified by +ACPI, and if none of those then a KCS device at the spec-specified +0xca2. If you want to turn this off, set the "trydefaults" option to +false. If you have high-res timers compiled into the kernel, the driver will use them to provide much better performance. Note that if you do not have high-res timers enabled in the kernel and you don't have interrupts enabled, the driver will run VERY slowly. Don't blame me, -the KCS interface sucks. +these interfaces suck. + + +The SMBus Driver +---------------- + +The SMBus driver allows up to 4 SMBus devices to be configured in the +system. By default, the driver will register any SMBus interfaces it finds +in the I2C address range of 0x20 to 0x4f on any adapter. You can change this +at module load time (for a module) with: + + modprobe ipmi_smb.o + addr=,[,,[,...]] + dbg=,... + [defaultprobe=0] [dbg_probe=1] + +The addresses are specified in pairs, the first is the adapter ID and the +second is the I2C address on that adapter. + +The debug flags are bit flags for each BMC found, they are: +IPMI messages: 1, driver state: 2, timing: 4, I2C probe: 8 + +Setting smb_defaultprobe to zero disabled the default probing of SMBus +interfaces at address range 0x20 to 0x4f. This means that only the +BMCs specified on the smb_addr line will be detected. + +Setting smb_dbg_probe to 1 will enable debugging of the probing and +detection process for BMCs on the SMBusses. + +Discovering the IPMI compilant BMC on the SMBus can cause devices +on the I2C bus to fail. The SMBus driver writes a "Get Device ID" IPMI +message as a block write to the I2C bus and waits for a response. +This action can be detrimental to some I2C devices. It is highly recommended +that the known I2c address be given to the SMBus driver in the smb_addr +parameter. The default adrress range will not be used when a smb_addr +parameter is provided. + +When compiled into the kernel, the addresses can be specified on the +kernel command line as: + + ipmb_smb.addr=,[,,[,...]] + ipmi_smb.dbg=,... + ipmi_smb.defaultprobe=0 ipmi_smb.dbg_probe=1 + +These are the same options as on the module command line. + +Note that you might need some I2C changes if CONFIG_IPMI_PANIC_EVENT +is enabled along with this, so the I2C driver knows to run to +completion during sending a panic event. Other Pieces ------------ Watchdog +-------- A watchdog timer is provided that implements the Linux-standard watchdog timer interface. It has three module parameters that can be used to control it: - insmod ipmi_watchdog timeout= pretimeout= action= - preaction= preop= + modprobe ipmi_watchdog timeout= pretimeout= action= + preaction= preop= start_now=x The timeout is the number of seconds to the action, and the pretimeout is the amount of seconds before the reset that the pre-timeout panic will -occur (if pretimeout is zero, then pretimeout will not be enabled). +occur (if pretimeout is zero, then pretimeout will not be enabled). Note +that the pretimeout is the time before the final timeout. So if the +timeout is 50 seconds and the pretimeout is 10 seconds, then the pretimeout +will occur in 40 second (10 seconds before the timeout). The action may be "reset", "power_cycle", or "power_off", and specifies what to do when the timer times out, and defaults to @@ -344,16 +469,19 @@ When preop is set to "preop_give_data", on the device when the pretimeout occurs. Select and fasync work on the device, as well. +If start_now is set to 1, the watchdog timer will start running as +soon as the driver is loaded. + When compiled into the kernel, the kernel command line is available for configuring the watchdog: - ipmi_wdog=[,[,