From: Eric W. Biederman Seems to work. I find that the e100 driver fails to start up in the new kernel: PCI: Enabling device 02:08.0 (0000 -> 0003) PCI: Setting latency timer of device 02:08.0 to 64 e100: selftest timeout e100: Failed to initialize, instance #0 This is apparently a driver problem. Tiny howto: - enable kexec in config, build, install. - grab kexec-tools from http://www.osdl.org/archive/andyp/kexec/2.5.68/ - edit ./kexec/kexec-syscall.c and make sure __NR_kexec_load is set to 269 (-mm kernels have an additional syscall) - run `make distclean' and `make' - I use this script: #!/bin/sh usage() { echo "Usage: do-kexec.sh /boot/bzImage [commandline options]" exit 1 } if [ $# -lt 1 ] then usage fi sync IMAGE=$1 shift ./objdir/build/sbin/kexec -l $IMAGE --command-line="$(cat /proc/cmdline) $*" ./objdir/build/sbin/kexec -e invoked as cd /usr/src/kexec-tools ./do-kexec.sh This is fairly crude - it's an instant reboot, no shutdown or anything. Only do this if you're using journalled filesystems! 
MAINTAINERS | 8 arch/i386/Kconfig | 17 + arch/i386/defconfig | 1 arch/i386/kernel/Makefile | 1 arch/i386/kernel/apic.c | 51 +++ arch/i386/kernel/dmi_scan.c | 27 - arch/i386/kernel/entry.S | 1 arch/i386/kernel/i8259.c | 11 arch/i386/kernel/io_apic.c | 2 arch/i386/kernel/machine_kexec.c | 118 ++++++ arch/i386/kernel/reboot.c | 44 -- arch/i386/kernel/relocate_kernel.S | 107 ++++++ arch/i386/kernel/smp.c | 24 + fs/aio.c | 2 include/asm-i386/apic.h | 3 include/asm-i386/apicdef.h | 1 include/asm-i386/kexec.h | 23 + include/asm-i386/unistd.h | 3 include/linux/kexec.h | 54 +++ include/linux/reboot.h | 2 kernel/Makefile | 1 kernel/kexec.c | 629 +++++++++++++++++++++++++++++++++++++ kernel/sys.c | 23 + 23 files changed, 1084 insertions(+), 69 deletions(-) diff -puN arch/i386/defconfig~kexec arch/i386/defconfig --- 25/arch/i386/defconfig~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/arch/i386/defconfig 2003-05-05 19:05:58.000000000 -0700 @@ -72,6 +72,7 @@ CONFIG_SMP=y CONFIG_X86_LOCAL_APIC=y CONFIG_X86_IO_APIC=y CONFIG_NR_CPUS=32 +CONFIG_KEXEC=y CONFIG_X86_MCE=y # CONFIG_X86_MCE_NONFATAL is not set CONFIG_X86_MCE_P4THERMAL=y diff -puN arch/i386/Kconfig~kexec arch/i386/Kconfig --- 25/arch/i386/Kconfig~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/arch/i386/Kconfig 2003-05-05 19:05:58.000000000 -0700 @@ -840,6 +840,23 @@ config BOOT_IOREMAP depends on ((X86_SUMMIT || X86_GENERICARCH) && NUMA) default y +config KEXEC + bool "kexec system call (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + kexec is a system call that implements the ability to shutdown your + current kernel, and to start another kernel. It is like a reboot + but it is independent of the system firmware. And like a reboot + you can start any kernel with it not just Linux. + + The name comes from the similarity to the exec system call. 
+ + It is on an going process to be certain the hardware in a machine + is properly shutdown, so do not be surprised if this code does not + initially work for you. It may help to enable device hotplugging + support. As of this writing the exact hardware interface is + strongly in flux, so no good recommendation can be made. + endmenu diff -puN arch/i386/kernel/apic.c~kexec arch/i386/kernel/apic.c --- 25/arch/i386/kernel/apic.c~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/arch/i386/kernel/apic.c 2003-05-05 19:05:58.000000000 -0700 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -171,6 +172,36 @@ void disconnect_bsp_APIC(void) outb(0x70, 0x22); outb(0x00, 0x23); } + else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); + + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT); + apic_write_around(APIC_LVT0, value); + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } } void disable_local_APIC(void) @@ -1116,6 +1147,26 @@ asmlinkage void smp_error_interrupt(void irq_exit(); } +void stop_apics(void) +{ + /* By resetting the APIC's we disable the nmi 
watchdog */ +#if CONFIG_SMP + /* + * Stop all CPUs and turn off local APICs and the IO-APIC, so + * other OSs see a clean IRQ state. + */ + smp_send_stop(); +#else + disable_local_APIC(); +#endif +#if defined(CONFIG_X86_IO_APIC) + if (smp_found_config) { + disable_IO_APIC(); + } +#endif + disconnect_bsp_APIC(); +} + /* * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. diff -puN arch/i386/kernel/dmi_scan.c~kexec arch/i386/kernel/dmi_scan.c --- 25/arch/i386/kernel/dmi_scan.c~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/arch/i386/kernel/dmi_scan.c 2003-05-05 19:05:58.000000000 -0700 @@ -220,31 +220,6 @@ static __init int set_bios_reboot(struct return 0; } -/* - * Some machines require the "reboot=s" commandline option, this quirk makes that automatic. - */ -static __init int set_smp_reboot(struct dmi_blacklist *d) -{ -#ifdef CONFIG_SMP - extern int reboot_smp; - if (reboot_smp == 0) - { - reboot_smp = 1; - printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident); - } -#endif - return 0; -} - -/* - * Some machines require the "reboot=b,s" commandline option, this quirk makes that automatic. 
- */ -static __init int set_smp_bios_reboot(struct dmi_blacklist *d) -{ - set_smp_reboot(d); - set_bios_reboot(d); - return 0; -} /* * Some bioses have a broken protected mode poweroff and need to use realmode @@ -554,7 +529,7 @@ static __initdata struct dmi_blacklist d MATCH(DMI_BIOS_VERSION, "4.60 PGMA"), MATCH(DMI_BIOS_DATE, "134526184"), NO_MATCH } }, - { set_smp_bios_reboot, "Dell PowerEdge 1300", { /* Handle problems with rebooting on Dell 1300's */ + { set_bios_reboot, "Dell PowerEdge 1300", { /* Handle problems with rebooting on Dell 1300's */ MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"), NO_MATCH, NO_MATCH diff -puN arch/i386/kernel/entry.S~kexec arch/i386/kernel/entry.S --- 25/arch/i386/kernel/entry.S~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/arch/i386/kernel/entry.S 2003-05-05 19:05:58.000000000 -0700 @@ -879,6 +879,7 @@ ENTRY(sys_call_table) .long sys_clock_getres .long sys_clock_nanosleep .long sys_mknod64 + .long sys_kexec_load nr_syscalls=(.-sys_call_table)/4 diff -puN arch/i386/kernel/i8259.c~kexec arch/i386/kernel/i8259.c --- 25/arch/i386/kernel/i8259.c~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/arch/i386/kernel/i8259.c 2003-05-05 19:05:58.000000000 -0700 @@ -245,10 +245,21 @@ static int i8259A_resume(struct device * return 0; } +static void i8259A_shutdown(struct device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. 
+ */ + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-2 */ +} + static struct device_driver i8259A_driver = { .name = "pic", .bus = &system_bus_type, .resume = i8259A_resume, + .shutdown = i8259A_shutdown, }; static struct sys_device device_i8259A = { diff -puN arch/i386/kernel/io_apic.c~kexec arch/i386/kernel/io_apic.c --- 25/arch/i386/kernel/io_apic.c~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/arch/i386/kernel/io_apic.c 2003-05-05 19:05:58.000000000 -0700 @@ -1548,8 +1548,6 @@ void disable_IO_APIC(void) * Clear the IO-APIC before rebooting: */ clear_IO_APIC(); - - disconnect_bsp_APIC(); } /* diff -puN /dev/null arch/i386/kernel/machine_kexec.c --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25-akpm/arch/i386/kernel/machine_kexec.c 2003-05-05 19:05:58.000000000 -0700 @@ -0,0 +1,118 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * machine_kexec + * ======================= + */ + + +static void set_idt(void *newidt, __u16 limit) +{ + unsigned char curidt[6]; + + /* 6-byte descriptor: limit, then base; ia32 supports unaligned loads & stores */ + (*(__u16 *)(curidt)) = limit; + (*(__u32 *)(curidt +2)) = (unsigned long)(newidt); + + __asm__ __volatile__ ( + "lidt %0\n" + : : "m" (curidt) /* input operand: lidt only reads the descriptor */ + ); +} + + +static void set_gdt(void *newgdt, __u16 limit) +{ + unsigned char curgdt[6]; + + /* 6-byte descriptor: limit, then base; ia32 supports unaligned loads & stores */ + (*(__u16 *)(curgdt)) = limit; + (*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt); + + __asm__ __volatile__ ( + "lgdt %0\n" + : : "m" (curgdt) /* input operand: lgdt only reads the descriptor */ + ); +} + +static void load_segments(void) +{ +#define __STR(X) #X +#define STR(X) __STR(X) + + __asm__ __volatile__ ( + "\tljmp $"STR(__KERNEL_CS)",$1f\n" + "\t1:\n" + "\tmovl $"STR(__KERNEL_DS)",%eax\n" + "\tmovl %eax,%ds\n" + "\tmovl %eax,%es\n" + "\tmovl %eax,%fs\n" + "\tmovl %eax,%gs\n" + "\tmovl %eax,%ss\n" + ); +#undef STR +#undef __STR +} + +typedef void (*relocate_new_kernel_t)( + unsigned long indirection_page, 
unsigned long reboot_code_buffer, + unsigned long start_address); + +const extern unsigned char relocate_new_kernel[]; +extern void relocate_new_kernel_end(void); +const extern unsigned int relocate_new_kernel_size; + +void machine_kexec(struct kimage *image) +{ + unsigned long indirection_page; + unsigned long reboot_code_buffer; + relocate_new_kernel_t rnk; + + /* switch to an mm where the reboot_code_buffer is identity mapped */ + + extern void use_mm(struct mm_struct *mm); + use_mm(&init_mm); + + stop_apics(); + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + reboot_code_buffer = page_to_pfn(image->reboot_code_pages) << PAGE_SHIFT; + indirection_page = image->head & PAGE_MASK; + + /* copy it out */ + memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size); + + /* The segment registers are funny things, they are + * automatically loaded from a table, in memory wherever you + * set them to a specific selector, but this table is never + * accessed again you set the segment to a different selector. + * + * The more common model is are caches where the behide + * the scenes work is done, but is also dropped at arbitrary + * times. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. 
+ */ + set_gdt(phys_to_virt(0),0); + set_idt(phys_to_virt(0),0); + + /* now call it */ + rnk = (relocate_new_kernel_t) reboot_code_buffer; + (*rnk)(indirection_page, reboot_code_buffer, image->start); +} diff -puN arch/i386/kernel/Makefile~kexec arch/i386/kernel/Makefile --- 25/arch/i386/kernel/Makefile~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/arch/i386/kernel/Makefile 2003-05-05 19:05:58.000000000 -0700 @@ -24,6 +24,7 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoli obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o suspend_asm.o obj-$(CONFIG_X86_NUMAQ) += numaq.o obj-$(CONFIG_EDD) += edd.o diff -puN arch/i386/kernel/reboot.c~kexec arch/i386/kernel/reboot.c --- 25/arch/i386/kernel/reboot.c~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/arch/i386/kernel/reboot.c 2003-05-05 19:05:58.000000000 -0700 @@ -8,6 +8,7 @@ #include #include #include +#include #include "mach_reboot.h" /* @@ -20,8 +21,7 @@ static int reboot_mode; int reboot_thru_bios; #ifdef CONFIG_SMP -int reboot_smp = 0; -static int reboot_cpu = -1; +int reboot_cpu = -1; /* specifies the internal linux cpu id, not the apicid */ /* shamelessly grabbed from lib/vsprintf.c for readability */ #define is_digit(c) ((c) >= '0' && (c) <= '9') #endif @@ -43,7 +43,6 @@ static int __init reboot_setup(char *str break; #ifdef CONFIG_SMP case 's': /* "smp" reboot by executing reset on BSP or other CPU*/ - reboot_smp = 1; if (is_digit(*(str+1))) { reboot_cpu = (int) (*(str+1) - '0'); if (is_digit(*(str+2))) @@ -215,42 +214,7 @@ void machine_real_restart(unsigned char void machine_restart(char * __unused) { -#if CONFIG_SMP - int cpuid; - - cpuid = GET_APIC_ID(apic_read(APIC_ID)); - - if (reboot_smp) { - - /* check to see if reboot_cpu is valid - if its not, default to the BSP */ - if ((reboot_cpu == -1) || - (reboot_cpu > (NR_CPUS -1)) || 
- !(phys_cpu_present_map & (1< +#include + + /* Must be relocatable PIC code callable as a C function, that once + * it starts can not use the previous processes stack. + * + */ + .globl relocate_new_kernel +relocate_new_kernel: + /* read the arguments and say goodbye to the stack */ + movl 4(%esp), %ebx /* indirection_page */ + movl 8(%esp), %ebp /* reboot_code_buffer */ + movl 12(%esp), %edx /* start address */ + + /* zero out flags, and disable interrupts */ + pushl $0 + popfl + + /* set a new stack at the bottom of our page... */ + lea 4096(%ebp), %esp + + /* store the parameters back on the stack */ + pushl %edx /* store the start address */ + + /* Set cr0 to a known state: + * 31 0 == Paging disabled + * 18 0 == Alignment check disabled + * 16 0 == Write protect disabled + * 3 0 == No task switch + * 2 0 == Don't do FP software emulation. + * 0 1 == Proctected mode enabled + */ + movl %cr0, %eax + andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax + orl $(1<<0), %eax + movl %eax, %cr0 + + /* Set cr4 to a known state: + * Setting everything to zero seems safe. + */ + movl %cr4, %eax + andl $0, %eax + movl %eax, %cr4 + + jmp 1f +1: + + /* Flush the TLB (needed?) 
*/ + xorl %eax, %eax + movl %eax, %cr3 + + /* Do the copies */ + cld +0: /* top, read another word for the indirection page */ + movl %ebx, %ecx + movl (%ebx), %ecx + addl $4, %ebx + testl $0x1, %ecx /* is it a destination page */ + jz 1f + movl %ecx, %edi + andl $0xfffff000, %edi + jmp 0b +1: + testl $0x2, %ecx /* is it an indirection page */ + jz 1f + movl %ecx, %ebx + andl $0xfffff000, %ebx + jmp 0b +1: + testl $0x4, %ecx /* is it the done indicator */ + jz 1f + jmp 2f +1: + testl $0x8, %ecx /* is it the source indicator */ + jz 0b /* Ignore it otherwise */ + movl %ecx, %esi /* For every source page do a copy */ + andl $0xfffff000, %esi + + movl $1024, %ecx + rep ; movsl + jmp 0b + +2: + + /* To be certain of avoiding problems with self modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB, it's handy, and not processor dependent. + */ + xorl %eax, %eax + movl %eax, %cr3 + + /* set all of the registers to known values */ + /* leave %esp alone */ + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %edi, %edi + xorl %ebp, %ebp + ret +relocate_new_kernel_end: + + .globl relocate_new_kernel_size +relocate_new_kernel_size: + .long relocate_new_kernel_end - relocate_new_kernel diff -puN arch/i386/kernel/smp.c~kexec arch/i386/kernel/smp.c --- 25/arch/i386/kernel/smp.c~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/arch/i386/kernel/smp.c 2003-05-05 19:05:58.000000000 -0700 @@ -568,6 +568,30 @@ static void stop_this_cpu (void * dummy) void smp_send_stop(void) { + extern int reboot_cpu; + int reboot_cpu_id; + + /* The boot cpu is always logical cpu 0 */ + reboot_cpu_id = 0; + + /* See if there has been give a command line override . 
+ */ + if ((reboot_cpu != -1) && !(reboot_cpu >= NR_CPUS) && + test_bit(reboot_cpu, &cpu_online_map)) { + reboot_cpu_id = reboot_cpu; + } + + /* Make certain the the cpu I'm rebooting on is online */ + if (!test_bit(reboot_cpu_id, &cpu_online_map)) { + reboot_cpu_id = smp_processor_id(); + } + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed(current, 1 << reboot_cpu_id); + + /* O.k. Now that I'm on the appropriate processor stop + * all of the others. + */ smp_call_function(stop_this_cpu, NULL, 1, 0); local_irq_disable(); diff -puN fs/aio.c~kexec fs/aio.c --- 25/fs/aio.c~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/fs/aio.c 2003-05-05 19:05:58.000000000 -0700 @@ -539,7 +539,7 @@ struct kioctx *lookup_ioctx(unsigned lon return ioctx; } -static void use_mm(struct mm_struct *mm) +void use_mm(struct mm_struct *mm) { struct mm_struct *active_mm = current->active_mm; atomic_inc(&mm->mm_count); diff -puN include/asm-i386/apicdef.h~kexec include/asm-i386/apicdef.h --- 25/include/asm-i386/apicdef.h~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/include/asm-i386/apicdef.h 2003-05-05 19:05:58.000000000 -0700 @@ -86,6 +86,7 @@ #define APIC_LVT_REMOTE_IRR (1<<14) #define APIC_INPUT_POLARITY (1<<13) #define APIC_SEND_PENDING (1<<12) +#define APIC_MODE_MASK 0x700 #define GET_APIC_DELIVERY_MODE(x) (((x)>>8)&0x7) #define SET_APIC_DELIVERY_MODE(x,y) (((x)&~0x700)|((y)<<8)) #define APIC_MODE_FIXED 0x0 diff -puN include/asm-i386/apic.h~kexec include/asm-i386/apic.h --- 25/include/asm-i386/apic.h~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/include/asm-i386/apic.h 2003-05-05 19:05:58.000000000 -0700 @@ -99,6 +99,9 @@ extern unsigned int nmi_watchdog; #define NMI_LOCAL_APIC 2 #define NMI_INVALID 3 +extern void stop_apics(void); +#else +static inline void stop_apics(void) { } #endif /* CONFIG_X86_LOCAL_APIC */ extern int phys_proc_id[NR_CPUS]; diff -puN /dev/null include/asm-i386/kexec.h --- /dev/null 2002-08-30 16:31:37.000000000 
-0700 +++ 25-akpm/include/asm-i386/kexec.h 2003-05-05 19:05:58.000000000 -0700 @@ -0,0 +1,23 @@ +#ifndef _I386_KEXEC_H +#define _I386_KEXEC_H + +#include + +/* + * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. + * I.e. Maximum page that is mapped directly into kernel memory, + * and kmap is not required. + * + * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct + * calculation for the amount of memory directly mappable into the + * kernel memory space. + */ + +/* Maximum physical address we can use pages from */ +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) +/* Maximum address we can reach in physical address mode */ +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL) + +#define KEXEC_REBOOT_CODE_SIZE 4096 + +#endif /* _I386_KEXEC_H */ diff -puN include/asm-i386/unistd.h~kexec include/asm-i386/unistd.h --- 25/include/asm-i386/unistd.h~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/include/asm-i386/unistd.h 2003-05-05 19:05:58.000000000 -0700 @@ -274,8 +274,9 @@ #define __NR_clock_getres (__NR_timer_create+7) #define __NR_clock_nanosleep (__NR_timer_create+8) #define __NR_sys_mknod64 268 +#define __NR_sys_kexec_load 269 -#define NR_syscalls 269 +#define NR_syscalls 270 /* user-visible error numbers are in the range -1 - -124: see */ diff -puN /dev/null include/linux/kexec.h --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25-akpm/include/linux/kexec.h 2003-05-05 19:05:58.000000000 -0700 @@ -0,0 +1,54 @@ +#ifndef LINUX_KEXEC_H +#define LINUX_KEXEC_H + +#if CONFIG_KEXEC +#include +#include +#include + +/* + * This structure is used to hold the arguments that are used when loading + * kernel binaries. 
+ */ + +typedef unsigned long kimage_entry_t; +#define IND_DESTINATION 0x1 +#define IND_INDIRECTION 0x2 +#define IND_DONE 0x4 +#define IND_SOURCE 0x8 + +#define KEXEC_SEGMENT_MAX 8 +struct kexec_segment { + void *buf; + size_t bufsz; + void *mem; + size_t memsz; +}; + +struct kimage { + kimage_entry_t head; + kimage_entry_t *entry; + kimage_entry_t *last_entry; + + unsigned long destination; + unsigned long offset; + + unsigned long start; + struct page *reboot_code_pages; + + unsigned long nr_segments; + struct kexec_segment segment[KEXEC_SEGMENT_MAX+1]; + + struct list_head dest_pages; + struct list_head unuseable_pages; +}; + + +/* kexec interface functions */ +extern void machine_kexec(struct kimage *image); +extern asmlinkage long sys_kexec(unsigned long entry, long nr_segments, + struct kexec_segment *segments); +extern struct kimage *kexec_image; +#endif +#endif /* LINUX_KEXEC_H */ + diff -puN include/linux/reboot.h~kexec include/linux/reboot.h --- 25/include/linux/reboot.h~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/include/linux/reboot.h 2003-05-05 19:05:58.000000000 -0700 @@ -21,6 +21,7 @@ * POWER_OFF Stop OS and remove all power from system, if possible. * RESTART2 Restart system using given command string. * SW_SUSPEND Suspend system using Software Suspend if compiled in + * KEXEC Restart the system using a different kernel. 
*/ #define LINUX_REBOOT_CMD_RESTART 0x01234567 @@ -30,6 +31,7 @@ #define LINUX_REBOOT_CMD_POWER_OFF 0x4321FEDC #define LINUX_REBOOT_CMD_RESTART2 0xA1B2C3D4 #define LINUX_REBOOT_CMD_SW_SUSPEND 0xD000FCE2 +#define LINUX_REBOOT_CMD_KEXEC 0x45584543 #ifdef __KERNEL__ diff -puN /dev/null kernel/kexec.c --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25-akpm/kernel/kexec.c 2003-05-05 19:05:58.000000000 -0700 @@ -0,0 +1,629 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* When kexec transitions to the new kernel there is a one to one + * mapping between physical and virtual addresses. On processors + * where you can disable the MMU this is trivial, and easy. For + * others it is still a simple predictable page table to setup. + * + * In that environment kexec copies the new kernel to it's final + * resting place. This means I can only support memory whose + * physical address can fit in an unsigned long. In particular + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. + * If the assembly stub has more restrictive requirements + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be + * defined more restrictively in . + * + * The code for the transition from the current kernel to the + * the new kernel is placed in the reboot_code_buffer, whose size + * is given by KEXEC_REBOOT_CODE_SIZE. In the best case only a single + * page of memory is necessary, but some architectures require more. + * Because this memory must be identity mapped in the transition from + * virtual to physical addresses it must live in the range + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily + * modifyable. + * + * The assembly stub in the reboot code buffer is passed a linked list + * of descriptor pages detailing the source pages of the new kernel, + * and the destination addresses of those source pages. 
As this data + * structure is not used in the context of the current OS, it must + * be self contained. + * + * The code has been made to work with highmem pages and will use a + * destination page in it's final resting place (if it happens + * to allocate it). The end product of this is that most of the + * physical address space, and most of ram can be used. + * + * Future directions include: + * - allocating a page table with the reboot code buffer identity + * mapped, to simplify machine_kexec and make kexec_on_panic, more + * reliable. + * - allocating the pages for a page table for machines that cannot + * disable their MMUs. (Hammer, Alpha...) + */ + +/* KIMAGE_NO_DEST is an impossible destination address..., for + * allocating pages whose destination address we do not care about. + */ +#define KIMAGE_NO_DEST (-1UL) + +static int kimage_is_destination_range( + struct kimage *image, unsigned long start, unsigned long end); +static struct page *kimage_alloc_reboot_code_pages(struct kimage *image); +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest); + + +static int kimage_alloc(struct kimage **rimage, + unsigned long nr_segments, struct kexec_segment *segments) +{ + int result; + struct kimage *image; + size_t segment_bytes; + struct page *reboot_pages; + unsigned long i; + + /* Allocate a controlling structure */ + result = -ENOMEM; + image = kmalloc(sizeof(*image), GFP_KERNEL); + if (!image) { + goto out; + } + memset(image, 0, sizeof(*image)); + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unuseable pages */ + INIT_LIST_HEAD(&image->unuseable_pages); + + /* Read in the segments */ + image->nr_segments = nr_segments; + segment_bytes = nr_segments * sizeof*segments; + result = copy_from_user(image->segment, segments, segment_bytes); + if (result) + goto 
out; + + /* Verify we have good destination addresses. The caller is + * responsible for making certain we don't attempt to load + * the new image into invalid or reserved areas of RAM. This + * just verifies it is an address we can use. + */ + result = -EADDRNOTAVAIL; + for(i = 0; i < nr_segments; i++) { + unsigned long mend; + mend = ((unsigned long)(image->segment[i].mem)) + + image->segment[i].memsz; + if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) + goto out; + } + + /* Find a location for the reboot code buffer, and add it to + * the vector of segments so that its pages will also be + * counted as destination pages. + */ + result = -ENOMEM; + reboot_pages = kimage_alloc_reboot_code_pages(image); + if (!reboot_pages) { + printk(KERN_ERR "Could not allocate reboot_code_buffer\n"); + goto out; + } + image->reboot_code_pages = reboot_pages; + image->segment[nr_segments].buf = 0; + image->segment[nr_segments].bufsz = 0; + image->segment[nr_segments].mem = (void *)(page_to_pfn(reboot_pages) << PAGE_SHIFT); + image->segment[nr_segments].memsz = KEXEC_REBOOT_CODE_SIZE; + image->nr_segments++; + + result = 0; + out: + if (result == 0) { + *rimage = image; + } else { + kfree(image); + } + return result; +} + +static int kimage_is_destination_range( + struct kimage *image, unsigned long start, unsigned long end) +{ + unsigned long i; + for(i = 0; i < image->nr_segments; i++) { + unsigned long mstart, mend; + mstart = (unsigned long)image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + if ((end > mstart) && (start < mend)) { + return 1; + } + } + return 0; +} + +#ifdef CONFIG_MMU +static int identity_map_pages(struct page *pages, int order) +{ + struct mm_struct *mm; + struct vm_area_struct *vma; + int error; + mm = &init_mm; + vma = 0; + + down_write(&mm->mmap_sem); + error = -ENOMEM; + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) { + goto out; + } + + memset(vma, 0, sizeof(*vma)); /* clear the whole structure, not just pointer-size bytes */ + vma->vm_mm = mm; + vma->vm_start = page_to_pfn(pages) << 
PAGE_SHIFT; + vma->vm_end = vma->vm_start + (1 << (order + PAGE_SHIFT)); + vma->vm_ops = 0; + vma->vm_flags = VM_SHARED \ + | VM_READ | VM_WRITE | VM_EXEC \ + | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC \ + | VM_DONTCOPY | VM_RESERVED; + vma->vm_page_prot = protection_map[vma->vm_flags & 0xf]; + vma->vm_file = NULL; + vma->vm_private_data = NULL; + INIT_LIST_HEAD(&vma->shared); + insert_vm_struct(mm, vma); + + error = remap_page_range(vma, vma->vm_start, vma->vm_start, + vma->vm_end - vma->vm_start, vma->vm_page_prot); + if (error) { + goto out; + } + + error = 0; + out: + if (error && vma) { + kmem_cache_free(vm_area_cachep, vma); + vma = 0; + } + up_write(&mm->mmap_sem); + + return error; +} +#else +#define identity_map_pages(pages, order) 0 +#endif + +struct page *kimage_alloc_reboot_code_pages(struct kimage *image) +{ + /* The reboot code buffer is special. It is the only set of + * pages that must be allocated in their final resting place, + * and the only set of pages whose final resting place we can + * pick. + * + * At worst this runs in O(N) of the image size. + */ + struct list_head extra_pages, *pos, *next; + struct page *pages; + unsigned long addr; + int order, count; + order = get_order(KEXEC_REBOOT_CODE_SIZE); + count = 1 << order; + INIT_LIST_HEAD(&extra_pages); + do { + int i; + pages = alloc_pages(GFP_HIGHUSER, order); + if (!pages) + break; + for(i = 0; i < count; i++) { + SetPageReserved(pages +i); + } + addr = page_to_pfn(pages) << PAGE_SHIFT; + if ((page_to_pfn(pages) >= (TASK_SIZE >> PAGE_SHIFT)) || + kimage_is_destination_range(image, addr, addr + KEXEC_REBOOT_CODE_SIZE)) { + list_add(&pages->list, &extra_pages); + pages = 0; + } + } while(!pages); + if (pages) { + int result; + result = identity_map_pages(pages, order); + if (result < 0) { + list_add(&pages->list, &extra_pages); + pages = 0; + } + } + /* If I could convert a multi page allocation into a buch of + * single page allocations I could add these pages to + * image->dest_pages. 
For now it is simpler to just free the + * pages again. + */ + list_for_each_safe(pos, next, &extra_pages) { + struct page *page; + int i; + page = list_entry(pos, struct page, list); + for(i = 0; i < count; i++) { + ClearPageReserved(page +i); /* the rejected block being freed, not the one we return */ + } + list_del(&page->list); /* unlink this entry; the list head itself must stay intact */ + __free_pages(page, order); + } + return pages; +} + +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) +{ + if (image->offset != 0) { + image->entry++; + } + if (image->entry == image->last_entry) { + kimage_entry_t *ind_page; + struct page *page; + page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); + if (!page) { + return -ENOMEM; + } + ind_page = page_address(page); + *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + image->entry = ind_page; + image->last_entry = + ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); + } + *image->entry = entry; + image->entry++; + image->offset = 0; + return 0; +} + +static int kimage_set_destination( + struct kimage *image, unsigned long destination) +{ + int result; + destination &= PAGE_MASK; + result = kimage_add_entry(image, destination | IND_DESTINATION); + if (result == 0) { + image->destination = destination; + } + return result; +} + + +static int kimage_add_page(struct kimage *image, unsigned long page) +{ + int result; + page &= PAGE_MASK; + result = kimage_add_entry(image, page | IND_SOURCE); + if (result == 0) { + image->destination += PAGE_SIZE; + } + return result; +} + + +static void kimage_free_extra_pages(struct kimage *image) +{ + /* Walk through and free any extra destination pages I may have */ + struct list_head *pos, *next; + list_for_each_safe(pos, next, &image->dest_pages) { + struct page *page; + page = list_entry(pos, struct page, list); + list_del(&page->list); + ClearPageReserved(page); + __free_page(page); + } + /* Walk through and free any unuseable pages I have cached */ + list_for_each_safe(pos, next, &image->unuseable_pages) { + struct page *page; + page = list_entry(pos, struct 
page, list); + list_del(&page->list); + ClearPageReserved(page); + __free_page(page); + } + +} +static int kimage_terminate(struct kimage *image) +{ + int result; + result = kimage_add_entry(image, IND_DONE); + if (result == 0) { + /* Point at the terminating element */ + image->entry--; + kimage_free_extra_pages(image); + } + return result; +} + +#define for_each_kimage_entry(image, ptr, entry) \ + for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ + ptr = (entry & IND_INDIRECTION)? \ + phys_to_virt((entry & PAGE_MASK)): ptr +1) + +static void kimage_free(struct kimage *image) +{ + kimage_entry_t *ptr, entry; + kimage_entry_t ind = 0; + int i, count, order; + if (!image) + return; + kimage_free_extra_pages(image); + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_INDIRECTION) { + /* Free the previous indirection page */ + if (ind & IND_INDIRECTION) { + free_page((unsigned long)phys_to_virt(ind & PAGE_MASK)); + } + /* Save this indirection page until we are + * done with it. 
+ */ + ind = entry; + } + else if (entry & IND_SOURCE) { + free_page((unsigned long)phys_to_virt(entry & PAGE_MASK)); + } + } + order = get_order(KEXEC_REBOOT_CODE_SIZE); + count = 1 << order; + do_munmap(&init_mm, + page_to_pfn(image->reboot_code_pages) << PAGE_SHIFT, + count << PAGE_SHIFT); + for(i = 0; i < count; i++) { + ClearPageReserved(image->reboot_code_pages + i); + } + __free_pages(image->reboot_code_pages, order); + kfree(image); +} + +static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page) +{ + kimage_entry_t *ptr, entry; + unsigned long destination = 0; + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_DESTINATION) { + destination = entry & PAGE_MASK; + } + else if (entry & IND_SOURCE) { + if (page == destination) { + return ptr; + } + destination += PAGE_SIZE; + } + } + return 0; +} + +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination) +{ + /* Here we implement safeguards to ensure that a source page + * is not copied to its destination page before the data on + * the destination page is no longer useful. + * + * To do this we maintain the invariant that a source page is + * either its own destination page, or it is not a + * destination page at all. + * + * That is slightly stronger than required, but the proof + * that no problems will occur is trivial, and the + * implementation is simple to verify. + * + * When allocating all pages normally this algorithm will run + * in O(N) time, but in the worst case it will run in O(N^2) + * time. If the runtime is a problem the data structures can + * be fixed. + */ + struct page *page; + unsigned long addr; + + /* Walk through the list of destination pages, and see if I + * have a match. 
+ */ + list_for_each_entry(page, &image->dest_pages, list) { + addr = page_to_pfn(page) << PAGE_SHIFT; + if (addr == destination) { + list_del(&page->list); + return page; + } + } + page = 0; + while(1) { + kimage_entry_t *old; + /* Allocate a page, if we run out of memory give up */ + page = alloc_page(gfp_mask); + if (!page) { + return 0; + } + SetPageReserved(page); + /* If the page cannot be used, file it away */ + if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { + list_add(&page->list, &image->unuseable_pages); + continue; + } + addr = page_to_pfn(page) << PAGE_SHIFT; + + /* If it is the destination page we want, use it */ + if (addr == destination) + break; + + /* If the page is not a destination page, use it */ + if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE)) + break; + + /* I know that the page is someone's destination page. + * See if there is already a source page for this + * destination page, and if so swap the source pages. + */ + old = kimage_dst_used(image, addr); + if (old) { + /* If so move it */ + unsigned long old_addr; + struct page *old_page; + + old_addr = *old & PAGE_MASK; + old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + copy_highpage(page, old_page); + *old = addr | (*old & ~PAGE_MASK); + + /* The old page I have found cannot be a + * destination page, so return it. + */ + addr = old_addr; + page = old_page; + break; + } + else { + /* Place the page on the destination list; I + * will use it later. 
+ */ + list_add(&page->list, &image->dest_pages); + } + } + return page; +} + +static int kimage_load_segment(struct kimage *image, + struct kexec_segment *segment) +{ + unsigned long mstart; + int result; + unsigned long offset; + unsigned long offset_end; + unsigned char *buf; + + result = 0; + buf = segment->buf; + mstart = (unsigned long)segment->mem; + + offset_end = segment->memsz; + + result = kimage_set_destination(image, mstart); + if (result < 0) { + goto out; + } + for(offset = 0; offset < segment->memsz; offset += PAGE_SIZE) { + struct page *page; + char *ptr; + size_t size, leader; + page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset); + if (page == 0) { + result = -ENOMEM; + goto out; + } + result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT); + if (result < 0) { + goto out; + } + ptr = kmap(page); + if (segment->bufsz < offset) { + /* We are past the end of the user buffer, zero the whole page */ + memset(ptr, 0, PAGE_SIZE); + kunmap(page); + continue; + } + size = PAGE_SIZE; + leader = 0; + if ((offset == 0)) { + leader = mstart & ~PAGE_MASK; + } + if (leader) { + /* We are on the first page, zero the unused leading portion */ + memset(ptr, 0, leader); + size -= leader; + ptr += leader; + } + if (size > (segment->bufsz - offset)) { + size = segment->bufsz - offset; + } + if (size < (PAGE_SIZE - leader)) { + /* zero the trailing part of the page */ + memset(ptr + size, 0, (PAGE_SIZE - leader) - size); + } + result = copy_from_user(ptr, buf + offset, size); + kunmap(page); + if (result) { + result = (result < 0)?result : -EIO; + goto out; + } + } + out: + return result; +} + +/* + * Exec Kernel system call: for obvious reasons only a CAP_SYS_ADMIN caller may call it. + * + * This call breaks up into three pieces. + * - A generic part which loads the new kernel from the current + * address space, and very carefully places the data in the + * allocated pages. + * + * - A generic part that interacts with the kernel and tells all of + * the devices to shut down. 
Preventing on-going dmas, and placing + * the devices in a consistent state so a later kernel can + * reinitialize them. + * + * - A machine specific part that includes the syscall number + * and then copies the image to its final destination, and + * jumps into the image at entry. + * + * kexec does not sync, or unmount filesystems so if you need + * that to happen you need to do that yourself. + */ +struct kimage *kexec_image = 0; + +asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, + struct kexec_segment *segments, unsigned long flags) +{ + struct kimage *image; + int result; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* The flags argument is reserved in case we need just a little + * bit of special behavior for reboot on panic; it must be 0 for now. + */ + if (flags != 0) + return -EINVAL; + + if (nr_segments > KEXEC_SEGMENT_MAX) + return -EINVAL; + image = 0; + + result = 0; + if (nr_segments > 0) { + unsigned long i; + result = kimage_alloc(&image, nr_segments, segments); + if (result) { + goto out; + } + image->start = entry; + for(i = 0; i < nr_segments; i++) { + result = kimage_load_segment(image, &segments[i]); + if (result) { + goto out; + } + } + result = kimage_terminate(image); + if (result) { + goto out; + } + } + + image = xchg(&kexec_image, image); + + out: + kimage_free(image); + return result; +} diff -puN kernel/Makefile~kexec kernel/Makefile --- 25/kernel/Makefile~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/kernel/Makefile 2003-05-05 19:05:58.000000000 -0700 @@ -18,6 +18,7 @@ obj-$(CONFIG_PM) += pm.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o +obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o ifneq ($(CONFIG_IA64),y) diff -puN kernel/sys.c~kexec kernel/sys.c --- 25/kernel/sys.c~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/kernel/sys.c 2003-05-05 19:05:58.000000000 -0700 @@ -16,6 +16,7 @@ 
#include #include #include +#include #include #include #include @@ -207,6 +208,7 @@ cond_syscall(sys_acct) cond_syscall(sys_lookup_dcookie) cond_syscall(sys_swapon) cond_syscall(sys_swapoff) +cond_syscall(sys_kexec_load) cond_syscall(sys_init_module) cond_syscall(sys_delete_module) cond_syscall(sys_socketpair) @@ -443,6 +445,27 @@ asmlinkage long sys_reboot(int magic1, i machine_restart(buffer); break; +#ifdef CONFIG_KEXEC + case LINUX_REBOOT_CMD_KEXEC: + { + struct kimage *image; + if (arg) { + unlock_kernel(); + return -EINVAL; + } + image = xchg(&kexec_image, 0); + if (!image) { + unlock_kernel(); + return -EINVAL; + } + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); + system_running = 0; + device_shutdown(); + printk(KERN_EMERG "Starting new kernel\n"); + machine_kexec(image); + break; + } +#endif #ifdef CONFIG_SOFTWARE_SUSPEND case LINUX_REBOOT_CMD_SW_SUSPEND: if (!software_suspend_enabled) { diff -puN MAINTAINERS~kexec MAINTAINERS --- 25/MAINTAINERS~kexec 2003-05-05 19:05:58.000000000 -0700 +++ 25-akpm/MAINTAINERS 2003-05-05 19:05:58.000000000 -0700 @@ -1068,6 +1068,14 @@ W: http://nfs.sourceforge.net/ W: http://www.cse.unsw.edu.au/~neilb/patches/linux-devel/ S: Maintained +KEXEC +P: Eric Biederman +M: ebiederm@xmission.com +M: ebiederman@lnxi.com +W: http://www.xmission.com/~ebiederm/files/kexec/ +L: linux-kernel@vger.kernel.org +S: Maintained + LANMEDIA WAN CARD DRIVER P: Andrew Stanley-Jones M: asj@lanmedia.com _