diff -urN 2.2.18pre14aa1/Documentation/Configure.help z/Documentation/Configure.help --- 2.2.18pre14aa1/Documentation/Configure.help Tue Oct 3 01:35:32 2000 +++ z/Documentation/Configure.help Tue Oct 3 03:58:04 2000 @@ -1738,10 +1738,10 @@ all x86 CPU types (albeit not optimally fast), you can specify "386" here. - If you specify one of "486" or "586" or "Pentium" or "PPro", then - the kernel will not necessarily run on earlier architectures (e.g. a - Pentium optimized kernel will run on a PPro, but not necessarily on - a i486). + If you specify one of "486" or "586" or "Pentium" or "PPro" or "PIII", + then the kernel will not necessarily run on earlier architectures + (e.g. a Pentium optimized kernel will run on a PPro, but not necessarily + on a i486). Here are the settings recommended for greatest speed: - "386" for the AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI @@ -1755,8 +1755,30 @@ K6-3D. - "PPro" for the Cyrix/IBM/National Semiconductor 6x86MX, MII and Intel Pentium II/Pentium Pro. + - "PIII/Xeon/Deschutes" for the PIII (Katmai), Xeon and later PIIs + with the Deschutes or Mendocino core. You have to chose this for + MMX2 support. If you don't know what to do, choose "386". + +Disable PII/PIII Serial Number at bootup +CONFIG_X86_PN_OFF + This makes the kernel disable the CPUID serial number that is embedded on + the new PIII CPUs at bootup. + +Enable PII/PIII Extended Fast FPU save and restore support +CONFIG_X86_FX + This enables use of the new PII/PIII FXSAVE/FXRSTOR support. This item + is required to make use of the new PIII 128bit XMM registers. It is safe + to leave this enabled all the time. + +Enable CPU Specific (MMX/MMX2) Optimizations +CONFIG_X86_CPU_OPTIMIZATIONS + This enables use of the MMX registers and 128bit MMX2 registers on CPUs + that can support the new instructions (Pentium/AMD K6 or newer). In + order to support the Pentium III 128 bit XMM registers you must enable + both this and PII/PIII Extended Fast FPU save support. It is safe to + leave this enabled all the time. 
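All three of the new options above only do something when the CPU actually advertises the matching CPUID feature bits: FXSR is bit 24, XMM (the 128-bit MMX2/KNI registers) is bit 25, and the serial number is bit 18 of the CPUID level 1 EDX word -- the same masks this patch adds to processor.h as X86_FEATURE_FXSR, X86_FEATURE_XMM and X86_FEATURE_PN. A minimal user-space sketch (assuming GCC inline asm on a CPUID-capable x86; not part of the patch) that reads those bits:

#include <stdio.h>

/* EDX feature bits from CPUID level 1, same masks as processor.h below */
#define FEATURE_PN    (1 << 18)   /* CPU serial number present */
#define FEATURE_FXSR  (1 << 24)   /* FXSAVE/FXRSTOR supported */
#define FEATURE_XMM   (1 << 25)   /* 128-bit XMM (MMX2/KNI) registers */

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* CPUID level 1 returns the feature flags in EDX */
        __asm__ __volatile__("cpuid"
                             : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                             : "a" (1));
        printf("FXSR: %s  XMM: %s  PN: %s\n",
               (edx & FEATURE_FXSR) ? "yes" : "no",
               (edx & FEATURE_XMM)  ? "yes" : "no",
               (edx & FEATURE_PN)   ? "yes" : "no");
        return 0;
}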
VGA text console CONFIG_VGA_CONSOLE diff -urN 2.2.18pre14aa1/arch/i386/Makefile z/arch/i386/Makefile --- 2.2.18pre14aa1/arch/i386/Makefile Wed Aug 2 19:24:47 2000 +++ z/arch/i386/Makefile Tue Oct 3 03:58:03 2000 @@ -43,6 +43,10 @@ CFLAGS := $(CFLAGS) -m486 -malign-loops=2 -malign-jumps=2 -malign-functions=2 -DCPU=686 endif +ifdef CONFIG_M686FX +CFLAGS := $(CFLAGS) -m486 -malign-loops=0 -malign-jumps=0 -malign-functions=0 -DCPU=686 +endif + HEAD := arch/i386/kernel/head.o arch/i386/kernel/init_task.o SUBDIRS := $(SUBDIRS) arch/i386/kernel arch/i386/mm arch/i386/lib diff -urN 2.2.18pre14aa1/arch/i386/config.in z/arch/i386/config.in --- 2.2.18pre14aa1/arch/i386/config.in Tue Oct 3 01:35:32 2000 +++ z/arch/i386/config.in Tue Oct 3 03:59:54 2000 @@ -16,7 +16,8 @@ 486/Cx486 CONFIG_M486 \ 586/K5/5x86/6x86 CONFIG_M586 \ Pentium/K6/TSC CONFIG_M586TSC \ - PPro/6x86MX CONFIG_M686" PPro + PPro/6x86MX/PII CONFIG_M686 \ + PIII/Xeon/Deschutes CONFIG_M686FX" PIII # # Define implied options from the CPU selection here # @@ -26,10 +27,11 @@ define_bool CONFIG_X86_BSWAP y define_bool CONFIG_X86_POPAD_OK y fi -if [ "$CONFIG_M686" = "y" -o "$CONFIG_M586TSC" = "y" ]; then +if [ "$CONFIG_M686FX" = "y" -o "$CONFIG_M686" = "y" \ + -o "$CONFIG_M586TSC" = "y" ]; then define_bool CONFIG_X86_TSC y fi -if [ "$CONFIG_M686" = "y" ]; then +if [ "$CONFIG_M686FX" = "y" -o "$CONFIG_M686" = "y" ]; then define_bool CONFIG_X86_GOOD_APIC y fi @@ -41,6 +43,9 @@ "1GB CONFIG_1GB \ 2GB CONFIG_2GB" 1GB +bool 'Disable the PII/PIII Serial Number at bootup' CONFIG_X86_PN_OFF +bool 'Enable PII/PIII Extended/Fast FPU save and restore support' CONFIG_X86_FX +bool 'Enable CPU Specific (MMX/MMX2) Optimization Functions' CONFIG_X86_CPU_OPTIMIZATIONS bool 'Math emulation' CONFIG_MATH_EMULATION bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR bool 'Symmetric multi-processing support' CONFIG_SMP diff -urN 2.2.18pre14aa1/arch/i386/defconfig z/arch/i386/defconfig --- 2.2.18pre14aa1/arch/i386/defconfig Mon Oct 2 22:28:07 2000 +++ z/arch/i386/defconfig Tue Oct 3 03:58:03 2000 @@ -21,11 +21,14 @@ CONFIG_X86_POPAD_OK=y CONFIG_X86_TSC=y CONFIG_X86_GOOD_APIC=y -CONFIG_1GB=y -# CONFIG_2GB is not set +CONFIG_X86_PN_OFF=y +CONFIG_X86_FX=y +CONFIG_X86_CPU_OPTIMIZATIONS=y # CONFIG_MATH_EMULATION is not set # CONFIG_MTRR is not set CONFIG_SMP=y +CONFIG_1GB=y +# CONFIG_2GB is not set # # Loadable module support diff -urN 2.2.18pre14aa1/arch/i386/kernel/head.S z/arch/i386/kernel/head.S --- 2.2.18pre14aa1/arch/i386/kernel/head.S Mon Jan 17 16:44:33 2000 +++ z/arch/i386/kernel/head.S Tue Oct 3 03:58:03 2000 @@ -14,7 +14,6 @@ #include #include - #define CL_MAGIC_ADDR 0x90020 #define CL_MAGIC 0xA33F #define CL_BASE_ADDR 0x90000 @@ -32,7 +31,8 @@ #define X86_HARD_MATH CPU_PARAMS+6 #define X86_CPUID CPU_PARAMS+8 #define X86_CAPABILITY CPU_PARAMS+12 -#define X86_VENDOR_ID CPU_PARAMS+16 +#define X86_MMU_CR4 CPU_PARAMS+16 +#define X86_VENDOR_ID CPU_PARAMS+20 /* * swapper_pg_dir is the main page directory, address 0x00101000 @@ -59,9 +59,8 @@ * NOTE! We have to correct for the fact that we're * not yet offset PAGE_OFFSET.. 
*/ -#define cr4_bits mmu_cr4_features-__PAGE_OFFSET movl %cr4,%eax # Turn on 4Mb pages - orl cr4_bits,%eax + orl X86_MMU_CR4-__PAGE_OFFSET,%eax movl %eax,%cr4 #endif /* diff -urN 2.2.18pre14aa1/arch/i386/kernel/i386_ksyms.c z/arch/i386/kernel/i386_ksyms.c --- 2.2.18pre14aa1/arch/i386/kernel/i386_ksyms.c Mon Oct 2 22:28:07 2000 +++ z/arch/i386/kernel/i386_ksyms.c Tue Oct 3 04:02:31 2000 @@ -122,3 +122,12 @@ #endif EXPORT_SYMBOL(rtc_lock); + +#ifdef CONFIG_X86_CPU_OPTIMIZATIONS +EXPORT_SYMBOL(best_memcpy); +EXPORT_SYMBOL(best_memset); +EXPORT_SYMBOL(best_copy_to_user); +EXPORT_SYMBOL(best_copy_from_user); +EXPORT_SYMBOL(__best_copy_to_user); +EXPORT_SYMBOL(__best_copy_from_user); +#endif diff -urN 2.2.18pre14aa1/arch/i386/kernel/process.c z/arch/i386/kernel/process.c --- 2.2.18pre14aa1/arch/i386/kernel/process.c Thu May 4 13:00:36 2000 +++ z/arch/i386/kernel/process.c Tue Oct 3 03:58:03 2000 @@ -39,6 +39,7 @@ #include #include #include +#include #ifdef CONFIG_MATH_EMULATION #include #endif @@ -535,7 +536,6 @@ * Forget coprocessor state.. */ clear_fpu(tsk); - tsk->used_math = 0; } void release_thread(struct task_struct *dead_task) @@ -570,6 +570,106 @@ } /* + * FPU state handling functions + */ + +int i387_hard_to_user ( struct user_i387_struct * user, + union i387_hard_union * hard) +{ +#ifdef CONFIG_X86_FX + int i, err = 0; + short *tmp, *tmp2; + union i387_hard_union hard2; +#else + int err = 0; +#endif + + if (!access_ok(VERIFY_WRITE, user, sizeof(*user))) + return -EFAULT; +#ifdef CONFIG_X86_FX + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { + hard2.fsave.cwd = 0xffff0000 | hard->fxsave.fxcwd; + hard2.fsave.swd = 0xffff0000 | hard->fxsave.fxswd; + hard2.fsave.twd = fputag_KNI_to_387(hard->fxsave.fxtwd); + hard2.fsave.fip = hard->fxsave.fxfip; + hard2.fsave.fcs = hard->fxsave.fxfcs; + hard2.fsave.foo = hard->fxsave.fxfoo; + hard2.fsave.fos = hard->fxsave.fxfos; + + tmp = (short *)&hard2.fsave.st_space[0]; + tmp2 = (short *)&hard->fxsave.st_space[0]; + + /* + * Transform the two layouts: + * (we do not mix 32-bit access with 16-bit access because + * thats suboptimal on PPros) + */ + + for (i = 0; i < 8; i++) { + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2 += 4; + } + err = copy_to_user((void *)(user),(&(hard2)), + sizeof(struct i387_hard_fsave)); + } else +#endif + err = copy_to_user((void *)(user), + (&(hard->fsave.cwd)), + sizeof(struct i387_hard_fsave)); + return err; +} + +int i387_user_to_hard (union i387_hard_union * hard, + struct user_i387_struct * user) +{ +#ifdef CONFIG_X86_FX + int i, err = 0; + short *tmp, *tmp2; + union i387_hard_union hard2; +#else + int err = 0; +#endif + + if (!access_ok(VERIFY_READ, user, sizeof(*user))) + return -EFAULT; +#ifdef CONFIG_X86_FX + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { + err = copy_from_user((&(hard2)),(void *)(user), + sizeof(struct i387_hard_fsave)); + hard->fxsave.fxcwd = hard2.fsave.cwd & 0xffff; + hard->fxsave.fxswd = hard2.fsave.swd & 0xffff; + hard->fxsave.fxtwd = fputag_387_to_KNI(hard2.fsave.twd); + hard->fxsave.fxfip = hard2.fsave.fip; + hard->fxsave.fxfcs = hard2.fsave.fcs & 0xffff; + hard->fxsave.fxfoo = hard2.fsave.foo; + hard->fxsave.fxfos = hard2.fsave.fos & 0xffff; + + tmp2 = (short *)&hard->fxsave.st_space[0]; + tmp = (short *)&hard2.fsave.st_space[0]; + + for (i = 0; i < 8; i++) { + *tmp2 = *tmp; tmp++; tmp2++; + *tmp2 = *tmp; tmp++; tmp2++; + *tmp2 = *tmp; tmp++; tmp2++; + *tmp2 = *tmp; tmp++; 
tmp2++; + *tmp2 = *tmp; tmp++; tmp2++; + *tmp2 = 0; tmp2++; + *tmp2 = 0; tmp2++; + *tmp2 = 0; tmp2++; + } + } else +#endif + err = copy_from_user((&(hard->fsave.cwd)), + (void *)(user), + sizeof(struct i387_hard_fsave)); + return err; +} + + +/* * Save a segment. */ #define savesegment(seg,value) \ @@ -605,6 +705,9 @@ unlazy_fpu(current); p->tss.i387 = current->tss.i387; + p->tss.x86_fpustate = current->tss.x86_fpustate; + p->tss.mmx_reg_space = NULL; + p->tss.xmm_reg_space = NULL; return 0; } @@ -614,13 +717,43 @@ */ int dump_fpu (struct pt_regs * regs, struct user_i387_struct* fpu) { +#ifdef CONFIG_X86_FX + int fpvalid, i; + short *tmp, *tmp2; + struct task_struct *tsk = current; + union i387_hard_union *hard; +#else int fpvalid; struct task_struct *tsk = current; - +#endif fpvalid = tsk->used_math; if (fpvalid) { unlazy_fpu(tsk); - memcpy(fpu,&tsk->tss.i387.hard,sizeof(*fpu)); +#ifdef CONFIG_X86_FX + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { + hard = &tsk->tss.i387.hard; + + fpu->cwd = 0xffff0000 | hard->fxsave.fxcwd; + fpu->swd = 0xffff0000 | hard->fxsave.fxswd; + fpu->twd = fputag_KNI_to_387(hard->fxsave.fxtwd); + fpu->fip = hard->fxsave.fxfip; + fpu->fcs = hard->fxsave.fxfcs; + fpu->foo = hard->fxsave.fxfoo; + fpu->fos = hard->fxsave.fxfos; + + tmp = (short *)&fpu->st_space[0]; + tmp2 = (short *)&hard->fxsave.st_space[0]; + + for (i = 0; i < 8; i++) { + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2+=4; + } + } else +#endif + memcpy(fpu,&tsk->tss.i387.hard.fsave,sizeof(*fpu)); } return fpvalid; @@ -680,8 +813,8 @@ /* * switch_to(x,yn) should switch tasks from x to y. * - * We fsave/fwait so that an exception goes off at the right time - * (as a call from the fsave or fwait in effect) rather than to + * We fpu_save so that an exception goes off at the right time + * (as a call from the f*save or fwait in effect) rather than to * the wrong process. Lazy FP saving no longer makes any sense * with modern CPU's, and this simplifies a lot of things (SMP * and UP become the same). diff -urN 2.2.18pre14aa1/arch/i386/kernel/ptrace.c z/arch/i386/kernel/ptrace.c --- 2.2.18pre14aa1/arch/i386/kernel/ptrace.c Tue Oct 3 01:35:32 2000 +++ z/arch/i386/kernel/ptrace.c Tue Oct 3 03:58:03 2000 @@ -19,6 +19,7 @@ #include #include #include +#include /* * does not yet catch signals sent when the child dies. @@ -660,6 +661,9 @@ }; case PTRACE_GETFPREGS: { /* Get the child FPU state. */ + /* + * user-space expects an 'old-style' FPU dump. + */ if (!access_ok(VERIFY_WRITE, (unsigned *)data, sizeof(struct user_i387_struct))) { @@ -669,15 +673,17 @@ ret = 0; if ( !child->used_math ) { /* Simulate an empty FPU. 
*/ - child->tss.i387.hard.cwd = 0xffff037f; - child->tss.i387.hard.swd = 0xffff0000; - child->tss.i387.hard.twd = 0xffffffff; + i387_set_cwd(child->tss.i387.hard, 0x037f); + i387_set_swd(child->tss.i387.hard, 0x0000); + i387_set_twd(child->tss.i387.hard, 0xffff); } #ifdef CONFIG_MATH_EMULATION if ( boot_cpu_data.hard_math ) { #endif - __copy_to_user((void *)data, &child->tss.i387.hard, - sizeof(struct user_i387_struct)); + i387_hard_to_user( + (struct user_i387_struct *)data, + &child->tss.i387.hard + ); #ifdef CONFIG_MATH_EMULATION } else { save_i387_soft(&child->tss.i387.soft, @@ -695,11 +701,14 @@ goto out; } child->used_math = 1; + child->tss.x86_fpustate |= X86_FPUSTATE_FPU_SAVED; #ifdef CONFIG_MATH_EMULATION if ( boot_cpu_data.hard_math ) { #endif - __copy_from_user(&child->tss.i387.hard, (void *)data, - sizeof(struct user_i387_struct)); + i387_user_to_hard( + &child->tss.i387.hard, + (struct user_i387_struct *)data + ); #ifdef CONFIG_MATH_EMULATION } else { restore_i387_soft(&child->tss.i387.soft, diff -urN 2.2.18pre14aa1/arch/i386/kernel/setup.c z/arch/i386/kernel/setup.c --- 2.2.18pre14aa1/arch/i386/kernel/setup.c Tue Oct 3 01:35:32 2000 +++ z/arch/i386/kernel/setup.c Tue Oct 3 04:01:00 2000 @@ -115,6 +115,17 @@ extern int _etext, _edata, _end; extern unsigned long cpu_khz; +#ifdef CONFIG_X86_PN_OFF +int disable_x86_serial_nr = 1; +#else +int disable_x86_serial_nr = 0; +#endif + +/* + * For the various FPU using kernel accelerator routines + */ +spinlock_t kern_fpu_lock = SPIN_LOCK_UNLOCKED; + /* * This is set up by the setup-routine at boot-time */ @@ -947,7 +958,7 @@ /* It should be possible for the user to override this. */ if(c->cpuid_level > 0 && - (c->x86_vendor == X86_VENDOR_INTEL || c->x86_vendor == X86_VENDOR_TRANSMETA) && + c->x86_vendor == X86_VENDOR_TRANSMETA && c->x86_capability&(1<<18)) { /* Disable processor serial number */ unsigned long lo,hi; @@ -1078,7 +1089,15 @@ } cyrix_model(&boot_cpu_data); } - + +/* + * Setup function for serial number stuff + */ + +__initfunc(void x86_serial_nr_setup(char *str, int *ints)) +{ + disable_x86_serial_nr = !disable_x86_serial_nr; +} static char *cpu_vendor_names[] __initdata = { diff -urN 2.2.18pre14aa1/arch/i386/kernel/signal.c z/arch/i386/kernel/signal.c --- 2.2.18pre14aa1/arch/i386/kernel/signal.c Sun Apr 2 21:07:48 2000 +++ z/arch/i386/kernel/signal.c Tue Oct 3 03:58:03 2000 @@ -21,6 +21,7 @@ #include #include #include +#include #define DEBUG_SIG 0 @@ -153,9 +154,24 @@ static inline int restore_i387_hard(struct _fpstate *buf) { + int err = 0; + unsigned int tmp; struct task_struct *tsk = current; - clear_fpu(tsk); - return __copy_from_user(&tsk->tss.i387.hard, buf, sizeof(*buf)); + + /* make sure the base fpu info is in the task struct */ + err = i387_user_to_hard(&tsk->tss.i387.hard, + (struct user_i387_struct *)buf); + err |= get_user(tmp, &buf->status); + if(!err) { + i387_set_swd(tsk->tss.i387.hard, tmp); + /* + * We got a valid FPU frame back, so make sure we show as + * having a valid FPU state. 
+ */ + tsk->used_math = 1; + tsk->tss.x86_fpustate |= X86_FPUSTATE_FPU_SAVED; + } + return err; } static inline int restore_i387(struct _fpstate *buf) @@ -169,7 +185,6 @@ else err = restore_i387_soft(¤t->tss.i387.soft, buf); #endif - current->used_math = 1; return err; } @@ -305,11 +320,23 @@ static inline int save_i387_hard(struct _fpstate * buf) { + int err = 0; + unsigned long status; struct task_struct *tsk = current; unlazy_fpu(tsk); - tsk->tss.i387.hard.status = tsk->tss.i387.hard.swd; - if (__copy_to_user(buf, &tsk->tss.i387.hard, sizeof(*buf))) + /* + * Clear out the FPU state flags so that if we don't get a valid + * FPU frame back from this signal, then we will cause an fninit + * on the next FPU usage. + */ + tsk->used_math = 0; + tsk->tss.x86_fpustate &= ~X86_FPUSTATE_FPU_SAVED; + err = i387_hard_to_user((struct user_i387_struct *)buf, + &tsk->tss.i387.hard); + i387_get_swd(tsk->tss.i387.hard, status); + err |= put_user(status, &buf->status); + if (err) return -1; return 1; } @@ -319,16 +346,11 @@ if (!current->used_math) return 0; - /* This will cause a "finit" to be triggered by the next - attempted FPU operation by the 'current' process. - */ - current->used_math = 0; - #ifndef CONFIG_MATH_EMULATION return save_i387_hard(buf); #else - return boot_cpu_data.hard_math ? save_i387_hard(buf) - : save_i387_soft(¤t->tss.i387.soft, buf); + return (boot_cpu_data.hard_math ? save_i387_hard(buf) + : save_i387_soft(¤t->tss.i387.soft, buf)); #endif } diff -urN 2.2.18pre14aa1/arch/i386/kernel/smp.c z/arch/i386/kernel/smp.c --- 2.2.18pre14aa1/arch/i386/kernel/smp.c Tue Oct 3 01:35:32 2000 +++ z/arch/i386/kernel/smp.c Tue Oct 3 03:58:03 2000 @@ -1096,6 +1096,8 @@ */ int __init start_secondary(void *unused) { + disable_serial_nr(); + load_default_mxcsr(); /* * Dont put anything before smp_callin(), SMP * booting is too fragile that we want to limit the diff -urN 2.2.18pre14aa1/arch/i386/kernel/traps.c z/arch/i386/kernel/traps.c --- 2.2.18pre14aa1/arch/i386/kernel/traps.c Mon Oct 2 22:28:07 2000 +++ z/arch/i386/kernel/traps.c Tue Oct 3 03:58:03 2000 @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -427,6 +428,7 @@ */ void math_error(void) { + unsigned long irqflags; struct task_struct * task; /* @@ -434,7 +436,12 @@ * (this will also clear the error) */ task = current; - save_fpu(task); + spin_lock_irqsave(&kern_fpu_lock, irqflags); + i387_save_hard(task->tss.i387); + task->used_math=0; + task->tss.x86_fpustate=0; + stts(); + spin_unlock_irqrestore(&kern_fpu_lock, irqflags); task->tss.trap_no = 16; task->tss.error_code = 0; force_sig(SIGFPE, task); @@ -464,18 +471,45 @@ */ asmlinkage void math_state_restore(struct pt_regs regs) { - __asm__ __volatile__("clts"); /* Allow maths ops (or we recurse) */ - if(current->used_math) - __asm__("frstor %0": :"m" (current->tss.i387)); - else - { + unsigned long irqflags; + /* + * If we have either of the kernel FPU use states set in the + * fpustate variable, then this will be a kernel math trap. + * Otherwise, this is userspace trying to use the FPU. 
+ */ + clts(); /* Allow maths ops (or we recurse) */ + spin_lock_irqsave(&kern_fpu_lock, irqflags); + current->tss.x86_fpustate |= X86_FPUSTATE_FPU_ENABLED; /* make switch_to() work */ + if (current->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY) { + if(current->tss.x86_fpustate & X86_FPUSTATE_KERN_MMX) { + __asm__("movq 0x00(%0), %%mm0\n\t" + "movq 0x08(%0), %%mm1\n\t" + "movq 0x10(%0), %%mm2\n\t" + "movq 0x18(%0), %%mm3\n\t" + :: "r" (current->tss.mmx_reg_space)); + } + if(current->tss.x86_fpustate & X86_FPUSTATE_KERN_KNI) { + __asm__("movups 0x00(%0), %%xmm0\n\t" + "movups 0x10(%0), %%xmm1\n\t" + "movups 0x20(%0), %%xmm2\n\t" + "movups 0x30(%0), %%xmm3\n\t" + :: "r" (current->tss.xmm_reg_space)); + } + } else if(current->tss.x86_fpustate & X86_FPUSTATE_FPU_SAVED) { + /* + * Restoring previously saved user state. + */ + i387_restore_hard(current->tss.i387); + current->tss.x86_fpustate &= ~X86_FPUSTATE_FPU_SAVED; + } else { /* * Our first FPU usage, clean the chip. */ __asm__("fninit"); + load_default_mxcsr(); current->used_math = 1; } - current->flags|=PF_USEDFPU; /* So we fnsave on switch_to() */ + spin_unlock_irqrestore(&kern_fpu_lock, irqflags); } #ifndef CONFIG_MATH_EMULATION diff -urN 2.2.18pre14aa1/arch/i386/lib/Makefile z/arch/i386/lib/Makefile --- 2.2.18pre14aa1/arch/i386/lib/Makefile Mon Jan 17 16:44:33 2000 +++ z/arch/i386/lib/Makefile Tue Oct 3 03:58:03 2000 @@ -9,4 +9,8 @@ L_OBJS = checksum.o old-checksum.o semaphore.o delay.o \ usercopy.o getuser.o putuser.o +ifeq ($(CONFIG_X86_CPU_OPTIMIZATIONS),y) + L_OBJS += best_function.o simd.o +endif + include $(TOPDIR)/Rules.make diff -urN 2.2.18pre14aa1/arch/i386/lib/best_function.c z/arch/i386/lib/best_function.c --- 2.2.18pre14aa1/arch/i386/lib/best_function.c Thu Jan 1 01:00:00 1970 +++ z/arch/i386/lib/best_function.c Tue Oct 3 03:58:03 2000 @@ -0,0 +1,196 @@ +/* + * SIMD functions. These replace the functions in asm-i386/string.h + * whenever it makes sense. These also un-inline those functions. 
+ * + * Copyright 1999, Doug Ledford + * + * These functions are simple and trivial, consider them to be + * public domain + */ + +#include +#include +#include +#include + +/* + * We declare our accelerator functions here since this is the only place + * that needs the declarations which makes a header file a pain to deal + * with + */ +extern void * kni_memcpy(void *, const void *, size_t); +extern void * kni_memset(void *, char, size_t); +extern unsigned long kni_copy_to_user(void *, const void *, unsigned long); +extern unsigned long kni_copy_from_user(void *, const void *, unsigned long); +extern unsigned long __kni_copy_to_user_nocheck(void *, const void *, unsigned long); +extern unsigned long __kni_copy_from_user_nocheck(void *, const void *, unsigned long); + +static void * best_memcpy_final(void *, const void *, size_t); +static void * best_memset_final(void *, char, size_t); +static unsigned long best_copy_to_user_final(void *, const void *, unsigned long); +static unsigned long best_copy_from_user_final(void *, const void *, unsigned long); +static unsigned long __best_copy_to_user_final(void *, const void *, unsigned long); +static unsigned long __best_copy_from_user_final(void *, const void *, unsigned long); + +void * best_memcpy(void * to, const void * from, size_t n) +{ + int BAR = (int)__builtin_return_address(0); + int *caller = (int *)BAR - 1; + if(boot_cpu_data.enable_fixups) { + if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) && + (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) { + *caller = (int)kni_memcpy - BAR; + return(kni_memcpy(to, from, n)); + } else { + *caller = (int)best_memcpy_final - BAR; + return(__memcpy(to, from, n)); + } + } else { + return(__memcpy(to, from, n)); + } +} + +static void * best_memcpy_final(void * to, const void * from, size_t n) +{ + return(__memcpy(to, from, n)); +} + +void * best_memset(void * s, char c, size_t count) +{ + int BAR = (int)__builtin_return_address(0); + int *caller = (int *)BAR - 1; + if(boot_cpu_data.enable_fixups) { + if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) && + (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) { + *caller = (int)kni_memset - BAR; + return(kni_memset(s, c, count)); + } else { + *caller = (int)best_memset_final - BAR; + return(__memset_generic(s, c, count)); + } + } else { + return(__memset_generic(s, c, count)); + } +} + +static void * best_memset_final(void * s, char c, size_t count) +{ + return(__memset_generic(s, c, count)); +} + +unsigned long +best_copy_to_user(void *to, const void *from, unsigned long n) +{ + int BAR = (int)__builtin_return_address(0); + int *caller = (int *)BAR - 1; + if(boot_cpu_data.enable_fixups) { + if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) && + (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) { + *caller = (int)kni_copy_to_user - BAR; + return(kni_copy_to_user(to, from, n)); + } else { + *caller = (int)best_copy_to_user_final - BAR; + return(best_copy_to_user_final(to, from, n)); + } + } else { + if (access_ok(VERIFY_WRITE, to, n)) { + __copy_user(to,from,n); + } + return n; + } +} + +static unsigned long +best_copy_to_user_final(void *to, const void *from, unsigned long n) +{ + if (access_ok(VERIFY_WRITE, to, n)) { + __copy_user(to,from,n); + } + return n; +} + +unsigned long +best_copy_from_user(void *to, const void *from, unsigned long n) +{ + int BAR = (int)__builtin_return_address(0); + int *caller = (int *)BAR - 1; + if(boot_cpu_data.enable_fixups) { + if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) && + 
(boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) { + *caller = (int)kni_copy_from_user - BAR; + return(kni_copy_from_user(to, from, n)); + } else { + *caller = (int)best_copy_from_user_final - BAR; + return(best_copy_from_user_final(to, from, n)); + } + } else { + if (access_ok(VERIFY_READ, from, n)) { + __copy_user_zeroing(to,from,n); + } + return n; + } +} + +static unsigned long +best_copy_from_user_final(void *to, const void *from, unsigned long n) +{ + if (access_ok(VERIFY_READ, from, n)) { + __copy_user_zeroing(to,from,n); + } + return n; +} + +unsigned long +__best_copy_to_user(void *to, const void *from, unsigned long n) +{ + int BAR = (int)__builtin_return_address(0); + int *caller = (int *)BAR - 1; + if(boot_cpu_data.enable_fixups) { + if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) && + (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) { + *caller = (int)__kni_copy_to_user_nocheck - BAR; + return(__kni_copy_to_user_nocheck(to, from, n)); + } else { + *caller = (int)__best_copy_to_user_final - BAR; + return(__best_copy_to_user_final(to, from, n)); + } + } else { + __copy_user(to,from,n); + return n; + } +} + +static unsigned long +__best_copy_to_user_final(void *to, const void *from, unsigned long n) +{ + __copy_user(to,from,n); + return n; +} + +unsigned long +__best_copy_from_user(void *to, const void *from, unsigned long n) +{ + int BAR = (int)__builtin_return_address(0); + int *caller = (int *)BAR - 1; + if(boot_cpu_data.enable_fixups) { + if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) && + (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) { + *caller = (int)__kni_copy_from_user_nocheck - BAR; + return(__kni_copy_from_user_nocheck(to, from, n)); + } else { + *caller = (int)__best_copy_from_user_final - BAR; + return(__best_copy_from_user_final(to, from, n)); + } + } else { + __copy_user_zeroing(to,from,n); + return n; + } +} + +static unsigned long +__best_copy_from_user_final(void *to, const void *from, unsigned long n) +{ + __copy_user_zeroing(to,from,n); + return n; +} + diff -urN 2.2.18pre14aa1/arch/i386/lib/simd.c z/arch/i386/lib/simd.c --- 2.2.18pre14aa1/arch/i386/lib/simd.c Thu Jan 1 01:00:00 1970 +++ z/arch/i386/lib/simd.c Tue Oct 3 03:58:03 2000 @@ -0,0 +1,380 @@ +/* + * SIMD functions. These replace the functions in asm-i386/string.h + * whenever it makes sense. These also un-inline those functions. + * + * Copyright 1999, Doug Ledford + * + * These functions are simple and trivial, consider them to be + * public domain + */ + +#include +#include +#include +#include +#include + +extern void * kni_memcpy(void * to, const void * from, size_t n) +{ + void *ret=to; + size_t size; + char xmm_space[32]; + int recursive; + + kernel_take_fpu_kni(recursive,2,&xmm_space[0],NULL); + + /* + * Align the destination on a 16byte boundary. + * The source doesn't have to be aligned. + */ + if ( (unsigned long)to & 0xf ) { + size = 0x10 - ((unsigned long)to & 0xf); + __memcpy(to, from, size); + n -= size; + from += size; + to += size; + } + /* + * Prefetch the first two cachelines now. + */ + __asm__ __volatile__("prefetchnta 0x00(%0)\n\t" + "prefetchnta 0x20(%0)\n\t" + : + : "r" (from)); + /* + * Copy 32 bytes at a time. The single unroll is good + * for a 30% performance boost in the copy. Additional + * unrolls are not productive. We are guaranteed to + * have at least 32 bytes of data to copy since the + * macro in string.h doesn't call into this function + * with less than 64 bytes of copy and we lost < 32 + * bytes to alignment earlier. 
+ */ + while (n >= 0x20) { + __asm__ __volatile__( + "movups 0x00(%0),%%xmm0\n\t" + "movups 0x10(%0),%%xmm1\n\t" + "movntps %%xmm0,0x00(%1)\n\t" + "movntps %%xmm1,0x10(%1)\n\t" + : + : "r" (from), "r" (to) + : "memory"); + from += 0x20; + /* + * Note: Intermixing the prefetch at *exactly* this point + * in time has been shown to be the fastest possible. + * Timing these prefetch instructions is a complete black + * art with nothing but trial and error showing the way. + * To that extent, this optimum version was found by using + * a userland version of this routine that we clocked for + * lots of runs. We then fiddled with ordering until we + * settled on our highest speen routines. So, the long + * and short of this is, don't mess with instruction ordering + * here or suffer permance penalties you will. + */ + __asm__ __volatile__( + "prefetchnta 0x20(%0)\n\t" + : + : "r" (from)); + to += 0x20; + n -= 0x20; + } + if(n & 0x10) { + __asm__ __volatile__("movups 0x00(%0),%%xmm0\n\t" + "movntps %%xmm0,0x00(%1)\n\t" + : + : "r" (from), "r" (to) + : "memory"); + from += 0x10; + to += 0x10; + n -= 0x10; + } + if (n) { + __memcpy(to, from, n); + } + SFENCE(); + kernel_release_fpu_kni(recursive,2,&xmm_space[0],NULL); + return(ret); +} + +extern void * kni_memset(void * s, char c, size_t count) +{ + size_t size; + void *ret=s; + char xmm_space[16]; + int recursive; + + kernel_take_fpu_kni(recursive,1,&xmm_space[0],NULL); + /* + * align the destination on a 16 byte boundary + */ + size = (0x10 - ((unsigned long)s & 0xf)); + if(size != 0x10) { + __memset_generic(s, c, size + 0x10); + } else { + __memset_generic(s, c, size); + } + /* + * We aligned the destination and also made sure there was at + * least 16 bytes of memory already set so that we could simply + * load that data into our XMM register to initialize it. + */ + __asm__ __volatile__("movups (%0), %%xmm0" + : + : "r" (s)); + count -= size; + s += size; + + /* + * Do the copy by plopping out the register to memory. + * Note: Unrolling this was *totally* unproductive. My benchmark + * showed that one or two plops per iteration produced the same + * speed to within .06 MByte/s of speed. Considering that the + * routine benchmarked at over 3000 MByte/s, .06 is not statistically + * significant and only doing one drop per loop simplifies + * overhead of book keeping. + */ + while(count & ~0xf) { + __asm__ __volatile__("movntps %%xmm0,0x00(%0)\n\t" + : + : "r" (s)); + s += 0x10; + count -= 0x10; + } + /* + * Catch any tailings... 
+ */ + if(count) { + __memset_generic(s, c, count); + } + SFENCE(); + kernel_release_fpu_kni(recursive,1,&xmm_space[0],NULL); + return(ret); +} + +#define __kni_copy_to_user(to,from,size) \ +do { \ + int __d0, __d1; \ + __asm__ __volatile__( \ + " prefetchnta 0x00(%2)\n" \ + " prefetchnta 0x20(%2)\n" \ + " jmp 200f\n" \ + "100: movups 0x00(%2),%%xmm0\n" \ + " movups 0x10(%2),%%xmm1\n" \ + "1: movntps %%xmm0,0x00(%1)\n" \ + "2: movntps %%xmm1,0x10(%1)\n" \ + " addl $0x20,%2\n" \ + " prefetchnta 0x20(%2)\n" \ + " addl $0x20,%1\n" \ + " subl $0x20,%0\n" \ + "200: cmpl $0x1f,%0\n" \ + " ja 100b\n" \ + " cmpl $0xf,%0\n" \ + " jbe 300f\n" \ + " movups 0x00(%2),%%xmm0\n" \ + "3: movntps %%xmm0,0x00(%1)\n" \ + " addl $0x10,%2\n" \ + " addl $0x10,%1\n" \ + " subl $0x10,%0\n" \ + "300:\n" \ + ".section .fixup,\"ax\"\n" \ + "6: jmp 300b\n" \ + "7: addl $0x10,%1\n" \ + " addl $0x10,%2\n" \ + " subl $0x10,%0\n" \ + " jmp 300b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 1b,6b\n" \ + " .long 2b,7b\n" \ + " .long 3b,6b\n" \ + ".previous" \ + : "=&c"(size), "=r" (__d0), "=r" (__d1) \ + : "0"(size), "1"(to), "2"(from) \ + : "memory"); \ +} while (0) + +#define __kni_copy_from_user(to,from,size) \ +do { \ + int __d0, __d1, tmp; \ + __asm__ __volatile__( \ + " prefetchnta 0x00(%2)\n" \ + " prefetchnta 0x20(%2)\n" \ + " jmp 100f\n" \ + "1: movups 0x00(%2),%%xmm0\n" \ + "2: movups 0x10(%2),%%xmm1\n" \ + " movntps %%xmm0,0x00(%1)\n" \ + " movntps %%xmm1,0x10(%1)\n" \ + " addl $0x20,%2\n" \ + " prefetchnta 0x20(%2)\n" \ + " addl $0x20,%1\n" \ + " subl $0x20,%0\n" \ + "100: cmpl $0x1f,%0\n" \ + " ja 1b\n" \ + " cmpl $0xf,%0\n" \ + " jbe 200f\n" \ + "3: movups 0x00(%2),%%xmm0\n" \ + " movntps %%xmm0,0x00(%1)\n" \ + " addl $0x10,%2\n" \ + " addl $0x10,%1\n" \ + " subl $0x10,%0\n" \ + "200:\n" \ + ".section .fixup,\"ax\"\n" \ + "6: addl $0x10,%1\n" \ + " subl $0x10,%0\n" \ + "7: xorps %%xmm0,%%xmm0\n" \ + " movl %0,%3\n" \ + " jmp 500f\n" \ + "400: movntps %%xmm0,0x00(%1)\n" \ + " addl $0x10,%1\n" \ + " subl $0x10,%3\n" \ + "500: cmpl $0x0f,%3\n" \ + " ja 400b\n" \ + " xorl %2,%2\n" \ + "700: testl %3,%3\n" \ + " je 200b\n" \ + " movb %2,(%1)\n" \ + " inc %1\n" \ + " dec %3\n" \ + " jmp 700b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 1b,7b\n" \ + " .long 2b,6b\n" \ + " .long 3b,7b\n" \ + ".previous" \ + : "=&c"(size), "=r" (__d0), "=q" (__d1), "=r"(tmp) \ + : "0"(size), "1"(to), "2"(from) \ + : "memory"); \ +} while (0) + + +unsigned long +__kni_copy_to_user_nocheck(void *to, const void *from, unsigned long n) +{ + unsigned long size, count; + char xmm_space[32]; + char xmm_reg_space[64]; + int recursive; + + if((unsigned long)to & 0xf) { + count = size = 0x10 - ((unsigned long)to & 0xf); + __copy_user(to,from,size); + if(size) + return(n + size); + n -= count; + to += count; + from += count; + } + count = n; + kernel_take_fpu_kni(recursive,2,&xmm_space[0],&xmm_reg_space[0]); + __kni_copy_to_user(to,from,n); + if(n && n < 0x10) { + to += (count - n); + from += (count - n); + __copy_user(to,from,n); + } + SFENCE(); + kernel_release_fpu_kni(recursive,2,&xmm_space[0],&xmm_reg_space[0]); + return n; +} + +unsigned long +__kni_copy_from_user_nocheck(void *to, const void *from, unsigned long n) +{ + unsigned long size, count; + char xmm_space[32]; + char xmm_reg_space[64]; + int recursive; + + if((unsigned long)to & 0xf) { + count = size = 0x10 - ((unsigned long)to & 0xf); + __copy_user_zeroing(to,from,size); + if(size) + return(n + size); + 
n -= count; + to += count; + from += count; + } + count = n; + kernel_take_fpu_kni(recursive,2,&xmm_space[0],&xmm_reg_space[0]); + __kni_copy_from_user(to,from,n); + if(n && n < 0x10) { + to += (count - n); + from += (count - n); + __copy_user_zeroing(to,from,n); + } + SFENCE(); + kernel_release_fpu_kni(recursive,2,&xmm_space[0],&xmm_reg_space[0]); + return n; +} + + + +unsigned long +kni_copy_to_user(void *to, const void *from, unsigned long n) +{ + unsigned long size, count; + char xmm_space[32]; + char xmm_reg_space[64]; + int recursive; + + if (access_ok(VERIFY_WRITE, to, n)) { + if((unsigned long)to & 0xf) { + count = size = 0x10 - ((unsigned long)to & 0xf); + __copy_user(to,from,size); + if(size) + return(n + size); + n -= count; + to += count; + from += count; + } + count = n; + kernel_take_fpu_kni(recursive,2,&xmm_space[0],&xmm_reg_space[0]); + __kni_copy_to_user(to,from,n); + if(n && n < 0x10) { + to += (count - n); + from += (count - n); + __copy_user(to,from,n); + } + SFENCE(); + kernel_release_fpu_kni(recursive,2,&xmm_space[0],&xmm_reg_space[0]); + } + return n; +} + +unsigned long +kni_copy_from_user(void *to, const void *from, unsigned long n) +{ + unsigned long size, count; + char xmm_space[32]; + char xmm_reg_space[64]; + int recursive; + + if (access_ok(VERIFY_READ, from, n)) { + if((unsigned long)to & 0xf) { + count = size = 0x10 - ((unsigned long)to & 0xf); + __copy_user_zeroing(to,from,size); + if(size) + return(n + size); + n -= count; + to += count; + from += count; + } + count = n; + kernel_take_fpu_kni(recursive,2,&xmm_space[0],&xmm_reg_space[0]); + __kni_copy_from_user(to,from,n); + if(n && n < 0x10) { + to += (count - n); + from += (count - n); + __copy_user_zeroing(to,from,n); + } + SFENCE(); + kernel_release_fpu_kni(recursive,2,&xmm_space[0],&xmm_reg_space[0]); + } + return n; +} + + diff -urN 2.2.18pre14aa1/arch/i386/mm/init.c z/arch/i386/mm/init.c --- 2.2.18pre14aa1/arch/i386/mm/init.c Tue Oct 3 01:35:32 2000 +++ z/arch/i386/mm/init.c Tue Oct 3 03:58:03 2000 @@ -193,34 +193,6 @@ extern char _text, _etext, _edata, __bss_start, _end; extern char __init_begin, __init_end; -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ -#define X86_CR4_DE 0x0008 /* enable debugging extensions */ -#define X86_CR4_PSE 0x0010 /* enable page size extensions */ -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ -#define X86_CR4_MCE 0x0040 /* Machine check enable */ -#define X86_CR4_PGE 0x0080 /* enable global pages */ -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ - -/* - * Save the cr4 feature set we're using (ie - * Pentium 4MB enable and PPro Global page - * enable), so that any CPU's that boot up - * after us can get the correct flags. 
- */ -unsigned long mmu_cr4_features __initdata = 0; - -static inline void set_in_cr4(unsigned long mask) -{ - mmu_cr4_features |= mask; - __asm__("movl %%cr4,%%eax\n\t" - "orl %0,%%eax\n\t" - "movl %%eax,%%cr4\n" - : : "irg" (mask) - :"ax"); -} - /* * allocate page table(s) for compile-time fixed mappings */ diff -urN 2.2.18pre14aa1/include/asm-i386/bugs.h z/include/asm-i386/bugs.h --- 2.2.18pre14aa1/include/asm-i386/bugs.h Mon Oct 2 22:28:15 2000 +++ z/include/asm-i386/bugs.h Tue Oct 3 03:58:03 2000 @@ -18,6 +18,7 @@ */ #include +#include #include #include @@ -69,6 +70,45 @@ #endif return; } +#ifdef CONFIG_X86_FX + /* + * If we got so far we can safely turn on FXSAVE/FXRESTORE, + * but make sure we are 16-byte aligned first. + */ + if (offsetof(struct task_struct, tss.i387.hard.fxsave.fxcwd) & 15) { + /* + * This triggers a link-time error if we manage to + * break alignment somehow. + */ + extern void __buggy_fxsr_alignment(void); + + __buggy_fxsr_alignment(); + } + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { + printk("Enabling extended fast FPU save and restore..."); + set_in_cr4(X86_CR4_OSFXSR); + printk("done.\n"); + } + /* + * Note, Katmai instructions are enabled as soon as you start + * using the FXSAVE/RESTORE stuff. This setting only + * indicates support for the masked/unmasked exceptions on + * the new PIII cpus. We don't have an Exception 16 handler + * for this yet, but we set this bit anyway. It'll kill us + * the first time we take an umasked KNI exception, but since + * no userland apps currently use KNI, it isn't an issue yet. + * We should have the handler added by then. + */ + if (boot_cpu_data.x86_capability & X86_FEATURE_XMM) { + printk("Not enabling KNI unmasked exception support\n"); + printk("Exception 19 error handler not integrated yet\n"); +#if 0 + set_in_cr4(X86_CR4_OSXMMEXCPT); + printk("done.\n"); +#endif + } +#endif + disable_serial_nr(); if (mca_pentium_flag) { /* The IBM Model 95 machines with pentiums lock up on * fpu test, so we avoid it. 
All pentiums have inbuilt @@ -117,23 +157,23 @@ return; if (!ignore_irq13) { printk("OK, FPU using old IRQ 13 error reporting\n"); - return; + } else { + __asm__("fninit\n\t" + "fldl %1\n\t" + "fdivl %2\n\t" + "fmull %2\n\t" + "fldl %1\n\t" + "fsubp %%st,%%st(1)\n\t" + "fistpl %0\n\t" + "fwait\n\t" + "fninit" + : "=m" (*&boot_cpu_data.fdiv_bug) + : "m" (*&x), "m" (*&y)); + if (!boot_cpu_data.fdiv_bug) + printk("OK, FPU using exception 16 error reporting.\n"); + else + printk("Hmm, FPU using exception 16 error reporting with FDIV bug.\n"); } - __asm__("fninit\n\t" - "fldl %1\n\t" - "fdivl %2\n\t" - "fmull %2\n\t" - "fldl %1\n\t" - "fsubp %%st,%%st(1)\n\t" - "fistpl %0\n\t" - "fwait\n\t" - "fninit" - : "=m" (*&boot_cpu_data.fdiv_bug) - : "m" (*&x), "m" (*&y)); - if (!boot_cpu_data.fdiv_bug) - printk("OK, FPU using exception 16 error reporting.\n"); - else - printk("Hmm, FPU using exception 16 error reporting with FDIV bug.\n"); } __initfunc(static void check_hlt(void)) @@ -419,5 +459,7 @@ check_amd_k6(); check_pentium_f00f(); check_cyrix_coma(); + boot_cpu_data.enable_fixups = 1; /* should be safe to use MMX/MMX2 */ + /* kernel functions now */ system_utsname.machine[1] = '0' + boot_cpu_data.x86; } diff -urN 2.2.18pre14aa1/include/asm-i386/i387.h z/include/asm-i386/i387.h --- 2.2.18pre14aa1/include/asm-i386/i387.h Thu Jan 1 01:00:00 1970 +++ z/include/asm-i386/i387.h Tue Oct 3 03:58:03 2000 @@ -0,0 +1,449 @@ +/* + * include/asm-i386/i387.h + * + * Copyright (c) 1999 Doug Ledford + * + * Made from various code bits pulled from other files + * in order to put things together in a way that made + * sense. + * + * FX/FPU support: + * Copyright (c) 1999 Ingo Molnar , + * Gabriel Paubert + */ +#include + +#ifndef __ASM_I386_I387_H +#define __ASM_I386_I387_H + +extern int i387_hard_to_user ( struct user_i387_struct * user, + union i387_hard_union * hard); +extern int i387_user_to_hard ( union i387_hard_union * hard, + struct user_i387_struct * user); + +/* + * Fill out the reserved bits, treat it as an fsave struct since the + * union makes this work for both fsave and fxsave structs. 
+ */ +#ifdef CONFIG_X86_FX + +#define i387_save_hard(x) \ +do { \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + __asm__ __volatile__("fxsave %0" \ + : "=m" ((x).hard.fxsave.fxcwd)); \ + } else { \ + __asm__ __volatile__("fnsave %0; fwait;" \ + : "=m" ((x).hard.fsave.cwd)); \ + } \ +} while(0) + +#define i387_restore_hard(x) \ +do { \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + __asm__ __volatile__("fxrstor %0" \ + : \ + : "m" ((x).hard.fxsave.fxcwd)); \ + } else { \ + __asm__ __volatile__("frstor %0" \ + : \ + :"m" ((x).hard.fsave.cwd)); \ + } \ +} while(0) + +#define i387_set_cwd(x,v) \ +do { \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + (x).fxsave.fxcwd = (short)(v); \ + } else { \ + (x).fsave.cwd = ((long)(v) | 0xffff0000); \ + } \ +} while(0) + +#define i387_get_swd(x,v) \ +do { \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + (v) = (unsigned long) (x).fxsave.fxswd; \ + } else { \ + (v) = (unsigned long) (x).fsave.swd; \ + } \ +} while(0) + +#define i387_set_swd(x,v) \ +do { \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + (x).fxsave.fxswd = (short)(v); \ + } else { \ + (x).fsave.swd = ((long)(v) | 0xffff0000); \ + } \ +} while(0) + +#define i387_set_twd(x,v) \ +do { \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + (x).fxsave.fxtwd = (short)(v); \ + } else { \ + (x).fsave.twd = ((long)(v) | 0xffff0000); \ + } \ +} while(0) + +static inline unsigned short fputag_KNI_to_387(unsigned char tb) { + unsigned short tw = tb; + tw = (tw | (tw << 4)) & 0x0f0f; /* zzzz7654zzzz3210 */ + tw = (tw | (tw << 2)) & 0x3333; /* zz76zz54zz32zz10 */ + tw = (tw | (tw << 1)) & 0x5555; /* z7z6z5z4z3z2z1z0 */ + tw = ~(tw * 3); + return tw; +} + +static inline unsigned char fputag_387_to_KNI(unsigned short tw) { + tw = ~tw & 0x5555; /* z7z6z5z4z3z2z1z0 */ + tw = (tw | (tw >> 1)) & 0x3333; /* zz76zz54zz32zz10 */ + tw = (tw | (tw >> 2)) & 0x0f0f; /* zzzz7654zzzz3210 */ + tw = (tw | (tw >> 4)) & 0x00ff; /* zzzzzzzz76543210 */ + return tw; +} + +#else /* CONFIG_X86_FX */ + +#define i387_save_hard(x) \ +do { \ + __asm__ __volatile__("fnsave %0; fwait;" \ + : "=m" ((x).hard.fsave.cwd)); \ +} while(0) + +#define i387_restore_hard(x) \ +do { \ + __asm__ __volatile__("frstor %0" \ + : \ + :"m" ((x).hard.fsave.cwd)); \ +} while(0) + +#define i387_set_cwd(x,v) \ +do { (x).fsave.cwd = ((long)(v) | 0xffff0000); } while(0) + +#define i387_get_swd(x,v) \ +do { (v) = (unsigned long) (x).fsave.swd; } while(0) + +#define i387_set_swd(x,v) \ +do { (x).fsave.swd = ((long)(v) | 0xffff0000); } while(0) + +#define i387_set_twd(x,v) \ +do { (x).fsave.twd = ((long)(v) | 0xffff0000); } while(0) + +#endif /* CONFIG_X86_FX */ + + +/* + * For when we want to use the FPU in kernel code + * + * These functions allow the use of up to 4 KNI based xmm registers on the + * Pentium III processors or up to 4 MMX registers on Pentium MMX and above + * or compatible processors. Pick the routines that you need based on the + * regs you are going to use. Keep in mind that these are intended to be + * used only after you've verified that the processor supports these + * operations. Use them before you've done that and watch your machine go + * boom. Take a look in arch/i386/lib/best_function.c for an example of + * how to fixup the kernel with kni/mmx using functions once the CPU + * capabilities have been determined. 
+ * + * In all of these functions: + * + * recursive - int, used to determine what the state is at restore time + * regs - char * to an array that is 32 bytes for mmx and 64 bytes for kni + * which is then used to save off the contents of the current + * regs to be recursively safe + * task_switch_regs - char * to another array of the same size as the one + * above, but this array is optional. If your function might get + * pre-empted by another task then this pointer should be non-NULL + * so that at unlazy_fpu() time in the switch_to() function we + * can save your register state (copy_*_user functions are an example + * of functions that need this, since they can take a page fault and + * while that fault is being serviced the scheduler is free to run + * another task entirely). + * irqflags - unsigned long used to store IRQ state + */ + +#define SAVE_MMX_REGS(num_regs, regs) \ + switch( (num_regs) ) { \ + case 1: \ + __asm__ __volatile__("movq %%mm0, 0x00(%0)\n\t" \ + : : "r" ((regs)) : "memory" ); \ + break; \ + case 2: \ + __asm__ __volatile__("movq %%mm0, 0x00(%0)\n\t" \ + "movq %%mm1, 0x08(%0)\n\t" \ + : : "r" ((regs)) : "memory" ); \ + break; \ + case 3: \ + __asm__ __volatile__("movq %%mm0, 0x00(%0)\n\t" \ + "movq %%mm1, 0x08(%0)\n\t" \ + "movq %%mm2, 0x10(%0)\n\t" \ + : : "r" ((regs)) : "memory" ); \ + break; \ + case 4: \ + __asm__ __volatile__("movq %%mm0, 0x00(%0)\n\t" \ + "movq %%mm1, 0x08(%0)\n\t" \ + "movq %%mm2, 0x10(%0)\n\t" \ + "movq %%mm3, 0x18(%0)\n\t" \ + : : "r" ((regs)) : "memory" ); \ + break; \ + default: \ + panic("Invalid number of regs passed to SAVE_MMX_REGS\n"); \ + } + +#define RESTORE_MMX_REGS(num_regs, regs) \ + switch( (num_regs) ) { \ + case 1: \ + __asm__ __volatile__("movq 0x00(%0), %%mm0\n\t" \ + : : "r" ((regs)) : "memory" ); \ + break; \ + case 2: \ + __asm__ __volatile__("movq 0x00(%0), %%mm0\n\t" \ + "movq 0x08(%0), %%mm1\n\t" \ + : : "r" ((regs)) : "memory" ); \ + break; \ + case 3: \ + __asm__ __volatile__("movq 0x00(%0), %%mm0\n\t" \ + "movq 0x08(%0), %%mm1\n\t" \ + "movq 0x10(%0), %%mm2\n\t" \ + : : "r" ((regs)) : "memory" ); \ + break; \ + case 4: \ + __asm__ __volatile__("movq 0x00(%0), %%mm0\n\t" \ + "movq 0x08(%0), %%mm1\n\t" \ + "movq 0x10(%0), %%mm2\n\t" \ + "movq 0x18(%0), %%mm3\n\t" \ + : : "r" ((regs)) : "memory" ); \ + break; \ + default: \ + panic("Invalid number of regs passed to RESTORE_MMX_REGS\n"); \ + } + +#define SAVE_KNI_REGS(num_regs, regs) \ + switch( (num_regs) ) { \ + case 1: \ + __asm__ __volatile__("movups %%xmm0, 0x00(%0)\n\t" \ + : : "r" ((regs)) : "memory" ); \ + break; \ + case 2: \ + __asm__ __volatile__("movups %%xmm0, 0x00(%0)\n\t" \ + "movups %%xmm1, 0x10(%0)\n\t" \ + : : "r" ((regs)) : "memory" ); \ + break; \ + case 3: \ + __asm__ __volatile__("movups %%xmm0, 0x00(%0)\n\t" \ + "movups %%xmm1, 0x10(%0)\n\t" \ + "movups %%xmm2, 0x20(%0)\n\t" \ + : : "r" ((regs)) : "memory" ); \ + break; \ + case 4: \ + __asm__ __volatile__("movups %%xmm0, 0x00(%0)\n\t" \ + "movups %%xmm1, 0x10(%0)\n\t" \ + "movups %%xmm2, 0x20(%0)\n\t" \ + "movups %%xmm3, 0x30(%0)\n\t" \ + : : "r" ((regs)) : "memory" ); \ + break; \ + default: \ + panic("Invalid number of regs passed to SAVE_KNI_REGS\n"); \ + } + +#define RESTORE_KNI_REGS(num_regs, regs) \ + switch( (num_regs) ) { \ + case 1: \ + __asm__ __volatile__("movups 0x00(%0), %%xmm0\n\t" \ + : : "r" ((regs)) : "memory" ); \ + break; \ + case 2: \ + __asm__ __volatile__("movups 0x00(%0), %%xmm0\n\t" \ + "movups 0x10(%0), %%xmm1\n\t" \ + : : "r" ((regs)) : "memory" ); \ + break; \ + case 
3: \ + __asm__ __volatile__("movups 0x00(%0), %%xmm0\n\t" \ + "movups 0x10(%0), %%xmm1\n\t" \ + "movups 0x20(%0), %%xmm2\n\t" \ + : : "r" ((regs)) : "memory" ); \ + break; \ + case 4: \ + __asm__ __volatile__("movups 0x00(%0), %%xmm0\n\t" \ + "movups 0x10(%0), %%xmm1\n\t" \ + "movups 0x20(%0), %%xmm2\n\t" \ + "movups 0x30(%0), %%xmm3\n\t" \ + : : "r" ((regs)) : "memory" ); \ + break; \ + default: \ + panic("Invalid number of regs passed to RESTORE_KNI_REGS\n"); \ + } + +#define SFENCE() \ + __asm__ __volatile__("sfence":::"memory") + + +extern spinlock_t kern_fpu_lock; + +/* + * BIG FAT WARNING ---- DO NOT MIX KNI AND MMX OPTIMIZATIONS IN THE SAME + * KERNEL OR ELSE USER SPACE FPU REGS WILL SUFFER AS A RESULT!!!!!!!!!!! + * + * SECOND BIG FAT WARNING ---- EVEN THOUGH THE num_regs VARIABLE ONLY NEEDS + * TO BE AS LARGE AS THE REGS YOU ARE USING, THE task_switch_regs ARRAY + * MUST BE LARGE ENOUGH TO HOLD 4 OF THE REGISTERS YOU ARE USING BECAUSE + * unlazy_fpu HAS NO IDEA HOW MANY REGS WE ARE USING. SINCE 4 IS THE + * MAXIMUM NUMBER OF REGS ALLOWED, WE MUST HAVE SPACE FOR 4 ON A TASK + * SWITCH. + */ + +/* + * FPU lazy state save handling.. + */ +#define clear_fpu(tsk) do { \ + unsigned long irqflags; \ + spin_lock_irqsave(&kern_fpu_lock, irqflags); \ + tsk->used_math=0; \ + tsk->tss.x86_fpustate=0; \ + stts(); \ + spin_unlock_irqrestore(&kern_fpu_lock, irqflags); \ +} while (0) + +#define save_kern_fpu(tsk) do { \ + if(tsk->tss.mmx_reg_space != NULL) \ + __asm__("movq %%mm0, 0x00(%0)\n\t" \ + "movq %%mm1, 0x08(%0)\n\t" \ + "movq %%mm2, 0x10(%0)\n\t" \ + "movq %%mm3, 0x18(%0)\n\t" \ + :: "r" (tsk->tss.mmx_reg_space):"memory"); \ + if(tsk->tss.xmm_reg_space != NULL) \ + __asm__("movups %%xmm0, 0x00(%0)\n\t" \ + "movups %%xmm1, 0x10(%0)\n\t" \ + "movups %%xmm2, 0x20(%0)\n\t" \ + "movups %%xmm3, 0x30(%0)\n\t" \ + :: "r" (tsk->tss.xmm_reg_space):"memory"); \ +} while (0) + +#define unlazy_fpu(tsk) do { \ + unsigned long irqflags; \ + spin_lock_irqsave(&kern_fpu_lock, irqflags); \ + if (tsk->tss.x86_fpustate & X86_FPUSTATE_FPU_ENABLED) { \ + if (tsk->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY) { \ + save_kern_fpu(tsk); \ + } else if (!(tsk->tss.x86_fpustate & X86_FPUSTATE_FPU_SAVED) && \ + (tsk->used_math)) { \ + i387_save_hard(tsk->tss.i387); \ + tsk->tss.x86_fpustate |= X86_FPUSTATE_FPU_SAVED; \ + } \ + tsk->tss.x86_fpustate &= ~X86_FPUSTATE_FPU_ENABLED; \ + stts(); \ + } \ + spin_unlock_irqrestore(&kern_fpu_lock, irqflags); \ +} while (0) + +#define save_user_fpu_context(tsk) do { \ + if (!((tsk)->tss.x86_fpustate & X86_FPUSTATE_FPU_ENABLED)) { \ + clts(); \ + (tsk)->tss.x86_fpustate |= X86_FPUSTATE_FPU_ENABLED; \ + if((tsk)->tss.mmx_reg_space) { \ + __asm__("movq 0x00(%0), %%mm0\n\t" \ + "movq 0x08(%0), %%mm1\n\t" \ + "movq 0x10(%0), %%mm2\n\t" \ + "movq 0x18(%0), %%mm3\n\t" \ + :: "r" ((tsk)->tss.mmx_reg_space)); \ + } else if((tsk)->tss.xmm_reg_space) { \ + __asm__("movups 0x00(%0), %%xmm0\n\t" \ + "movups 0x10(%0), %%xmm1\n\t" \ + "movups 0x20(%0), %%xmm2\n\t" \ + "movups 0x30(%0), %%xmm3\n\t" \ + :: "r" ((tsk)->tss.xmm_reg_space)); \ + } \ + } else if (!((tsk)->tss.x86_fpustate & X86_FPUSTATE_FPU_SAVED) && \ + ((tsk)->used_math)) { \ + i387_save_hard((tsk)->tss.i387); \ + (tsk)->tss.x86_fpustate |= X86_FPUSTATE_FPU_SAVED; \ + } \ +} while (0) + +#define kernel_take_fpu_mmx(recursive, num_regs, regs, task_switch_regs) do { \ + unsigned long irqflags; \ + struct task_struct *tsk=current; \ + spin_lock_irqsave(&kern_fpu_lock, irqflags); \ + (recursive) = tsk->tss.x86_fpustate & 
X86_FPUSTATE_KERN_ANY; \ + save_user_fpu_context(tsk); \ + if((recursive) & X86_FPUSTATE_KERN_MMX) { \ + SAVE_MMX_REGS((num_regs), (regs)); \ + } else { \ + tsk->tss.x86_fpustate |= X86_FPUSTATE_KERN_MMX; \ + } \ + if ((task_switch_regs) && !tsk->tss.mmx_reg_space) { \ + tsk->tss.mmx_reg_space = (task_switch_regs); \ + } \ + spin_unlock_irqrestore(&kern_fpu_lock, irqflags); \ +} while (0) + +#define kernel_release_fpu_mmx(recursive, num_regs, regs, task_switch_regs) do { \ + unsigned long irqflags; \ + struct task_struct *tsk=current; \ + spin_lock_irqsave(&kern_fpu_lock, irqflags); \ + if (tsk->tss.mmx_reg_space == task_switch_regs) { \ + tsk->tss.mmx_reg_space = NULL; \ + } \ + if((recursive) & X86_FPUSTATE_KERN_MMX) { \ + if (!(tsk->tss.x86_fpustate & X86_FPUSTATE_FPU_ENABLED)) { \ + clts(); \ + tsk->tss.x86_fpustate |= X86_FPUSTATE_FPU_ENABLED; \ + } \ + RESTORE_MMX_REGS((num_regs), (regs)); \ + } else { \ + tsk->tss.x86_fpustate &= ~X86_FPUSTATE_KERN_MMX; \ + if(!(recursive) && \ + (tsk->tss.x86_fpustate & X86_FPUSTATE_FPU_ENABLED)) { \ + tsk->tss.x86_fpustate &= ~X86_FPUSTATE_FPU_ENABLED; \ + stts(); \ + } \ + } \ + spin_unlock_irqrestore(&kern_fpu_lock, irqflags); \ +} while (0) + +#define kernel_take_fpu_kni(recursive, num_regs, regs, task_switch_regs) do { \ + unsigned long irqflags; \ + struct task_struct *tsk=current; \ + spin_lock_irqsave(&kern_fpu_lock, irqflags); \ + (recursive) = tsk->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY; \ + save_user_fpu_context(tsk); \ + if((recursive) & X86_FPUSTATE_KERN_KNI) { \ + SAVE_KNI_REGS((num_regs), (regs)); \ + } else { \ + tsk->tss.x86_fpustate |= X86_FPUSTATE_KERN_KNI; \ + } \ + if ((task_switch_regs) && !tsk->tss.xmm_reg_space) { \ + tsk->tss.xmm_reg_space = (task_switch_regs); \ + } \ + spin_unlock_irqrestore(&kern_fpu_lock, irqflags); \ +} while (0) + + +#define kernel_release_fpu_kni(recursive, num_regs, regs, task_switch_regs) do { \ + unsigned long irqflags; \ + struct task_struct *tsk=current; \ + spin_lock_irqsave(&kern_fpu_lock, irqflags); \ + if (tsk->tss.xmm_reg_space == task_switch_regs) { \ + tsk->tss.xmm_reg_space = NULL; \ + } \ + if((recursive) & X86_FPUSTATE_KERN_KNI) { \ + if(!(tsk->tss.x86_fpustate & X86_FPUSTATE_FPU_ENABLED)) { \ + clts(); \ + tsk->tss.x86_fpustate |= X86_FPUSTATE_FPU_ENABLED; \ + } \ + RESTORE_KNI_REGS((num_regs), (regs)); \ + } else { \ + tsk->tss.x86_fpustate &= ~X86_FPUSTATE_KERN_KNI; \ + if(!(recursive) && \ + (tsk->tss.x86_fpustate & X86_FPUSTATE_FPU_ENABLED)) { \ + tsk->tss.x86_fpustate &= ~X86_FPUSTATE_FPU_ENABLED; \ + stts(); \ + } \ + } \ + spin_unlock_irqrestore(&kern_fpu_lock, irqflags); \ +} while (0) + + +#endif /* __ASM_I386_I387_H */ diff -urN 2.2.18pre14aa1/include/asm-i386/io.h z/include/asm-i386/io.h --- 2.2.18pre14aa1/include/asm-i386/io.h Tue Oct 3 01:35:32 2000 +++ z/include/asm-i386/io.h Tue Oct 3 03:58:03 2000 @@ -166,9 +166,9 @@ #define writew(b,addr) (*(volatile unsigned short *) __io_virt(addr) = (b)) #define writel(b,addr) (*(volatile unsigned int *) __io_virt(addr) = (b)) -#define memset_io(a,b,c) memset(__io_virt(a),(b),(c)) -#define memcpy_fromio(a,b,c) memcpy((a),__io_virt(b),(c)) -#define memcpy_toio(a,b,c) memcpy(__io_virt(a),(b),(c)) +#define memset_io(a,b,c) __memset_generic(__io_virt(a),(b),(c)) +#define memcpy_fromio(a,b,c) __memcpy((a),__io_virt(b),(c)) +#define memcpy_toio(a,b,c) __memcpy(__io_virt(a),(b),(c)) /* * Again, i386 does not require mem IO specific function. 
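The comment block at the top of i387.h above spells out the calling convention for the kernel FPU helpers. For reference, here is a hypothetical user of that API (not part of this patch), written the same way kni_memset() in arch/i386/lib/simd.c uses it: one XMM register, a 16-byte save area, and a NULL task_switch_regs because the routine cannot sleep or take a fault. Note the BIG FAT WARNING above about not mixing MMX and KNI users in the same kernel.

/* Hypothetical example: clear one 4K page with non-temporal stores. */
static void kni_clear_page(void *page)
{
        char xmm_space[16];     /* save area for the one register we borrow */
        int recursive;
        int i;

        kernel_take_fpu_kni(recursive, 1, &xmm_space[0], NULL);
        __asm__ __volatile__("xorps %%xmm0,%%xmm0" : : : "memory");
        /* a page is 4096-byte aligned, so movntps's 16-byte rule is met */
        for (i = 0; i < 4096; i += 16)
                __asm__ __volatile__("movntps %%xmm0,(%0)"
                                     : : "r" ((char *)page + i) : "memory");
        SFENCE();
        kernel_release_fpu_kni(recursive, 1, &xmm_space[0], NULL);
}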
diff -urN 2.2.18pre14aa1/include/asm-i386/processor.h z/include/asm-i386/processor.h --- 2.2.18pre14aa1/include/asm-i386/processor.h Mon Oct 2 22:28:15 2000 +++ z/include/asm-i386/processor.h Tue Oct 3 04:03:06 2000 @@ -7,10 +7,11 @@ #ifndef __ASM_I386_PROCESSOR_H #define __ASM_I386_PROCESSOR_H +#include #include #include -#include #include +#include /* * CPU type and hardware bug flags. Kept separately for each CPU. @@ -29,6 +30,7 @@ char rfu; int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ __u32 x86_capability; + __u32 mmu_cr4_features; char x86_vendor_id[16]; char x86_model_id[64]; int x86_cache_size; /* in KB - valid for CPUS which support this @@ -36,6 +38,7 @@ int fdiv_bug; int f00f_bug; int coma_bug; + int enable_fixups; unsigned long loops_per_jiffy; unsigned long *pgd_quick; unsigned long *pte_quick; @@ -72,16 +75,16 @@ #define X86_FEATURE_PGE 0x00002000 /* Page Global Enable */ #define X86_FEATURE_MCA 0x00004000 /* Machine Check Architecture */ #define X86_FEATURE_CMOV 0x00008000 /* CMOV instruction (FCMOVCC and FCOMI too if FPU present) */ -#define X86_FEATURE_PAT 0x00010000 /* Page Attribute Table */ +#define X86_FEATURE_PAT 0x00010000 /* Page Attribute Table */ #define X86_FEATURE_PSE36 0x00020000 /* 36-bit PSEs */ -#define X86_FEATURE_18 0x00040000 +#define X86_FEATURE_PN 0x00040000 /* 96 bit CPU serial # */ #define X86_FEATURE_19 0x00080000 #define X86_FEATURE_20 0x00100000 #define X86_FEATURE_21 0x00200000 #define X86_FEATURE_22 0x00400000 #define X86_FEATURE_MMX 0x00800000 /* multimedia extensions */ #define X86_FEATURE_FXSR 0x01000000 /* FXSAVE and FXRSTOR instructions (fast save and restore of FPU context), and CR4.OSFXSR (OS uses these instructions) available */ -#define X86_FEATURE_25 0x02000000 +#define X86_FEATURE_XMM 0x02000000 /* Intel MMX2 instruction set */ #define X86_FEATURE_26 0x04000000 #define X86_FEATURE_27 0x08000000 #define X86_FEATURE_28 0x10000000 @@ -91,6 +94,83 @@ extern struct cpuinfo_x86 boot_cpu_data; +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ +#define X86_CR4_DE 0x0008 /* enable debugging extensions */ +#define X86_CR4_PSE 0x0010 /* enable page size extensions */ +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ +#define X86_CR4_MCE 0x0040 /* Machine check enable */ +#define X86_CR4_PGE 0x0080 /* enable global pages */ +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ +#define X86_CR4_OSFXSR 0x0200 /* fast FPU save/restore */ +#define X86_CR4_OSXMMEXCPT 0x0400 /* KNI (MMX2) unmasked exception 16 */ + /* handler is available */ + +/* + * Some defines for using with the x86_fpu_state variable in the new + * thread struct. We use these because the rest of the kernel doesn't + * like us messing with current->flags at arbitrary times ;-) + */ +#define X86_FPUSTATE_FPU_SAVED 0x0001 +#define X86_FPUSTATE_FPU_ENABLED 0x0002 +#define X86_FPUSTATE_KERN_MMX 0x0004 +#define X86_FPUSTATE_KERN_KNI 0x0008 +#define X86_FPUSTATE_KERN_ANY (X86_FPUSTATE_KERN_MMX|X86_FPUSTATE_KERN_KNI) + +/* + * Save the cr4 feature set we're using (ie + * Pentium 4MB enable and PPro Global page + * enable), so that any CPU's that boot up + * after us can get the correct flags. 
+/*
+ * Save the cr4 feature set we're using (ie
+ * Pentium 4MB enable and PPro Global page
+ * enable), so that any CPU's that boot up
+ * after us can get the correct flags.
+ */
+
+static inline void set_in_cr4(unsigned long mask)
+{
+        boot_cpu_data.mmu_cr4_features |= mask;
+        __asm__("movl %%cr4,%%eax\n\t"
+                "orl %0,%%eax\n\t"
+                "movl %%eax,%%cr4\n"
+                : : "irg" (mask)
+                :"ax");
+}
+
+extern int disable_x86_serial_nr;
+
+static inline void disable_serial_nr(void)
+{
+        if ( disable_x86_serial_nr &&
+             (boot_cpu_data.x86_capability & X86_FEATURE_PN) ) {
+                printk("Disabling CPUID Serial number...");
+                __asm__ __volatile__( "movl $0x119,%%ecx\n\t"
+                                      "rdmsr\n\t"
+                                      "orl $0x00200000,%%eax\n\t"
+                                      "wrmsr":::"ax","dx","cx","memory");
+                /*
+                 * We might need to re-read the x86 capability set now to
+                 * make sure that the PN bit has been turned off so
+                 * we know that the serial number stuff is disabled
+                 *
+                 * Note: we don't need to re-read the registers.  We can tell
+                 * by rebooting that the flag is off since on reboots that
+                 * don't power the machine down the serial number doesn't
+                 * get disabled any more because it already is disabled.
+                 */
+                printk("done.\n");
+        }
+}
+
+static inline void load_default_mxcsr(void)
+{
+        long mxcsr = 0x1f80;
+
+        if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+             (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+                __asm__("ldmxcsr %0": :"m" (mxcsr));
+        }
+}
+
+
 #ifdef __SMP__
 extern struct cpuinfo_x86 cpu_data[];
 #define current_cpu_data cpu_data[smp_processor_id()]
@@ -173,36 +253,61 @@
  */
 #define IO_BITMAP_SIZE 32
 
-struct i387_hard_struct {
-       long    cwd;
-       long    swd;
-       long    twd;
-       long    fip;
-       long    fcs;
-       long    foo;
-       long    fos;
-       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
-       long    status;         /* software status information */
+struct i387_hard_fsave {
+       long    cwd;
+       long    swd;
+       long    twd;
+       long    fip;
+       long    fcs;
+       long    foo;
+       long    fos;
+       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
+};
+
+/*
+ * has to be 128-bit aligned
+ */
+struct i387_hard_fxsave {
+       unsigned short  fxcwd;
+       unsigned short  fxswd;
+       unsigned short  fxtwd;
+       unsigned short  fxfopcode;
+       long    fxfip;
+       short   fxfcs;
+       short   __reserved_00;
+       long    fxfoo;
+       short   fxfos;
+       short   __reserved_01;
+       long    mxcsr;
+       long    __reserved_02;
+       long    st_space[32];   /* 8*16 bytes for each FP/MMX-reg = 128 bytes */
+       long    xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
+       long    __reserved_03 [14*4];   /* 14 16byte lines for remainder */
+} __attribute__ ((aligned (16)));
+
+union i387_hard_union {
+       struct i387_hard_fxsave fxsave;
+       struct i387_hard_fsave  fsave;
 };
 
 struct i387_soft_struct {
-       long    cwd;
-       long    swd;
-       long    twd;
-       long    fip;
-       long    fcs;
-       long    foo;
-       long    fos;
-       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
-       unsigned char   ftop, changed, lookahead, no_update, rm, alimit;
-       struct info     *info;
-       unsigned long   entry_eip;
+       long    cwd;
+       long    swd;
+       long    twd;
+       long    fip;
+       long    fcs;
+       long    foo;
+       long    fos;
+       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
+       unsigned char   ftop, changed, lookahead, no_update, rm, alimit;
+       struct info     *info;
+       unsigned long   entry_eip;
 };
 
 union i387_union {
-       struct i387_hard_struct hard;
+       union i387_hard_union hard;
        struct i387_soft_struct soft;
-};
+} __attribute__ ((aligned(16)));
 
 typedef struct {
        unsigned long seg;
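
The fxsave image declared above is the fixed layout that the FXSAVE/FXRSTOR instructions operate on: 32 bytes of control/status header, 128 bytes of FP/MMX registers, 128 bytes of XMM registers and 224 reserved bytes, i.e. 512 bytes that must sit on a 16-byte boundary (hence the aligned attributes on both the struct and the enclosing union). A compile-time check along these lines (not part of the patch) would catch an accidental change to the layout:

    /* Illustrative compile-time assertion, not part of the patch: the
     * array size goes negative if the fxsave image is not the 512-byte,
     * 16-byte-aligned block the CPU expects. */
    extern int i387_fxsave_layout_check[
            (sizeof(struct i387_hard_fxsave) == 512 &&
             __alignof__(struct i387_hard_fxsave) == 16) ? 1 : -1];
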
@@ -244,6 +349,9 @@
        struct vm86_struct * vm86_info;
        unsigned long screen_bitmap;
        unsigned long v86flags, v86mask, v86mode, saved_esp0;
+       volatile long x86_fpustate;
+       char *mmx_reg_space;
+       char *xmm_reg_space;
 };
 
 #define INIT_MMAP \
@@ -265,8 +373,9 @@
        {~0, },                 /* ioperm */ \
        _TSS(0), 0, 0, 0, (mm_segment_t) { 0 }, /* obsolete */ \
        { 0, }, \
-       { { 0, }, },            /* 387 state */ \
+       { { { 0, }, }, },       /* 387 state */ \
        NULL, 0, 0, 0, 0, 0,    /* vm86_info */ \
+       0, NULL, NULL           /* fpustate, {mmx,xmm}_reg_space */ \
 }
 
 #define start_thread(regs, new_eip, new_esp) do { \
@@ -291,27 +400,6 @@
 extern void copy_segments(int nr, struct task_struct *p, struct mm_struct * mm);
 extern void release_segments(struct mm_struct * mm);
 extern void forget_segments(void);
-
-/*
- * FPU lazy state save handling..
- */
-#define save_fpu(tsk) do { \
-       asm volatile("fnsave %0\n\tfwait":"=m" (tsk->tss.i387)); \
-       tsk->flags &= ~PF_USEDFPU; \
-       stts(); \
-} while (0)
-
-#define unlazy_fpu(tsk) do { \
-       if (tsk->flags & PF_USEDFPU) \
-               save_fpu(tsk); \
-} while (0)
-
-#define clear_fpu(tsk) do { \
-       if (tsk->flags & PF_USEDFPU) { \
-               tsk->flags &= ~PF_USEDFPU; \
-               stts(); \
-       } \
-} while (0)
 
 /*
  * Return saved PC of a blocked thread.
diff -urN 2.2.18pre14aa1/include/asm-i386/string.h z/include/asm-i386/string.h
--- 2.2.18pre14aa1/include/asm-i386/string.h Mon Jan 17 16:44:44 2000
+++ z/include/asm-i386/string.h Tue Oct 3 03:58:03 2000
@@ -14,6 +14,10 @@
 #include
 #else
 
+#ifndef _LINUX_CONFIG_H
+#include
+#endif
+
 /*
  * This string-include defines all string functions as inline
  * functions. Use gcc. It also assumes ds=es=data space, this should be
@@ -293,10 +297,22 @@
 }
 
 #define __HAVE_ARCH_MEMCPY
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+extern void * best_memcpy(void * to, const void * from, size_t n);
+#define memcpy(t, f, n) \
+(__builtin_constant_p(n) ? \
+ (((n) < 512) ? \
+  __constant_memcpy((t),(f),(n)) : \
+  best_memcpy((t),(f),(n))) : \
+ (((n) < 512) ? \
+  __memcpy((t),(f),(n)) : \
+  best_memcpy((t),(f),(n))))
+#else
 #define memcpy(t, f, n) \
 (__builtin_constant_p(n) ? \
  __constant_memcpy((t),(f),(n)) : \
  __memcpy((t),(f),(n)))
+#endif
 
 #define __HAVE_ARCH_MEMMOVE
 extern inline void * memmove(void * dest,const void * src, size_t n)
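
The 512-byte cut-off above is the central heuristic: for short copies, saving and restoring FPU state would cost more than the MMX/KNI registers can gain, so they stay on the existing inline expansions, and only large copies are handed to best_memcpy(), which is declared extern here and implemented elsewhere in the patch. With CONFIG_X86_CPU_OPTIMIZATIONS enabled the macro resolves as shown in this small illustrative function (buffers and sizes are arbitrary):

    /* Illustrative only: how the new memcpy macro dispatches. */
    static void example_memcpy_dispatch(void *dst, const void *src, size_t n)
    {
            memcpy(dst, src, 16);    /* constant, < 512  -> __constant_memcpy() */
            memcpy(dst, src, 4096);  /* constant, >= 512 -> best_memcpy()       */
            memcpy(dst, src, n);     /* variable: run-time (n < 512) test picks */
                                     /* __memcpy() or best_memcpy()             */
    }
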
@@ -449,21 +465,33 @@
 #undef COMMON
 }
 
-#define __constant_c_x_memset(s, c, count) \
-(__builtin_constant_p(count) ? \
- __constant_c_and_count_memset((s),(c),(count)) : \
- __constant_c_memset((s),(c),(count)))
+#define __constant_x_count_memset(s, c, count) \
+(__builtin_constant_p(c) ? \
+ __constant_c_and_count_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) :\
+ __constant_count_memset((s),(c),(count)))
 
 #define __memset(s, c, count) \
-(__builtin_constant_p(count) ? \
- __constant_count_memset((s),(c),(count)) : \
+(__builtin_constant_p(c) ? \
+ __constant_c_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) : \
  __memset_generic((s),(c),(count)))
 
 #define __HAVE_ARCH_MEMSET
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+extern void * best_memset(void * s, char c, size_t count);
 #define memset(s, c, count) \
-(__builtin_constant_p(c) ? \
- __constant_c_x_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) : \
+(__builtin_constant_p(count) ? \
+ (((count) < 512) ? \
+  __constant_x_count_memset((s),(c),(count)) : \
+  best_memset((s),(c),(count))) : \
+ (((count) < 512) ? \
+  __memset((s),(c),(count)) : \
+  best_memset((s),(c),(count))))
+#else
+#define memset(s, c, count) \
+(__builtin_constant_p(count) ? \
+ __constant_x_count_memset((s),(c),(count)) : \
  __memset((s),(c),(count)))
+#endif
 
 /*
  * find the first occurrence of byte 'c', or 1 past the area if none
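
The reshuffled __memset helpers above all lean on the usual byte-splat trick: multiplying the (unsigned char) fill value by 0x01010101UL replicates it into every byte of a 32-bit word, so the word-wide stores write four copies of 'c' per iteration. A standalone illustration (not the patch's code; the real helpers do the word stores in inline assembly):

    /* Illustrative only: fill 'count' bytes (assumed to be a multiple of
     * four) with 'c', one 32-bit store per iteration, using the same
     * 0x01010101 splat the helpers above rely on. */
    static void example_memset_by_longs(void *s, char c, size_t count)
    {
            unsigned long fill = 0x01010101UL * (unsigned char) c;
            unsigned long *p = s;

            while (count >= sizeof(unsigned long)) {
                    *p++ = fill;
                    count -= sizeof(unsigned long);
            }
    }
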
diff -urN 2.2.18pre14aa1/include/asm-i386/uaccess.h z/include/asm-i386/uaccess.h
--- 2.2.18pre14aa1/include/asm-i386/uaccess.h Tue Oct 3 01:35:32 2000
+++ z/include/asm-i386/uaccess.h Tue Oct 3 03:58:03 2000
@@ -589,19 +589,60 @@
        return n;
 }
 
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+
+unsigned long best_copy_to_user(void *, const void *, unsigned long);
+unsigned long best_copy_from_user(void *, const void *, unsigned long);
+unsigned long __best_copy_to_user(void *, const void *, unsigned long);
+unsigned long __best_copy_from_user(void *, const void *, unsigned long);
+
 #define copy_to_user(to,from,n) \
        (__builtin_constant_p(n) ? \
+        (((n) < 512) ? \
         __constant_copy_to_user((to),(from),(n)) : \
-        __generic_copy_to_user((to),(from),(n)))
+        best_copy_to_user((to),(from),(n))) : \
+        (((n) < 512) ? \
+        __generic_copy_to_user((to),(from),(n)) : \
+        best_copy_to_user((to),(from),(n))))
 
 #define copy_from_user(to,from,n) \
        (__builtin_constant_p(n) ? \
+        (((n) < 512) ? \
         __constant_copy_from_user((to),(from),(n)) : \
-        __generic_copy_from_user((to),(from),(n)))
+        best_copy_from_user((to),(from),(n))) : \
+        (((n) < 512) ? \
+        __generic_copy_from_user((to),(from),(n)) : \
+        best_copy_from_user((to),(from),(n))))
 
-#define copy_to_user_ret(to,from,n,retval) ({ if (copy_to_user(to,from,n)) return retval; })
+#define __copy_to_user(to,from,n) \
+       (__builtin_constant_p(n) ? \
+        (((n) < 512) ? \
+        __constant_copy_to_user_nocheck((to),(from),(n)) : \
+        __best_copy_to_user((to),(from),(n))) : \
+        (((n) < 512) ? \
+        __generic_copy_to_user_nocheck((to),(from),(n)) : \
+        __best_copy_to_user((to),(from),(n))))
 
-#define copy_from_user_ret(to,from,n,retval) ({ if (copy_from_user(to,from,n)) return retval; })
+#define __copy_from_user(to,from,n) \
+       (__builtin_constant_p(n) ? \
+        (((n) < 512) ? \
+        __constant_copy_from_user_nocheck((to),(from),(n)) : \
+        __best_copy_from_user((to),(from),(n))) : \
+        (((n) < 512) ? \
+        __generic_copy_from_user_nocheck((to),(from),(n)) : \
+        __best_copy_from_user((to),(from),(n))))
+
+#else /* CONFIG_X86_CPU_OPTIMIZATIONS */
+
+#define copy_to_user(to,from,n) \
+       (__builtin_constant_p(n) ? \
+        __constant_copy_to_user((to),(from),(n)) : \
+        __generic_copy_to_user((to),(from),(n)))
+
+#define copy_from_user(to,from,n) \
+       (__builtin_constant_p(n) ? \
+        __constant_copy_from_user((to),(from),(n)) : \
+        __generic_copy_from_user((to),(from),(n)))
 
 #define __copy_to_user(to,from,n) \
        (__builtin_constant_p(n) ? \
@@ -612,6 +653,11 @@
        (__builtin_constant_p(n) ? \
         __constant_copy_from_user_nocheck((to),(from),(n)) : \
         __generic_copy_from_user_nocheck((to),(from),(n)))
+#endif
+
+#define copy_to_user_ret(to,from,n,retval) ({ if (copy_to_user(to,from,n)) return retval; })
+
+#define copy_from_user_ret(to,from,n,retval) ({ if (copy_from_user(to,from,n)) return retval; })
 
 long strncpy_from_user(char *dst, const char *src, long count);
 long __strncpy_from_user(char *dst, const char *src, long count);
diff -urN 2.2.18pre14aa1/init/main.c z/init/main.c
--- 2.2.18pre14aa1/init/main.c Tue Oct 3 01:35:32 2000
+++ z/init/main.c Tue Oct 3 03:58:03 2000
@@ -103,6 +103,7 @@
 #ifdef __i386__
 extern void ioapic_pirq_setup(char *str, int *ints);
 extern void ioapic_setup(char *str, int *ints);
+extern void x86_serial_nr_setup(char *str, int *ints);
 #endif
 #ifdef CONFIG_MICROCODE
 extern int microcode_init(void);
@@ -736,6 +737,9 @@
 #ifdef CONFIG_BLK_DEV_INITRD
        { "noinitrd", no_initrd },
 #endif
+#endif
+#ifdef __i386__
+       { "x86_serial_nr", x86_serial_nr_setup },
 #endif
 #ifdef CONFIG_CTC