diff -Nru a/Documentation/ia64/fsys.txt b/Documentation/ia64/fsys.txt --- a/Documentation/ia64/fsys.txt Fri Oct 17 23:12:58 2003 +++ b/Documentation/ia64/fsys.txt Fri Oct 17 23:12:58 2003 @@ -4,7 +4,7 @@ ----------------------------------- Started: 13-Jan-2003 - Last update: 11-Feb-2003 + Last update: 27-Sep-2003 David Mosberger-Tang @@ -146,6 +146,12 @@ task pointer is not considered sensitive: it's already exposed through ar.k6). + o Fsyscall-handlers MUST NOT access user-memory without first + validating access-permission (this can be done typically via + probe.r.fault and/or probe.w.fault) and without guarding against + memory access exceptions (this can be done with the EX() macros + defined by asmmacro.h). + The above restrictions may seem draconian, but remember that it's possible to trade off some of the restrictions by paying a slightly higher overhead. For example, if an fsyscall-handler could benefit @@ -229,3 +235,52 @@ PSR.bn Unchanged. Note: fsys-mode handlers may clear the bit, if needed. Doing so requires clearing PSR.i and PSR.ic as well. PSR.ia Unchanged. Note: the ia64 linux kernel never sets this bit. + +* Using fast system calls + +To use fast system calls, userspace applications need simply call +__kernel_syscall_via_epc(). For example + +-- example fgettimeofday() call -- +-- fgettimeofday.S -- + +#include + +GLOBAL_ENTRY(fgettimeofday) +.prologue +.save ar.pfs, r11 +mov r11 = ar.pfs +.body + +mov r2 = 0xa000000000020660;; // gate address + // found by inspection of System.map for the + // __kernel_syscall_via_epc() function. See + // below for how to do this for real. + +mov b7 = r2 +mov r15 = 1087 // gettimeofday syscall +;; +br.call.sptk.many b6 = b7 +;; + +.restore sp + +mov ar.pfs = r11 +br.ret.sptk.many rp;; // return to caller +END(fgettimeofday) + +-- end fgettimeofday.S -- + +In reality, getting the gate address is accomplished by two extra +values passed via the ELF auxiliary vector (include/asm-ia64/elf.h) + + o AT_SYSINFO : is the address of __kernel_syscall_via_epc() + o AT_SYSINFO_EHDR : is the address of the kernel gate ELF DSO + +The ELF DSO is a pre-linked library that is mapped in by the kernel at +the gate page. It is a proper ELF shared object so, with a dynamic +loader that recognises the library, you should be able to make calls to +the exported functions within it as with any other shared library. +AT_SYSINFO points into the kernel DSO at the +__kernel_syscall_via_epc() function for historical reasons (it was +used before the kernel DSO) and as a convenience. diff -Nru a/arch/ia64/Kconfig b/arch/ia64/Kconfig --- a/arch/ia64/Kconfig Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/Kconfig Fri Oct 17 23:12:58 2003 @@ -57,6 +57,10 @@ config IA64_GENERIC bool "generic" + select NUMA + select ACPI_NUMA + select VIRTUAL_MEM_MAP + select DISCONTIGMEM ---help--- This selects the system type of your hardware. A "generic" kernel will run on any supported IA-64 system. However, if you configure @@ -220,24 +224,8 @@ Access). This option is for configuring high-end multiprocessor server systems. If in doubt, say N. 
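A minimal sketch of the AT_SYSINFO/AT_SYSINFO_EHDR lookup described in the fsys.txt hunk above (this program is illustrative only and is not part of the patch; it assumes a glibc new enough to provide getauxval()):

-- auxv lookup example --
-- gate_addrs.c --

#include <stdio.h>
#include <elf.h>        /* AT_SYSINFO, AT_SYSINFO_EHDR */
#include <sys/auxv.h>   /* getauxval() */

int main (void)
{
	/* getauxval() returns 0 if the requested entry is absent */
	unsigned long epc  = getauxval(AT_SYSINFO);      /* __kernel_syscall_via_epc() */
	unsigned long ehdr = getauxval(AT_SYSINFO_EHDR); /* kernel gate ELF DSO */

	printf("__kernel_syscall_via_epc() at 0x%lx\n", epc);
	printf("gate DSO ELF header at 0x%lx\n", ehdr);
	return 0;
}

-- end gate_addrs.c --

With the gate address obtained this way, the branch target used in the fgettimeofday.S example above no longer needs to be hard-coded from System.map.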
-choice - prompt "Maximum Memory per NUMA Node" if NUMA && IA64_DIG - depends on NUMA && IA64_DIG - default IA64_NODESIZE_16GB - -config IA64_NODESIZE_16GB - bool "16GB" - -config IA64_NODESIZE_64GB - bool "64GB" - -config IA64_NODESIZE_256GB - bool "256GB" - -endchoice - config DISCONTIGMEM - bool "Discontiguous memory support" if (IA64_DIG || IA64_SGI_SN2 || IA64_GENERIC) && NUMA + bool "Discontiguous memory support" if (IA64_DIG || IA64_SGI_SN2 || IA64_GENERIC) && NUMA && VIRTUAL_MEM_MAP default y if (IA64_SGI_SN2 || IA64_GENERIC) && NUMA help Say Y to support efficient handling of discontiguous physical memory, @@ -250,14 +238,10 @@ default y if !IA64_HP_SIM help Say Y to compile the kernel with support for a virtual mem map. - This is an alternate method of supporting large holes in the - physical address space on non NUMA machines. Since the DISCONTIGMEM - option is not supported on machines with the ZX1 chipset, this is - the only way of supporting more than 1 Gb of memory on those - machines. This code also only takes effect if a memory hole of - greater than 1 Gb is found during boot, so it is safe to enable - unless you require the DISCONTIGMEM option for your machine. If you - are unsure, say Y. + This code also only takes effect if a memory hole of greater than + 1 Gb is found during boot. You must turn this option on if you + require the DISCONTIGMEM option for your machine. If you are + unsure, say Y. config IA64_MCA bool "Enable IA-64 Machine Check Abort" @@ -636,6 +620,33 @@ send a BREAK and then within 5 seconds a command keypress. The keys are documented in . Don't say Y unless you really know what this hack does. + +config IA64_EARLY_PRINTK + bool "Early printk support" + depends on DEBUG_KERNEL && !IA64_GENERIC + help + Selecting this option uses the VGA screen or serial console for + printk() output before the consoles are initialised. It is useful + for debugging problems early in the boot process, but only if you + have a suitable VGA/serial console attached. If you're unsure, + select N. 
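A rough sketch of what the early printk support amounts to on an MMIO serial port (hypothetical helper, not taken from this patch; it assumes an 8250/16550-compatible UART at the configured MMIO base, accessed through ia64's uncached identity mapping):

-- illustrative sketch: early_putc() over an MMIO 16550-style UART --

#define EARLY_UART_BASE   0xff5e0000UL            /* CONFIG_IA64_EARLY_PRINTK_UART_BASE default */
#define IA64_UNCACHED_OFF 0xc000000000000000UL    /* uncached kernel identity mapping (region 6) */
#define UART_THR          0x00                    /* transmit holding register */
#define UART_LSR          0x05                    /* line status register */
#define UART_LSR_THRE     0x20                    /* transmitter holding register empty */

static void early_putc (char c)
{
	volatile unsigned char *uart =
		(volatile unsigned char *) (IA64_UNCACHED_OFF | EARLY_UART_BASE);

	while (!(uart[UART_LSR] & UART_LSR_THRE))
		;                                 /* wait until the UART can accept a byte */
	uart[UART_THR] = c;
}

-- end sketch --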
+ +config IA64_EARLY_PRINTK_UART + bool "Early printk on MMIO serial port" + depends on IA64_EARLY_PRINTK + +config IA64_EARLY_PRINTK_UART_BASE + hex "UART MMIO base address" + depends on IA64_EARLY_PRINTK_UART + default "ff5e0000" + +config IA64_EARLY_PRINTK_VGA + bool "Early printk on VGA" + depends on IA64_EARLY_PRINTK + +config IA64_EARLY_PRINTK_SGI_SN + bool "Early printk on SGI SN serial console" + depends on IA64_EARLY_PRINTK && (IA64_GENERIC || IA64_SGI_SN2) config DEBUG_SLAB bool "Debug memory allocations" diff -Nru a/arch/ia64/Makefile b/arch/ia64/Makefile --- a/arch/ia64/Makefile Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/Makefile Fri Oct 17 23:12:58 2003 @@ -64,7 +64,7 @@ drivers-$(CONFIG_PCI) += arch/ia64/pci/ drivers-$(CONFIG_IA64_HP_SIM) += arch/ia64/hp/sim/ drivers-$(CONFIG_IA64_HP_ZX1) += arch/ia64/hp/common/ arch/ia64/hp/zx1/ -drivers-$(CONFIG_IA64_GENERIC) += arch/ia64/hp/common/ arch/ia64/hp/zx1/ arch/ia64/hp/sim/ +drivers-$(CONFIG_IA64_GENERIC) += arch/ia64/hp/common/ arch/ia64/hp/zx1/ arch/ia64/hp/sim/ arch/ia64/sn/ drivers-$(CONFIG_OPROFILE) += arch/ia64/oprofile/ boot := arch/ia64/hp/sim/boot diff -Nru a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c --- a/arch/ia64/ia32/sys_ia32.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/ia32/sys_ia32.c Fri Oct 17 23:12:58 2003 @@ -2486,11 +2486,14 @@ putstat64 (struct stat64 *ubuf, struct kstat *kbuf) { int err; + u64 hdev; if (clear_user(ubuf, sizeof(*ubuf))) return -EFAULT; - err = __put_user(huge_encode_dev(kbuf->dev), &ubuf->st_dev); + hdev = huge_encode_dev(kbuf->dev); + err = __put_user(hdev, (u32*)&ubuf->st_dev); + err |= __put_user(hdev >> 32, ((u32*)&ubuf->st_dev) + 1); err |= __put_user(kbuf->ino, &ubuf->__st_ino); err |= __put_user(kbuf->ino, &ubuf->st_ino_lo); err |= __put_user(kbuf->ino >> 32, &ubuf->st_ino_hi); @@ -2498,7 +2501,9 @@ err |= __put_user(kbuf->nlink, &ubuf->st_nlink); err |= __put_user(kbuf->uid, &ubuf->st_uid); err |= __put_user(kbuf->gid, &ubuf->st_gid); - err |= __put_user(huge_encode_dev(kbuf->rdev), &ubuf->st_rdev); + hdev = huge_encode_dev(kbuf->rdev); + err = __put_user(hdev, (u32*)&ubuf->st_rdev); + err |= __put_user(hdev >> 32, ((u32*)&ubuf->st_rdev) + 1); err |= __put_user(kbuf->size, &ubuf->st_size_lo); err |= __put_user((kbuf->size >> 32), &ubuf->st_size_hi); err |= __put_user(kbuf->atime.tv_sec, &ubuf->st_atime); @@ -2724,8 +2729,8 @@ struct epoll_event32 { u32 events; - u64 data; -} __attribute__((packed)); + u32 data[2]; +}; asmlinkage long sys32_epoll_ctl(int epfd, int op, int fd, struct epoll_event32 *event) @@ -2740,10 +2745,10 @@ return error; __get_user(event64.events, &event->events); - __get_user(data_halfword, (u32*)(&event->data)); + __get_user(data_halfword, &event->data[0]); event64.data = data_halfword; - __get_user(data_halfword, ((u32*)(&event->data) + 1)); - event64.data |= ((u64)data_halfword) << 32; + __get_user(data_halfword, &event->data[1]); + event64.data |= (u64)data_halfword << 32; set_fs(KERNEL_DS); error = sys_epoll_ctl(epfd, op, fd, &event64); @@ -2758,8 +2763,9 @@ { struct epoll_event *events64 = NULL; mm_segment_t old_fs = get_fs(); - int error; + int error, numevents, size; int evt_idx; + int do_free_pages = 0; if (maxevents <= 0) { return -EINVAL; @@ -2770,43 +2776,45 @@ maxevents * sizeof(struct epoll_event32)))) return error; - /* Allocate the space needed for the intermediate copy */ - events64 = kmalloc(maxevents * sizeof(struct epoll_event), GFP_KERNEL); + /* + * Allocate space for the intermediate copy. 
If the space needed + * is large enough to cause kmalloc to fail, then try again with + * __get_free_pages. + */ + size = maxevents * sizeof(struct epoll_event); + events64 = kmalloc(size, GFP_KERNEL); if (events64 == NULL) { - return -ENOMEM; - } - - /* Expand the 32-bit structures into the 64-bit structures */ - for (evt_idx = 0; evt_idx < maxevents; evt_idx++) { - u32 data_halfword; - __get_user(events64[evt_idx].events, &events[evt_idx].events); - __get_user(data_halfword, (u32*)(&events[evt_idx].data)); - events64[evt_idx].data = data_halfword; - __get_user(data_halfword, ((u32*)(&events[evt_idx].data) + 1)); - events64[evt_idx].data |= ((u64)data_halfword) << 32; + events64 = (struct epoll_event *) + __get_free_pages(GFP_KERNEL, get_order(size)); + if (events64 == NULL) + return -ENOMEM; + do_free_pages = 1; } /* Do the system call */ set_fs(KERNEL_DS); /* copy_to/from_user should work on kernel mem*/ - error = sys_epoll_wait(epfd, events64, maxevents, timeout); + numevents = sys_epoll_wait(epfd, events64, maxevents, timeout); set_fs(old_fs); /* Don't modify userspace memory if we're returning an error */ - if (!error) { + if (numevents > 0) { /* Translate the 64-bit structures back into the 32-bit structures */ - for (evt_idx = 0; evt_idx < maxevents; evt_idx++) { + for (evt_idx = 0; evt_idx < numevents; evt_idx++) { __put_user(events64[evt_idx].events, &events[evt_idx].events); - __put_user((u32)(events64[evt_idx].data), - (u32*)(&events[evt_idx].data)); + __put_user((u32)events64[evt_idx].data, + &events[evt_idx].data[0]); __put_user((u32)(events64[evt_idx].data >> 32), - ((u32*)(&events[evt_idx].data) + 1)); + &events[evt_idx].data[1]); } } - kfree(events64); - return error; + if (do_free_pages) + free_pages((unsigned long) events64, get_order(size)); + else + kfree(events64); + return numevents; } #ifdef NOTYET /* UNTESTED FOR IA64 FROM HERE DOWN */ diff -Nru a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c --- a/arch/ia64/kernel/acpi.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/acpi.c Fri Oct 17 23:12:58 2003 @@ -56,6 +56,7 @@ void (*pm_power_off) (void); unsigned char acpi_kbd_controller_present = 1; +unsigned char acpi_legacy_devices; int acpi_disabled; /* XXX this shouldn't be needed---we can't boot without ACPI! */ @@ -380,7 +381,7 @@ void __init acpi_numa_memory_affinity_init (struct acpi_table_memory_affinity *ma) { - unsigned long paddr, size, hole_size, min_hole_size; + unsigned long paddr, size; u8 pxm; struct node_memblk_s *p, *q, *pend; @@ -402,34 +403,6 @@ if (!ma->flags.enabled) return; - /* - * When the chunk is not the first one in the node, check distance - * from the other chunks. When the hole is too huge ignore the chunk. - * This restriction should be removed when multiple chunks per node - * is supported. - */ - pend = &node_memblk[num_memblks]; - min_hole_size = 0; - for (p = &node_memblk[0]; p < pend; p++) { - if (p->nid != pxm) - continue; - if (p->start_paddr < paddr) - hole_size = paddr - (p->start_paddr + p->size); - else - hole_size = p->start_paddr - (paddr + size); - - if (!min_hole_size || hole_size < min_hole_size) - min_hole_size = hole_size; - } - - if (min_hole_size) { - if (min_hole_size > size) { - printk(KERN_ERR "Too huge memory hole. 
Ignoring %ld MBytes at %lx\n", - size/(1024*1024), paddr); - return; - } - } - /* record this node in proximity bitmap */ pxm_bit_set(pxm); @@ -454,6 +427,12 @@ { int i, j, node_from, node_to; + /* If there's no SRAT, fix the phys_id */ + if (srat_num_cpus == 0) { + node_cpuid[0].phys_id = hard_smp_processor_id(); + return; + } + /* calculate total number of nodes in system from PXM bitmap */ numnodes = 0; /* init total nodes in system */ @@ -531,6 +510,9 @@ if (!(fadt->iapc_boot_arch & BAF_8042_KEYBOARD_CONTROLLER)) acpi_kbd_controller_present = 0; + if (fadt->iapc_boot_arch & BAF_LEGACY_DEVICES) + acpi_legacy_devices = 1; + acpi_register_irq(fadt->sci_int, ACPI_ACTIVE_LOW, ACPI_LEVEL_SENSITIVE); return 0; } @@ -614,6 +596,12 @@ smp_build_cpu_map(); # ifdef CONFIG_NUMA + if (srat_num_cpus == 0) { + int cpu, i = 1; + for (cpu = 0; cpu < smp_boot_data.cpu_count; cpu++) + if (smp_boot_data.cpu_phys_id[cpu] != hard_smp_processor_id()) + node_cpuid[i++].phys_id = smp_boot_data.cpu_phys_id[cpu]; + } build_cpu_to_node_map(); # endif #endif diff -Nru a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c --- a/arch/ia64/kernel/asm-offsets.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/asm-offsets.c Fri Oct 17 23:12:58 2003 @@ -33,16 +33,30 @@ BLANK(); + DEFINE(IA64_TASK_BLOCKED_OFFSET,offsetof (struct task_struct, blocked)); DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid)); DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader)); + DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending)); DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid)); DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent)); + DEFINE(IA64_TASK_SIGHAND_OFFSET,offsetof (struct task_struct, sighand)); + DEFINE(IA64_TASK_SIGNAL_OFFSET,offsetof (struct task_struct, signal)); DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, tgid)); DEFINE(IA64_TASK_THREAD_KSP_OFFSET, offsetof (struct task_struct, thread.ksp)); DEFINE(IA64_TASK_THREAD_ON_USTACK_OFFSET, offsetof (struct task_struct, thread.on_ustack)); BLANK(); + DEFINE(IA64_SIGHAND_SIGLOCK_OFFSET,offsetof (struct sighand_struct, siglock)); + + BLANK(); + + DEFINE(IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,offsetof (struct signal_struct, + group_stop_count)); + DEFINE(IA64_SIGNAL_SHARED_PENDING_OFFSET,offsetof (struct signal_struct, shared_pending)); + + BLANK(); + DEFINE(IA64_PT_REGS_B6_OFFSET, offsetof (struct pt_regs, b6)); DEFINE(IA64_PT_REGS_B7_OFFSET, offsetof (struct pt_regs, b7)); DEFINE(IA64_PT_REGS_AR_CSD_OFFSET, offsetof (struct pt_regs, ar_csd)); @@ -155,6 +169,10 @@ DEFINE(IA64_SIGCONTEXT_R12_OFFSET, offsetof (struct sigcontext, sc_gr[12])); DEFINE(IA64_SIGCONTEXT_RBS_BASE_OFFSET,offsetof (struct sigcontext, sc_rbs_base)); DEFINE(IA64_SIGCONTEXT_LOADRS_OFFSET, offsetof (struct sigcontext, sc_loadrs)); + + BLANK(); + + DEFINE(IA64_SIGPENDING_SIGNAL_OFFSET, offsetof (struct sigpending, signal)); BLANK(); diff -Nru a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S --- a/arch/ia64/kernel/fsys.S Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/fsys.S Fri Oct 17 23:12:58 2003 @@ -4,6 +4,7 @@ * Copyright (C) 2003 Hewlett-Packard Co * David Mosberger-Tang * + * 25-Sep-03 davidm Implement fsys_rt_sigprocmask(). * 18-Feb-03 louisk Implement fsys_gettimeofday(). * 28-Feb-03 davidm Fixed several bugs in fsys_gettimeofday(). Tuned it some more, * probably broke it along the way... 
;-) @@ -15,6 +16,7 @@ #include #include #include +#include #include #include @@ -48,8 +50,7 @@ .body mov r8=ENOSYS mov r10=-1 - MCKINLEY_E9_WORKAROUND - br.ret.sptk.many b6 + FSYS_RETURN END(fsys_ni_syscall) ENTRY(fsys_getpid) @@ -66,8 +67,7 @@ ;; cmp.ne p8,p0=0,r9 (p8) br.spnt.many fsys_fallback_syscall - MCKINLEY_E9_WORKAROUND - br.ret.sptk.many b6 + FSYS_RETURN END(fsys_getpid) ENTRY(fsys_getppid) @@ -114,8 +114,7 @@ mov r18=0 // i must not leak kernel bits... mov r19=0 // i must not leak kernel bits... #endif - MCKINLEY_E9_WORKAROUND - br.ret.sptk.many b6 + FSYS_RETURN END(fsys_getppid) ENTRY(fsys_set_tid_address) @@ -141,8 +140,7 @@ ;; mov r17=0 // i must not leak kernel bits... mov r18=0 // i must not leak kernel bits... - MCKINLEY_E9_WORKAROUND - br.ret.sptk.many b6 + FSYS_RETURN END(fsys_set_tid_address) /* @@ -199,7 +197,7 @@ adds r10=IA64_CPUINFO_ITM_DELTA_OFFSET, r10 (p7) tnat.nz p6,p0=r33 -(p6) br.cond.spnt.few .fail +(p6) br.cond.spnt.few .fail_einval adds r8=IA64_CPUINFO_NSEC_PER_CYC_OFFSET, r3 movl r24=2361183241434822607 // for division hack (only for / 1000) @@ -225,8 +223,8 @@ * to store the result. That's OK as long as the stores are also * protect by EX(). */ -EX(.fail, probe.w.fault r32, 3) // this must come _after_ NaT-check -EX(.fail, probe.w.fault r10, 3) // this must come _after_ NaT-check +EX(.fail_efault, probe.w.fault r32, 3) // this must come _after_ NaT-check +EX(.fail_efault, probe.w.fault r10, 3) // this must come _after_ NaT-check nop 0 ldf8 f10=[r8] // f10 <- local_cpu_data->nsec_per_cyc value @@ -311,14 +309,13 @@ (p7) br.spnt.many 1b // finally: r2 = sec, r3 = usec -EX(.fail, st8 [r32]=r2) +EX(.fail_efault, st8 [r32]=r2) adds r9=8, r32 mov r8=r0 // success ;; -EX(.fail, st8 [r9]=r3) // store them in the timeval struct +EX(.fail_efault, st8 [r9]=r3) // store them in the timeval struct mov r10=0 - MCKINLEY_E9_WORKAROUND - br.ret.sptk.many b6 // return to caller + FSYS_RETURN /* * Note: We are NOT clearing the scratch registers here. Since the only things * in those registers are time-related variables and some addresses (which @@ -326,12 +323,183 @@ * and we should be fine. */ -.fail: adds r8=EINVAL, r0 // r8 = EINVAL - adds r10=-1, r0 // r10 = -1 - MCKINLEY_E9_WORKAROUND - br.ret.spnt.many b6 // return with r8 set to EINVAL +.fail_einval: + mov r8=EINVAL // r8 = EINVAL + mov r10=-1 // r10 = -1 + FSYS_RETURN + +.fail_efault: + mov r8=EFAULT // r8 = EFAULT + mov r10=-1 // r10 = -1 + FSYS_RETURN END(fsys_gettimeofday) +/* + * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize). + */ +#if _NSIG_WORDS != 1 +# error Sorry, fsys_rt_sigprocmask() needs to be updated for _NSIG_WORDS != 1. +#endif +ENTRY(fsys_rt_sigprocmask) + .prologue + .altrp b6 + .body + + mf // ensure reading of current->blocked is ordered + add r2=IA64_TASK_BLOCKED_OFFSET,r16 + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 + ;; + /* + * Since we're only reading a single word, we can do it + * atomically without acquiring current->sighand->siglock. To + * be on the safe side, we need a fully-ordered load, though: + */ + ld8.acq r3=[r2] // read/prefetch current->blocked + ld4 r9=[r9] + add r31=IA64_TASK_SIGHAND_OFFSET,r16 + ;; +#ifdef CONFIG_SMP + ld8 r31=[r31] // r31 <- current->sighand +#endif + and r9=TIF_ALLWORK_MASK,r9 + tnat.nz p6,p0=r32 + ;; + cmp.ne p7,p0=0,r9 + tnat.nz.or p6,p0=r35 + tnat.nz p8,p0=r34 + ;; + cmp.ne p15,p0=r0,r34 // oset != NULL? 
+ cmp.ne.or p6,p0=_NSIG_WORDS*8,r35 + tnat.nz.or p8,p0=r33 + +(p6) br.spnt.few .fail_einval // fail with EINVAL +(p7) br.spnt.many fsys_fallback_syscall // got pending kernel work... +(p8) br.spnt.few .fail_efault // fail with EFAULT + ;; + + cmp.eq p6,p7=r0,r33 // set == NULL? + add r31=IA64_SIGHAND_SIGLOCK_OFFSET,r31 // r31 <- current->sighand->siglock +(p6) br.dpnt.many .store_mask // -> short-circuit to just reading the signal mask + + /* Argh, we actually have to do some work and _update_ the signal mask: */ + +EX(.fail_efault, probe.r.fault r33, 3) // verify user has read-access to *set +EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set + mov r17=(1 << (SIGKILL - 1)) | (1 << (SIGSTOP - 1)) + ;; + + rsm psr.i // mask interrupt delivery + mov ar.ccv=0 + andcm r14=r14,r17 // filter out SIGKILL & SIGSTOP + +#ifdef CONFIG_SMP + mov r17=1 + ;; + cmpxchg4.acq r18=[r31],r17,ar.ccv // try to acquire the lock + mov r8=EINVAL // default to EINVAL + ;; + ld8 r3=[r2] // re-read current->blocked now that we hold the lock + cmp4.ne p6,p0=r18,r0 +(p6) br.cond.spnt.many .lock_contention + ;; +#else + ld8 r3=[r2] // re-read current->blocked now that we hold the lock + mov r8=EINVAL // default to EINVAL +#endif + add r18=IA64_TASK_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r16 + add r19=IA64_TASK_SIGNAL_OFFSET,r16 + cmp4.eq p6,p0=SIG_BLOCK,r32 + ;; + ld8 r19=[r19] // r19 <- current->signal + cmp4.eq p7,p0=SIG_UNBLOCK,r32 + cmp4.eq p8,p0=SIG_SETMASK,r32 + ;; + ld8 r18=[r18] // r18 <- current->pending.signal + .pred.rel.mutex p6,p7,p8 +(p6) or r3=r3,r14 // SIG_BLOCK +(p7) andcm r3=r3,r14 // SIG_UNBLOCK + +(p8) mov r3=r14 // SIG_SETMASK +(p6) mov r8=0 // clear error code + // recalc_sigpending() + add r17=IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,r19 + + add r19=IA64_SIGNAL_SHARED_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r19 + ;; + ld4 r17=[r17] // r17 <- current->signal->group_stop_count +(p7) mov r8=0 // clear error code + + ld8 r19=[r19] // r19 <- current->signal->shared_pending + ;; + cmp4.gt p6,p7=r17,r0 // p6/p7 <- (current->signal->group_stop_count > 0)? +(p8) mov r8=0 // clear error code + + or r18=r18,r19 // r18 <- current->pending | current->signal->shared_pending + ;; + // r18 <- (current->pending | current->signal->shared_pending) & ~current->blocked: + andcm r18=r18,r3 + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 + ;; + +(p7) cmp.ne.or.andcm p6,p7=r18,r0 // p6/p7 <- signal pending + mov r19=0 // i must not leak kernel bits... +(p6) br.cond.dpnt.many .sig_pending + ;; + +1: ld4 r17=[r9] // r17 <- current->thread_info->flags + ;; + mov ar.ccv=r17 + and r18=~_TIF_SIGPENDING,r17 // r18 <- r17 & ~(1 << TIF_SIGPENDING) + ;; + + st8 [r2]=r3 // update current->blocked with new mask + cmpxchg4.acq r14=[r9],r18,ar.ccv // current->thread_info->flags <- r18 + ;; + cmp.ne p6,p0=r17,r14 // update failed? +(p6) br.cond.spnt.few 1b // yes -> retry + +#ifdef CONFIG_SMP + st4.rel [r31]=r0 // release the lock +#endif + ssm psr.i + cmp.ne p9,p0=r8,r0 // check for bad HOW value + ;; + + srlz.d // ensure psr.i is set again + mov r18=0 // i must not leak kernel bits... +(p9) br.spnt.few .fail_einval // bail out for bad HOW value + +.store_mask: +EX(.fail_efault, (p15) probe.w.fault r34, 3) // verify user has write-access to *oset +EX(.fail_efault, (p15) st8 [r34]=r3) + mov r2=0 // i must not leak kernel bits... + mov r3=0 // i must not leak kernel bits... + mov r8=0 // return 0 + mov r9=0 // i must not leak kernel bits... + mov r14=0 // i must not leak kernel bits... + mov r17=0 // i must not leak kernel bits... 
+ mov r31=0 // i must not leak kernel bits... + FSYS_RETURN + +.sig_pending: +#ifdef CONFIG_SMP + st4.rel [r31]=r0 // release the lock +#endif + ssm psr.i + ;; + srlz.d + br.sptk.many fsys_fallback_syscall // with signal pending, do the heavy-weight syscall + +#ifdef CONFIG_SMP +.lock_contention: + /* Rather than spinning here, fall back on doing a heavy-weight syscall. */ + ssm psr.i + ;; + srlz.d + br.sptk.many fsys_fallback_syscall +#endif +END(fsys_rt_sigprocmask) + ENTRY(fsys_fallback_syscall) .prologue .altrp b6 @@ -600,7 +768,7 @@ data8 0 // sigaltstack data8 0 // rt_sigaction data8 0 // rt_sigpending - data8 0 // rt_sigprocmask + data8 fsys_rt_sigprocmask // rt_sigprocmask data8 0 // rt_sigqueueinfo // 1180 data8 0 // rt_sigreturn data8 0 // rt_sigsuspend diff -Nru a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S --- a/arch/ia64/kernel/gate.S Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/gate.S Fri Oct 17 23:12:58 2003 @@ -118,8 +118,7 @@ mov r10=-1 mov r8=ENOSYS - MCKINLEY_E9_WORKAROUND - br.ret.sptk.many b6 + FSYS_RETURN END(__kernel_syscall_via_epc) # define ARG0_OFF (16 + IA64_SIGFRAME_ARG0_OFFSET) diff -Nru a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S --- a/arch/ia64/kernel/head.S Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/head.S Fri Oct 17 23:12:58 2003 @@ -797,6 +797,25 @@ br.ret.sptk.many rp END(ia64_switch_mode_virt) +GLOBAL_ENTRY(ia64_delay_loop) + .prologue +{ nop 0 // work around GAS unwind info generation bug... + .save ar.lc,r2 + mov r2=ar.lc + .body + ;; + mov ar.lc=r32 +} + ;; + // force loop to be 32-byte aligned (GAS bug means we cannot use .align + // inside function body without corrupting unwind info). +{ nop 0 } +1: br.cloop.sptk.few 1b + ;; + mov ar.lc=r2 + br.ret.sptk.many rp +END(ia64_delay_loop) + #ifdef CONFIG_IA64_BRL_EMU /* diff -Nru a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c --- a/arch/ia64/kernel/ia64_ksyms.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/ia64_ksyms.c Fri Oct 17 23:12:58 2003 @@ -34,13 +34,8 @@ #include EXPORT_SYMBOL(probe_irq_mask); -#include #include -/* not coded yet?? EXPORT_SYMBOL(csum_ipv6_magic); */ -EXPORT_SYMBOL(csum_partial_copy_nocheck); -EXPORT_SYMBOL(csum_tcpudp_magic); -EXPORT_SYMBOL(ip_compute_csum); -EXPORT_SYMBOL(ip_fast_csum); +EXPORT_SYMBOL(ip_fast_csum); /* hand-coded assembly */ #include EXPORT_SYMBOL(__ia64_memcpy_fromio); @@ -58,9 +53,11 @@ EXPORT_SYMBOL(clear_page); #ifdef CONFIG_VIRTUAL_MEM_MAP +#include #include EXPORT_SYMBOL(vmalloc_end); EXPORT_SYMBOL(ia64_pfn_valid); +EXPORT_SYMBOL(max_low_pfn); /* defined by bootmem.c, but not exported by generic code */ #endif #include diff -Nru a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c --- a/arch/ia64/kernel/mca.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/mca.c Fri Oct 17 23:12:58 2003 @@ -81,8 +81,6 @@ u64 ia64_mca_sal_data_area[1356]; u64 ia64_tlb_functional; u64 ia64_os_mca_recovery_successful; -/* TODO: need to assign min-state structure to UC memory */ -u64 ia64_mca_min_state_save_info[MIN_STATE_AREA_SIZE] __attribute__((aligned(512))); static void ia64_mca_wakeup_ipi_wait(void); static void ia64_mca_wakeup(int cpu); static void ia64_mca_wakeup_all(void); @@ -466,26 +464,6 @@ #endif /* PLATFORM_MCA_HANDLERS */ /* - * routine to process and prepare to dump min_state_save - * information for debugging purposes. 
- */ -void -ia64_process_min_state_save (pal_min_state_area_t *pmss) -{ - int i, max = MIN_STATE_AREA_SIZE; - u64 *tpmss_ptr = (u64 *)pmss; - u64 *return_min_state_ptr = ia64_mca_min_state_save_info; - - for (i=0;ivalid.psi_static_struct) { spsi = (sal_processor_static_info_t *)p_data; - - /* copy interrupted context PAL min-state info */ - ia64_process_min_state_save(&spsi->min_state_area); /* Print branch register contents if valid */ if (spsi->valid.br) diff -Nru a/arch/ia64/kernel/mca_asm.S b/arch/ia64/kernel/mca_asm.S --- a/arch/ia64/kernel/mca_asm.S Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/mca_asm.S Fri Oct 17 23:12:58 2003 @@ -77,12 +77,11 @@ (p6) movl r10=IA64_MCA_SAME_CONTEXT; \ (p6) add _tmp=0x18,_tmp;; \ (p6) ld8 r9=[_tmp],0x10; \ -(p6) movl r22=ia64_mca_min_state_save_info;; \ +(p6) mov r22=r0;; \ (p7) ld8 r8=[_tmp],0x08;; \ (p7) ld8 r9=[_tmp],0x08;; \ (p7) ld8 r10=[_tmp],0x08;; \ -(p7) ld8 r22=[_tmp],0x08;; \ - DATA_VA_TO_PA(r22) +(p7) ld8 r22=[_tmp],0x08;; // now _tmp is pointing to SAL rtn save location @@ -97,7 +96,6 @@ .global ia64_init_stack .global ia64_mca_sal_data_area .global ia64_tlb_functional - .global ia64_mca_min_state_save_info .text .align 16 @@ -265,15 +263,15 @@ add r4=8,r2 // duplicate r2 in r4 add r6=2*8,r2 // duplicate r2 in r4 - mov r3=cr0 // cr.dcr - mov r5=cr1 // cr.itm - mov r7=cr2;; // cr.iva + mov r3=cr.dcr + mov r5=cr.itm + mov r7=cr.iva;; st8 [r2]=r3,8*8 st8 [r4]=r5,3*8 st8 [r6]=r7,3*8;; // 48 byte rements - mov r3=cr8;; // cr.pta + mov r3=cr.pta;; st8 [r2]=r3,8*8;; // 64 byte rements // if PSR.ic=0, reading interruption registers causes an illegal operation fault @@ -286,23 +284,23 @@ add r4=8,r2 // duplicate r2 in r4 add r6=2*8,r2 // duplicate r2 in r6 - mov r3=cr16 // cr.ipsr - mov r5=cr17 // cr.isr - mov r7=r0;; // cr.ida => cr18 (reserved) + mov r3=cr.ipsr + mov r5=cr.isr + mov r7=r0;; st8 [r2]=r3,3*8 st8 [r4]=r5,3*8 st8 [r6]=r7,3*8;; - mov r3=cr19 // cr.iip - mov r5=cr20 // cr.idtr - mov r7=cr21;; // cr.iitr + mov r3=cr.iip + mov r5=cr.ifa + mov r7=cr.itir;; st8 [r2]=r3,3*8 st8 [r4]=r5,3*8 st8 [r6]=r7,3*8;; - mov r3=cr22 // cr.iipa - mov r5=cr23 // cr.ifs - mov r7=cr24;; // cr.iim + mov r3=cr.iipa + mov r5=cr.ifs + mov r7=cr.iim;; st8 [r2]=r3,3*8 st8 [r4]=r5,3*8 st8 [r6]=r7,3*8;; @@ -311,104 +309,101 @@ st8 [r2]=r3,160;; // 160 byte rement SkipIntrRegs: - st8 [r2]=r0,168 // another 168 byte . + st8 [r2]=r0,152;; // another 152 byte . - mov r3=cr66;; // cr.lid - st8 [r2]=r3,40 // 40 byte rement + add r4=8,r2 // duplicate r2 in r4 + add r6=2*8,r2 // duplicate r2 in r6 - mov r3=cr71;; // cr.ivr - st8 [r2]=r3,8 - - mov r3=cr72;; // cr.tpr - st8 [r2]=r3,24 // 24 byte increment - - mov r3=r0;; // cr.eoi => cr75 - st8 [r2]=r3,168 // 168 byte inc. - - mov r3=r0;; // cr.irr0 => cr96 - st8 [r2]=r3,16 // 16 byte inc. - - mov r3=r0;; // cr.irr1 => cr98 - st8 [r2]=r3,16 // 16 byte inc. - - mov r3=r0;; // cr.irr2 => cr100 - st8 [r2]=r3,16 // 16 byte inc - - mov r3=r0;; // cr.irr3 => cr100 - st8 [r2]=r3,16 // 16b inc. - - mov r3=r0;; // cr.itv => cr114 - st8 [r2]=r3,16 // 16 byte inc. 
+ mov r3=cr.lid +// mov r5=cr.ivr // cr.ivr, don't read it + mov r7=cr.tpr;; + st8 [r2]=r3,3*8 + st8 [r4]=r5,3*8 + st8 [r6]=r7,3*8;; - mov r3=r0;; // cr.pmv => cr116 - st8 [r2]=r3,8 + mov r3=r0 // cr.eoi => cr67 + mov r5=r0 // cr.irr0 => cr68 + mov r7=r0;; // cr.irr1 => cr69 + st8 [r2]=r3,3*8 + st8 [r4]=r5,3*8 + st8 [r6]=r7,3*8;; - mov r3=r0;; // cr.lrr0 => cr117 - st8 [r2]=r3,8 + mov r3=r0 // cr.irr2 => cr70 + mov r5=r0 // cr.irr3 => cr71 + mov r7=cr.itv;; + st8 [r2]=r3,3*8 + st8 [r4]=r5,3*8 + st8 [r6]=r7,3*8;; - mov r3=r0;; // cr.lrr1 => cr118 - st8 [r2]=r3,8 + mov r3=cr.pmv + mov r5=cr.cmcv;; + st8 [r2]=r3,7*8 + st8 [r4]=r5,7*8;; + + mov r3=r0 // cr.lrr0 => cr80 + mov r5=r0;; // cr.lrr1 => cr81 + st8 [r2]=r3,23*8 + st8 [r4]=r5,23*8;; - mov r3=r0;; // cr.cmcv => cr119 - st8 [r2]=r3,8*10;; + adds r2=25*8,r2;; cSaveARs: // save ARs add r4=8,r2 // duplicate r2 in r4 add r6=2*8,r2 // duplicate r2 in r6 - mov r3=ar0 // ar.kro - mov r5=ar1 // ar.kr1 - mov r7=ar2;; // ar.kr2 + mov r3=ar.k0 + mov r5=ar.k1 + mov r7=ar.k2;; st8 [r2]=r3,3*8 st8 [r4]=r5,3*8 st8 [r6]=r7,3*8;; - mov r3=ar3 // ar.kr3 - mov r5=ar4 // ar.kr4 - mov r7=ar5;; // ar.kr5 + mov r3=ar.k3 + mov r5=ar.k4 + mov r7=ar.k5;; st8 [r2]=r3,3*8 st8 [r4]=r5,3*8 st8 [r6]=r7,3*8;; - mov r3=ar6 // ar.kr6 - mov r5=ar7 // ar.kr7 + mov r3=ar.k6 + mov r5=ar.k7 mov r7=r0;; // ar.kr8 st8 [r2]=r3,10*8 st8 [r4]=r5,10*8 st8 [r6]=r7,10*8;; // rement by 72 bytes - mov r3=ar16 // ar.rsc - mov ar16=r0 // put RSE in enforced lazy mode - mov r5=ar17 // ar.bsp + mov r3=ar.rsc + mov ar.rsc=r0 // put RSE in enforced lazy mode + mov r5=ar.bsp ;; - mov r7=ar18;; // ar.bspstore + mov r7=ar.bspstore;; st8 [r2]=r3,3*8 st8 [r4]=r5,3*8 st8 [r6]=r7,3*8;; - mov r3=ar19;; // ar.rnat + mov r3=ar.rnat;; st8 [r2]=r3,8*13 // increment by 13x8 bytes - mov r3=ar32;; // ar.ccv + mov r3=ar.ccv;; st8 [r2]=r3,8*4 - mov r3=ar36;; // ar.unat + mov r3=ar.unat;; st8 [r2]=r3,8*4 - mov r3=ar40;; // ar.fpsr + mov r3=ar.fpsr;; st8 [r2]=r3,8*4 - mov r3=ar44;; // ar.itc + mov r3=ar.itc;; st8 [r2]=r3,160 // 160 - mov r3=ar64;; // ar.pfs + mov r3=ar.pfs;; st8 [r2]=r3,8 - mov r3=ar65;; // ar.lc + mov r3=ar.lc;; st8 [r2]=r3,8 - mov r3=ar66;; // ar.ec + mov r3=ar.ec;; st8 [r2]=r3 add r2=8*62,r2 //padding @@ -417,7 +412,8 @@ movl r4=0x00;; cStRR: - mov r3=rr[r4];; + dep.z r5=r4,61,3;; + mov r3=rr[r5];; st8 [r2]=r3,8 add r4=1,r4 br.cloop.sptk.few cStRR @@ -501,12 +497,12 @@ ld8 r3=[r2],8*8 ld8 r5=[r4],3*8 ld8 r7=[r6],3*8;; // 48 byte increments - mov cr0=r3 // cr.dcr - mov cr1=r5 // cr.itm - mov cr2=r7;; // cr.iva + mov cr.dcr=r3 + mov cr.itm=r5 + mov cr.iva=r7;; ld8 r3=[r2],8*8;; // 64 byte increments -// mov cr8=r3 // cr.pta +// mov cr.pta=r3 // if PSR.ic=1, reading interruption registers causes an illegal operation fault @@ -523,64 +519,66 @@ ld8 r3=[r2],3*8 ld8 r5=[r4],3*8 ld8 r7=[r6],3*8;; - mov cr16=r3 // cr.ipsr - mov cr17=r5 // cr.isr is read only -// mov cr18=r7;; // cr.ida (reserved - don't restore) + mov cr.ipsr=r3 +// mov cr.isr=r5 // cr.isr is read only ld8 r3=[r2],3*8 ld8 r5=[r4],3*8 ld8 r7=[r6],3*8;; - mov cr19=r3 // cr.iip - mov cr20=r5 // cr.idtr - mov cr21=r7;; // cr.iitr + mov cr.iip=r3 + mov cr.ifa=r5 + mov cr.itir=r7;; ld8 r3=[r2],3*8 ld8 r5=[r4],3*8 ld8 r7=[r6],3*8;; - mov cr22=r3 // cr.iipa - mov cr23=r5 // cr.ifs - mov cr24=r7 // cr.iim + mov cr.iipa=r3 + mov cr.ifs=r5 + mov cr.iim=r7 ld8 r3=[r2],160;; // 160 byte increment - mov cr25=r3 // cr.iha + mov cr.iha=r3 rSkipIntrRegs: - ld8 r3=[r2],168;; // another 168 byte inc. 
- - ld8 r3=[r2],40;; // 40 byte increment - mov cr66=r3 // cr.lid - - ld8 r3=[r2],8;; -// mov cr71=r3 // cr.ivr is read only - ld8 r3=[r2],24;; // 24 byte increment - mov cr72=r3 // cr.tpr - - ld8 r3=[r2],168;; // 168 byte inc. -// mov cr75=r3 // cr.eoi + ld8 r3=[r2],152;; // another 152 byte inc. - ld8 r3=[r2],16;; // 16 byte inc. -// mov cr96=r3 // cr.irr0 is read only + add r4=8,r2 // duplicate r2 in r4 + add r6=2*8,r2;; // duplicate r2 in r6 - ld8 r3=[r2],16;; // 16 byte inc. -// mov cr98=r3 // cr.irr1 is read only + ld8 r3=[r2],8*3 + ld8 r5=[r4],8*3 + ld8 r7=[r6],8*3;; + mov cr.lid=r3 +// mov cr.ivr=r5 // cr.ivr is read only + mov cr.tpr=r7;; + + ld8 r3=[r2],8*3 + ld8 r5=[r4],8*3 + ld8 r7=[r6],8*3;; +// mov cr.eoi=r3 +// mov cr.irr0=r5 // cr.irr0 is read only +// mov cr.irr1=r7;; // cr.irr1 is read only + + ld8 r3=[r2],8*3 + ld8 r5=[r4],8*3 + ld8 r7=[r6],8*3;; +// mov cr.irr2=r3 // cr.irr2 is read only +// mov cr.irr3=r5 // cr.irr3 is read only + mov cr.itv=r7;; + + ld8 r3=[r2],8*7 + ld8 r5=[r4],8*7;; + mov cr.pmv=r3 + mov cr.cmcv=r5;; + + ld8 r3=[r2],8*23 + ld8 r5=[r4],8*23;; + adds r2=8*23,r2 + adds r4=8*23,r4;; +// mov cr.lrr0=r3 +// mov cr.lrr1=r5 - ld8 r3=[r2],16;; // 16 byte inc -// mov cr100=r3 // cr.irr2 is read only - - ld8 r3=[r2],16;; // 16b inc. -// mov cr102=r3 // cr.irr3 is read only - - ld8 r3=[r2],16;; // 16 byte inc. -// mov cr114=r3 // cr.itv - - ld8 r3=[r2],8;; -// mov cr116=r3 // cr.pmv - ld8 r3=[r2],8;; -// mov cr117=r3 // cr.lrr0 - ld8 r3=[r2],8;; -// mov cr118=r3 // cr.lrr1 - ld8 r3=[r2],8*10;; -// mov cr119=r3 // cr.cmcv + adds r2=8*2,r2;; restore_ARs: add r4=8,r2 // duplicate r2 in r4 @@ -589,67 +587,67 @@ ld8 r3=[r2],3*8 ld8 r5=[r4],3*8 ld8 r7=[r6],3*8;; - mov ar0=r3 // ar.kro - mov ar1=r5 // ar.kr1 - mov ar2=r7;; // ar.kr2 + mov ar.k0=r3 + mov ar.k1=r5 + mov ar.k2=r7;; ld8 r3=[r2],3*8 ld8 r5=[r4],3*8 ld8 r7=[r6],3*8;; - mov ar3=r3 // ar.kr3 - mov ar4=r5 // ar.kr4 - mov ar5=r7;; // ar.kr5 + mov ar.k3=r3 + mov ar.k4=r5 + mov ar.k5=r7;; ld8 r3=[r2],10*8 ld8 r5=[r4],10*8 ld8 r7=[r6],10*8;; - mov ar6=r3 // ar.kr6 - mov ar7=r5 // ar.kr7 -// mov ar8=r6 // ar.kr8 + mov ar.k6=r3 + mov ar.k7=r5 ;; ld8 r3=[r2],3*8 ld8 r5=[r4],3*8 ld8 r7=[r6],3*8;; -// mov ar16=r3 // ar.rsc -// mov ar17=r5 // ar.bsp is read only - mov ar16=r0 // make sure that RSE is in enforced lazy mode +// mov ar.rsc=r3 +// mov ar.bsp=r5 // ar.bsp is read only + mov ar.rsc=r0 // make sure that RSE is in enforced lazy mode ;; - mov ar18=r7;; // ar.bspstore + mov ar.bspstore=r7;; ld8 r9=[r2],8*13;; - mov ar19=r9 // ar.rnat + mov ar.rnat=r9 - mov ar16=r3 // ar.rsc + mov ar.rsc=r3 ld8 r3=[r2],8*4;; - mov ar32=r3 // ar.ccv + mov ar.ccv=r3 ld8 r3=[r2],8*4;; - mov ar36=r3 // ar.unat + mov ar.unat=r3 ld8 r3=[r2],8*4;; - mov ar40=r3 // ar.fpsr + mov ar.fpsr=r3 ld8 r3=[r2],160;; // 160 -// mov ar44=r3 // ar.itc +// mov ar.itc=r3 ld8 r3=[r2],8;; - mov ar64=r3 // ar.pfs + mov ar.pfs=r3 ld8 r3=[r2],8;; - mov ar65=r3 // ar.lc + mov ar.lc=r3 ld8 r3=[r2];; - mov ar66=r3 // ar.ec + mov ar.ec=r3 add r2=8*62,r2;; // padding restore_RRs: mov r5=ar.lc mov ar.lc=0x08-1 - movl r4=0x00 + movl r4=0x00;; cStRRr: + dep.z r7=r4,61,3 ld8 r3=[r2],8;; -// mov rr[r4]=r3 // what are its access previledges? + mov rr[r7]=r3 // what are its access previledges? 
add r4=1,r4 br.cloop.sptk.few cStRRr ;; diff -Nru a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c --- a/arch/ia64/kernel/patch.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/patch.c Fri Oct 17 23:12:58 2003 @@ -130,9 +130,11 @@ while (offp < (s32 *) end) { wp = (u64 *) ia64_imva((char *) offp + *offp); - wp[0] = 0x0000000100000000; + wp[0] = 0x0000000100000000; /* nop.m 0; nop.i 0; nop.i 0 */ wp[1] = 0x0004000000000200; - ia64_fc(wp); + wp[2] = 0x0000000100000011; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */ + wp[3] = 0x0084006880000200; + ia64_fc(wp); ia64_fc(wp + 2); ++offp; } ia64_sync_i(); diff -Nru a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c --- a/arch/ia64/kernel/perfmon.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/perfmon.c Fri Oct 17 23:12:58 2003 @@ -140,7 +140,7 @@ * in UP: * - we need to protect against PMU overflow interrupts (local_irq_disable) * - * spin_lock_irqsave()/spin_unlock_irqrestore(): + * spin_lock_irqsave()/spin_lock_irqrestore(): * in SMP: local_irq_disable + spin_lock * in UP : local_irq_disable * @@ -254,7 +254,6 @@ unsigned long seed; /* seed for random-number generator */ unsigned long mask; /* mask for random-number generator */ unsigned int flags; /* notify/do not notify */ - int next_reset_type;/* PFM_PMD_NO_RESET, PFM_PMD_LONG_RESET, PFM_PMD_SHORT_RESET */ unsigned long eventid; /* overflow event identifier */ } pfm_counter_t; @@ -267,10 +266,10 @@ unsigned int using_dbreg:1; /* using range restrictions (debug registers) */ unsigned int is_sampling:1; /* true if using a custom format */ unsigned int excl_idle:1; /* exclude idle task in system wide session */ - unsigned int unsecure:1; /* exclude idle task in system wide session */ unsigned int going_zombie:1; /* context is zombie (MASKED+blocking) */ unsigned int trap_reason:2; /* reason for going into pfm_handle_work() */ unsigned int no_msg:1; /* no message sent on overflow */ + unsigned int can_restart:1; /* allowed to issue a PFM_RESTART */ unsigned int reserved:22; } pfm_context_flags_t; @@ -356,10 +355,10 @@ #define ctx_fl_using_dbreg ctx_flags.using_dbreg #define ctx_fl_is_sampling ctx_flags.is_sampling #define ctx_fl_excl_idle ctx_flags.excl_idle -#define ctx_fl_unsecure ctx_flags.unsecure #define ctx_fl_going_zombie ctx_flags.going_zombie #define ctx_fl_trap_reason ctx_flags.trap_reason #define ctx_fl_no_msg ctx_flags.no_msg +#define ctx_fl_can_restart ctx_flags.can_restart #define PFM_SET_WORK_PENDING(t, v) do { (t)->thread.pfm_needs_checking = v; } while(0); #define PFM_GET_WORK_PENDING(t) (t)->thread.pfm_needs_checking @@ -493,12 +492,11 @@ typedef struct { unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */ + unsigned long pfm_replay_ovfl_intr_count; /* keep track of replayed ovfl interrupts */ unsigned long pfm_ovfl_intr_count; /* keep track of ovfl interrupts */ unsigned long pfm_ovfl_intr_cycles; /* cycles spent processing ovfl interrupts */ unsigned long pfm_ovfl_intr_cycles_min; /* min cycles spent processing ovfl interrupts */ unsigned long pfm_ovfl_intr_cycles_max; /* max cycles spent processing ovfl interrupts */ - unsigned long pfm_sysupdt_count; - unsigned long pfm_sysupdt_cycles; unsigned long pfm_smpl_handler_calls; unsigned long pfm_smpl_handler_cycles; char pad[SMP_CACHE_BYTES] ____cacheline_aligned; @@ -513,10 +511,8 @@ static struct proc_dir_entry *perfmon_dir; static pfm_uuid_t pfm_null_uuid = {0,}; -static spinlock_t pfm_smpl_fmt_lock; -static pfm_buffer_fmt_t *pfm_buffer_fmt_list; -#define LOCK_BUF_FMT_LIST() 
spin_lock(&pfm_smpl_fmt_lock) -#define UNLOCK_BUF_FMT_LIST() spin_unlock(&pfm_smpl_fmt_lock) +static spinlock_t pfm_buffer_fmt_lock; +static LIST_HEAD(pfm_buffer_fmt_list); /* sysctl() controls */ static pfm_sysctl_t pfm_sysctl; @@ -544,14 +540,8 @@ close: pfm_vm_close }; -#define pfm_wait_task_inactive(t) wait_task_inactive(t) #define pfm_get_cpu_var(v) __ia64_per_cpu_var(v) #define pfm_get_cpu_data(a,b) per_cpu(a, b) -typedef irqreturn_t pfm_irq_handler_t; -#define PFM_IRQ_HANDLER_RET(v) do { \ - put_cpu_no_resched(); \ - return IRQ_HANDLED; \ - } while(0); static inline void pfm_put_task(struct task_struct *task) @@ -628,7 +618,6 @@ .get_sb = pfmfs_get_sb, .kill_sb = kill_anon_super, }; - DEFINE_PER_CPU(unsigned long, pfm_syst_info); DEFINE_PER_CPU(struct task_struct *, pmu_owner); DEFINE_PER_CPU(pfm_context_t *, pmu_ctx); @@ -734,12 +723,14 @@ static inline void pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val) { - ctx->ctx_pmds[i].val = val & ~pmu_conf.ovfl_val; + unsigned long ovfl_val = pmu_conf.ovfl_val; + + ctx->ctx_pmds[i].val = val & ~ovfl_val; /* * writing to unimplemented part is ignore, so we do not need to * mask off top part */ - ia64_set_pmd(i, val & pmu_conf.ovfl_val); + ia64_set_pmd(i, val & ovfl_val); } static pfm_msg_t * @@ -870,11 +861,12 @@ { pfm_context_t *ctx = PFM_GET_CTX(task); struct thread_struct *th = &task->thread; - unsigned long mask, val; + unsigned long mask, val, ovfl_mask; int i; - DPRINT(("[%d] masking monitoring for [%d]\n", current->pid, task->pid)); + DPRINT_ovfl(("[%d] masking monitoring for [%d]\n", current->pid, task->pid)); + ovfl_mask = pmu_conf.ovfl_val; /* * monitoring can only be masked as a result of a valid * counter overflow. In UP, it means that the PMU still @@ -904,14 +896,14 @@ /* * we rebuild the full 64 bit value of the counter */ - ctx->ctx_pmds[i].val += (val & pmu_conf.ovfl_val); + ctx->ctx_pmds[i].val += (val & ovfl_mask); } else { ctx->ctx_pmds[i].val = val; } - DPRINT(("pmd[%d]=0x%lx hw_pmd=0x%lx\n", + DPRINT_ovfl(("pmd[%d]=0x%lx hw_pmd=0x%lx\n", i, ctx->ctx_pmds[i].val, - val & pmu_conf.ovfl_val)); + val & ovfl_mask)); } /* * mask monitoring by setting the privilege level to 0 @@ -926,6 +918,7 @@ if ((mask & 0x1) == 0UL) continue; ia64_set_pmc(i, th->pmcs[i] & ~0xfUL); th->pmcs[i] &= ~0xfUL; + DPRINT_ovfl(("pmc[%d]=0x%lx\n", i, th->pmcs[i])); } /* * make all of this visible @@ -943,11 +936,12 @@ { pfm_context_t *ctx = PFM_GET_CTX(task); struct thread_struct *th = &task->thread; - unsigned long mask; + unsigned long mask, ovfl_mask; unsigned long psr, val; int i, is_system; is_system = ctx->ctx_fl_system; + ovfl_mask = pmu_conf.ovfl_val; if (task != current) { printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task->pid, current->pid); @@ -989,8 +983,8 @@ * we split the 64bit value according to * counter width */ - val = ctx->ctx_pmds[i].val & pmu_conf.ovfl_val; - ctx->ctx_pmds[i].val &= ~pmu_conf.ovfl_val; + val = ctx->ctx_pmds[i].val & ovfl_mask; + ctx->ctx_pmds[i].val &= ~ovfl_mask; } else { val = ctx->ctx_pmds[i].val; } @@ -1206,12 +1200,36 @@ return ret; } +static pfm_buffer_fmt_t * +__pfm_find_buffer_fmt(pfm_uuid_t uuid) +{ + struct list_head * pos; + pfm_buffer_fmt_t * entry; - + list_for_each(pos, &pfm_buffer_fmt_list) { + entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list); + if (pfm_uuid_cmp(uuid, entry->fmt_uuid) == 0) + return entry; + } + return NULL; +} + +/* + * find a buffer format based on its uuid + */ +static pfm_buffer_fmt_t * +pfm_find_buffer_fmt(pfm_uuid_t uuid) +{ 
+ pfm_buffer_fmt_t * fmt; + spin_lock(&pfm_buffer_fmt_lock); + fmt = __pfm_find_buffer_fmt(uuid); + spin_unlock(&pfm_buffer_fmt_lock); + return fmt; +} + int pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt) { - pfm_buffer_fmt_t *p; int ret = 0; /* some sanity checks */ @@ -1224,80 +1242,44 @@ * XXX: need check validity of fmt_arg_size */ - LOCK_BUF_FMT_LIST(); - p = pfm_buffer_fmt_list; - + spin_lock(&pfm_buffer_fmt_lock); - while (p) { - if (pfm_uuid_cmp(fmt->fmt_uuid, p->fmt_uuid) == 0) break; - p = p->fmt_next; - } - - if (p) { + if (__pfm_find_buffer_fmt(fmt->fmt_uuid)) { printk(KERN_ERR "perfmon: duplicate sampling format: %s\n", fmt->fmt_name); ret = -EBUSY; - } else { - fmt->fmt_prev = NULL; - fmt->fmt_next = pfm_buffer_fmt_list; - pfm_buffer_fmt_list = fmt; - printk(KERN_ERR "perfmon: added sampling format %s\n", fmt->fmt_name); - } - UNLOCK_BUF_FMT_LIST(); + goto out; + } + list_add(&fmt->fmt_list, &pfm_buffer_fmt_list); + printk(KERN_INFO "perfmon: added sampling format %s\n", fmt->fmt_name); - return ret; +out: + spin_unlock(&pfm_buffer_fmt_lock); + return ret; } int pfm_unregister_buffer_fmt(pfm_uuid_t uuid) { - pfm_buffer_fmt_t *p; + pfm_buffer_fmt_t *fmt; int ret = 0; - LOCK_BUF_FMT_LIST(); - p = pfm_buffer_fmt_list; - while (p) { - if (memcmp(uuid, p->fmt_uuid, sizeof(pfm_uuid_t)) == 0) break; - p = p->fmt_next; - } - if (p) { - if (p->fmt_prev) - p->fmt_prev->fmt_next = p->fmt_next; - else - pfm_buffer_fmt_list = p->fmt_next; - - if (p->fmt_next) - p->fmt_next->fmt_prev = p->fmt_prev; + spin_lock(&pfm_buffer_fmt_lock); - printk(KERN_ERR "perfmon: removed sampling format: %s\n", p->fmt_name); - p->fmt_next = p->fmt_prev = NULL; - } else { + fmt = __pfm_find_buffer_fmt(uuid); + if (!fmt) { printk(KERN_ERR "perfmon: cannot unregister format, not found\n"); ret = -EINVAL; + goto out; } - UNLOCK_BUF_FMT_LIST(); + list_del_init(&fmt->fmt_list); + printk(KERN_INFO "perfmon: removed sampling format: %s\n", fmt->fmt_name); +out: + spin_unlock(&pfm_buffer_fmt_lock); return ret; } -/* - * find a buffer format based on its uuid - */ -static pfm_buffer_fmt_t * -pfm_find_buffer_fmt(pfm_uuid_t uuid, int nolock) -{ - pfm_buffer_fmt_t *p; - - LOCK_BUF_FMT_LIST(); - for (p = pfm_buffer_fmt_list; p ; p = p->fmt_next) { - if (pfm_uuid_cmp(uuid, p->fmt_uuid) == 0) break; - } - - UNLOCK_BUF_FMT_LIST(); - - return p; -} - static int pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu) { @@ -2113,7 +2095,7 @@ return 1; } static struct dentry_operations pfmfs_dentry_operations = { - .d_delete = pfmfs_delete_dentry, + .d_delete = pfmfs_delete_dentry, }; @@ -2420,7 +2402,7 @@ #define PFM_CTXARG_BUF_ARG(a) (pfm_buffer_fmt_t *)(a+1) /* invoke and lock buffer format, if found */ - fmt = pfm_find_buffer_fmt(arg->ctx_smpl_buf_id, 0); + fmt = pfm_find_buffer_fmt(arg->ctx_smpl_buf_id); if (fmt == NULL) { DPRINT(("[%d] cannot find buffer format\n", task->pid)); return -EINVAL; @@ -2528,8 +2510,7 @@ if (!pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) return 0; - /* no buffer locking here, will be called again */ - fmt = pfm_find_buffer_fmt(req->ctx_smpl_buf_id, 1); + fmt = pfm_find_buffer_fmt(req->ctx_smpl_buf_id); if (fmt == NULL) { DPRINT(("cannot find buffer format\n")); return -EINVAL; @@ -2588,7 +2569,7 @@ /* * make sure the task is off any CPU */ - pfm_wait_task_inactive(task); + wait_task_inactive(task); /* more to come... */ @@ -2679,7 +2660,6 @@ */ ctx->ctx_fl_block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0; ctx->ctx_fl_system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 
1: 0; - ctx->ctx_fl_unsecure = (ctx_flags & PFM_FL_UNSECURE) ? 1: 0; ctx->ctx_fl_is_sampling = ctx->ctx_buf_fmt ? 1 : 0; /* assume record() is defined */ ctx->ctx_fl_no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0; /* @@ -2705,13 +2685,12 @@ init_waitqueue_head(&ctx->ctx_msgq_wait); init_waitqueue_head(&ctx->ctx_zombieq); - DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d unsecure=%d no_msg=%d ctx_fd=%d \n", + DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d no_msg=%d ctx_fd=%d \n", ctx, ctx_flags, ctx->ctx_fl_system, ctx->ctx_fl_block, ctx->ctx_fl_excl_idle, - ctx->ctx_fl_unsecure, ctx->ctx_fl_no_msg, ctx->ctx_fd)); @@ -2755,14 +2734,12 @@ } static void -pfm_reset_regs_masked(pfm_context_t *ctx, unsigned long *ovfl_regs, int flag) +pfm_reset_regs_masked(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset) { unsigned long mask = ovfl_regs[0]; unsigned long reset_others = 0UL; unsigned long val; - int i, is_long_reset = (flag == PFM_PMD_LONG_RESET); - - DPRINT_ovfl(("ovfl_regs=0x%lx flag=%d\n", ovfl_regs[0], flag)); + int i; /* * now restore reset value on sampling overflowed counters @@ -2793,19 +2770,17 @@ } static void -pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int flag) +pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset) { unsigned long mask = ovfl_regs[0]; unsigned long reset_others = 0UL; unsigned long val; - int i, is_long_reset = (flag == PFM_PMD_LONG_RESET); - - DPRINT_ovfl(("ovfl_regs=0x%lx flag=%d\n", ovfl_regs[0], flag)); + int i; - if (flag == PFM_PMD_NO_RESET) return; + DPRINT_ovfl(("ovfl_regs=0x%lx is_long_reset=%d\n", ovfl_regs[0], is_long_reset)); if (ctx->ctx_state == PFM_CTX_MASKED) { - pfm_reset_regs_masked(ctx, ovfl_regs, flag); + pfm_reset_regs_masked(ctx, ovfl_regs, is_long_reset); return; } @@ -3084,7 +3059,7 @@ { struct thread_struct *thread = NULL; pfarg_reg_t *req = (pfarg_reg_t *)arg; - unsigned long value, hw_value; + unsigned long value, hw_value, ovfl_mask; unsigned int cnum; int i, can_access_pmu = 0, state; int is_counting, is_loaded, is_system; @@ -3094,6 +3069,7 @@ state = ctx->ctx_state; is_loaded = state == PFM_CTX_LOADED ? 
1 : 0; is_system = ctx->ctx_fl_system; + ovfl_mask = pmu_conf.ovfl_val; if (state == PFM_CTX_TERMINATED || state == PFM_CTX_ZOMBIE) return -EINVAL; @@ -3162,22 +3138,21 @@ * when context is load we use the split value */ if (is_loaded) { - hw_value = value & pmu_conf.ovfl_val; - value = value & ~pmu_conf.ovfl_val; + hw_value = value & ovfl_mask; + value = value & ~ovfl_mask; } - - /* - * update sampling periods - */ - ctx->ctx_pmds[cnum].long_reset = req->reg_long_reset; - ctx->ctx_pmds[cnum].short_reset = req->reg_short_reset; - - /* - * update randomization parameters - */ - ctx->ctx_pmds[cnum].seed = req->reg_random_seed; - ctx->ctx_pmds[cnum].mask = req->reg_random_mask; } + /* + * update reset values (not just for counters) + */ + ctx->ctx_pmds[cnum].long_reset = req->reg_long_reset; + ctx->ctx_pmds[cnum].short_reset = req->reg_short_reset; + + /* + * update randomization parameters (not just for counters) + */ + ctx->ctx_pmds[cnum].seed = req->reg_random_seed; + ctx->ctx_pmds[cnum].mask = req->reg_random_mask; /* * update context value @@ -3284,7 +3259,7 @@ pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) { struct thread_struct *thread = NULL; - unsigned long val = 0UL, lval ; + unsigned long val = 0UL, lval, ovfl_mask; pfarg_reg_t *req = (pfarg_reg_t *)arg; unsigned int cnum, reg_flags = 0; int i, can_access_pmu = 0, state; @@ -3299,6 +3274,7 @@ state = ctx->ctx_state; is_loaded = state == PFM_CTX_LOADED ? 1 : 0; is_system = ctx->ctx_fl_system; + ovfl_mask = pmu_conf.ovfl_val; if (state == PFM_CTX_ZOMBIE) return -EINVAL; @@ -3368,7 +3344,7 @@ /* * XXX: need to check for overflow when loaded */ - val &= pmu_conf.ovfl_val; + val &= ovfl_mask; val += ctx->ctx_pmds[cnum].val; lval = ctx->ctx_pmds[cnum].lval; @@ -3672,22 +3648,48 @@ */ ctx->ctx_state = PFM_CTX_LOADED; + /* + * XXX: not really useful for self monitoring + */ + ctx->ctx_fl_can_restart = 0; + return 0; } - /* restart another task */ + + /* + * restart another task + */ + + /* + * When PFM_CTX_MASKED, we cannot issue a restart before the previous + * one is seen by the task. + */ + if (state == PFM_CTX_MASKED) { + if (ctx->ctx_fl_can_restart == 0) return -EINVAL; + /* + * will prevent subsequent restart before this one is + * seen by other task + */ + ctx->ctx_fl_can_restart = 0; + } /* - * if blocking, then post the semaphore. + * if blocking, then post the semaphore is PFM_CTX_MASKED, i.e. + * the task is blocked or on its way to block. That's the normal + * restart path. If the monitoring is not masked, then the task + * can be actively monitoring and we cannot directly intervene. + * Therefore we use the trap mechanism to catch the task and + * force it to reset the buffer/reset PMDs. + * * if non-blocking, then we ensure that the task will go into * pfm_handle_work() before returning to user mode. + * * We cannot explicitely reset another task, it MUST always * be done by the task itself. This works for system wide because - * the tool that is controlling the session is doing "self-monitoring". - * - * XXX: what if the task never goes back to user? - * + * the tool that is controlling the session is logically doing + * "self-monitoring". 
*/ - if (CTX_OVFL_NOBLOCK(ctx) == 0) { + if (CTX_OVFL_NOBLOCK(ctx) == 0 && state == PFM_CTX_MASKED) { DPRINT(("unblocking [%d] \n", task->pid)); up(&ctx->ctx_restart_sem); } else { @@ -3725,6 +3727,9 @@ return 0; } +/* + * arg can be NULL and count can be zero for this function + */ static int pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) { @@ -3783,21 +3788,22 @@ /* * check for debug registers in system wide mode * - * We make the reservation even when context is not loaded - * to make sure we get our slot. Note that the PFM_LOAD_CONTEXT - * may still fail if the task has DBG_VALID set. + * If though a check is done in pfm_context_load(), + * we must repeat it here, in case the registers are + * written after the context is loaded */ - LOCK_PFS(); + if (is_loaded) { + LOCK_PFS(); - if (first_time && is_system) { - if (pfm_sessions.pfs_ptrace_use_dbregs) - ret = -EBUSY; - else - pfm_sessions.pfs_sys_use_dbregs++; + if (first_time && is_system) { + if (pfm_sessions.pfs_ptrace_use_dbregs) + ret = -EBUSY; + else + pfm_sessions.pfs_sys_use_dbregs++; + } + UNLOCK_PFS(); } - UNLOCK_PFS(); - if (ret != 0) return ret; /* @@ -4158,7 +4164,7 @@ unsigned long *pmcs_source, *pmds_source; int the_cpu; int ret = 0; - int state, is_system; + int state, is_system, set_dbregs = 0; state = ctx->ctx_state; is_system = ctx->ctx_fl_system; @@ -4173,7 +4179,7 @@ return -EINVAL; } - DPRINT(("load_pid [%d]\n", req->load_pid)); + DPRINT(("load_pid [%d] using_dbreg=%d\n", req->load_pid, ctx->ctx_fl_using_dbreg)); if (CTX_OVFL_NOBLOCK(ctx) == 0 && req->load_pid == current->pid) { DPRINT(("cannot use blocking mode on self for [%d]\n", current->pid)); @@ -4200,15 +4206,33 @@ thread = &task->thread; - ret = -EBUSY; - + ret = 0; /* * cannot load a context which is using range restrictions, * into a task that is being debugged. */ - if (ctx->ctx_fl_using_dbreg && (thread->flags & IA64_THREAD_DBG_VALID)) { - DPRINT(("load_pid [%d] task is debugged, cannot load range restrictions\n", req->load_pid)); - goto error; + if (ctx->ctx_fl_using_dbreg) { + if (thread->flags & IA64_THREAD_DBG_VALID) { + ret = -EBUSY; + DPRINT(("load_pid [%d] task is debugged, cannot load range restrictions\n", req->load_pid)); + goto error; + } + LOCK_PFS(); + + if (is_system) { + if (pfm_sessions.pfs_ptrace_use_dbregs) { + DPRINT(("cannot load [%d] dbregs in use\n", task->pid)); + ret = -EBUSY; + } else { + pfm_sessions.pfs_sys_use_dbregs++; + DPRINT(("load [%d] increased sys_use_dbreg=%u\n", task->pid, pfm_sessions.pfs_sys_use_dbregs)); + set_dbregs = 1; + } + } + + UNLOCK_PFS(); + + if (ret) goto error; } /* @@ -4228,13 +4252,13 @@ */ the_cpu = ctx->ctx_cpu = smp_processor_id(); + ret = -EBUSY; /* * now reserve the session */ ret = pfm_reserve_session(current, is_system, the_cpu); if (ret) goto error; - ret = -EBUSY; /* * task is necessarily stopped at this point. 
* @@ -4342,11 +4366,6 @@ /* initial saved psr (stopped) */ ctx->ctx_saved_psr_up = 0UL; ia64_psr(regs)->up = ia64_psr(regs)->pp = 0; - - if (ctx->ctx_fl_unsecure) { - ia64_psr(regs)->sp = 0; - DPRINT(("context unsecured for [%d]\n", task->pid)); - } } ret = 0; @@ -4355,6 +4374,14 @@ if (ret) pfm_unreserve_session(ctx, ctx->ctx_fl_system, the_cpu); error: /* + * we must undo the dbregs setting (for system-wide) + */ + if (ret && set_dbregs) { + LOCK_PFS(); + pfm_sessions.pfs_sys_use_dbregs--; + UNLOCK_PFS(); + } + /* * release task, there is now a link with the context */ if (is_system == 0 && task != current) { @@ -4455,7 +4482,7 @@ */ tregs = task == current ? regs : ia64_task_regs(task); - if (task == current || ctx->ctx_fl_unsecure) { + if (task == current) { /* * cancel user level control */ @@ -4493,7 +4520,10 @@ ctx->ctx_task = NULL; PFM_SET_WORK_PENDING(task, 0); - ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE; + + ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE; + ctx->ctx_fl_can_restart = 0; + ctx->ctx_fl_going_zombie = 0; DPRINT(("disconnected [%d] from context\n", task->pid)); @@ -4686,7 +4716,7 @@ UNPROTECT_CTX(ctx, flags); - pfm_wait_task_inactive(task); + wait_task_inactive(task); PROTECT_CTX(ctx, flags); @@ -4725,7 +4755,8 @@ PFM_CMD_IDX(cmd), PFM_CMD_IS_VALID(cmd), PFM_CMD_NARG(cmd), - PFM_CMD_ARG_SIZE(cmd), count)); + PFM_CMD_ARG_SIZE(cmd), + count)); /* * check if number of arguments matches what the command expects @@ -4842,8 +4873,10 @@ { pfm_buffer_fmt_t *fmt = ctx->ctx_buf_fmt; pfm_ovfl_ctrl_t rst_ctrl; + int state; int ret = 0; + state = ctx->ctx_state; /* * Unlock sampling buffer and reset index atomically * XXX: not really needed when blocking @@ -4853,9 +4886,10 @@ rst_ctrl.bits.mask_monitoring = 0; rst_ctrl.bits.reset_ovfl_pmds = 1; - /* XXX: check return value */ - if (fmt->fmt_restart) - ret = (*fmt->fmt_restart)(current, &rst_ctrl, ctx->ctx_smpl_hdr, regs); + if (state == PFM_CTX_LOADED) + ret = pfm_buf_fmt_restart_active(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs); + else + ret = pfm_buf_fmt_restart(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs); } else { rst_ctrl.bits.mask_monitoring = 0; rst_ctrl.bits.reset_ovfl_pmds = 1; @@ -4876,7 +4910,6 @@ } } - /* * context MUST BE LOCKED when calling * can only be called for current @@ -4954,7 +4987,7 @@ reason = ctx->ctx_fl_trap_reason; ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE; - DPRINT(("[%d] reason=%d\n", current->pid, reason)); + DPRINT(("[%d] reason=%d state=%d\n", current->pid, reason, ctx->ctx_state)); /* * must be done before we check non-blocking mode @@ -5085,7 +5118,7 @@ { pfm_ovfl_arg_t ovfl_arg; unsigned long mask; - unsigned long old_val; + unsigned long old_val, ovfl_val; unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL, smpl_pmds = 0UL; unsigned long tstamp; pfm_ovfl_ctrl_t ovfl_ctrl; @@ -5101,7 +5134,8 @@ tstamp = ia64_get_itc(); - mask = pmc0 >> PMU_FIRST_COUNTER; + mask = pmc0 >> PMU_FIRST_COUNTER; + ovfl_val = pmu_conf.ovfl_val; DPRINT_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s " "used_pmds=0x%lx reload_pmcs=0x%lx\n", @@ -5133,7 +5167,7 @@ * pfm_read_pmds(). 
*/ old_val = ctx->ctx_pmds[i].val; - ctx->ctx_pmds[i].val += 1 + pmu_conf.ovfl_val; + ctx->ctx_pmds[i].val += 1 + ovfl_val; /* * check for overflow condition @@ -5145,7 +5179,7 @@ DPRINT_ovfl(("ctx_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx ovfl_pmds=0x%lx ovfl_notify=0x%lx smpl_pmds=0x%lx\n", i, ctx->ctx_pmds[i].val, old_val, - ia64_get_pmd(i) & pmu_conf.ovfl_val, ovfl_pmds, ovfl_notify, smpl_pmds)); + ia64_get_pmd(i) & ovfl_val, ovfl_pmds, ovfl_notify, smpl_pmds)); } /* @@ -5196,6 +5230,7 @@ for(j=0, k=0; smpl_pmds; j++, smpl_pmds >>=1) { if ((smpl_pmds & 0x1) == 0) continue; ovfl_arg.smpl_pmds_values[k++] = PMD_IS_COUNTING(j) ? pfm_read_soft_counter(ctx, j) : ia64_get_pmd(j); + DPRINT_ovfl(("smpl_pmd[%d]=pmd%u=0x%lx\n", k-1, j, ovfl_arg.smpl_pmds_values[k-1])); } } @@ -5294,6 +5329,7 @@ if (ovfl_ctrl.bits.mask_monitoring) { pfm_mask_monitoring(task); ctx->ctx_state = PFM_CTX_MASKED; + ctx->ctx_fl_can_restart = 1; } /* @@ -5376,12 +5412,10 @@ */ /* sanity check */ - if (!ctx) goto report_spurious; + if (!ctx) goto report_spurious1; - if (ctx->ctx_fl_system == 0 && (task->thread.flags & IA64_THREAD_PM_VALID) == 0) { - printk("perfmon: current [%d] owner = [%d] PMVALID=0 state=%d\n", current->pid, task->pid, ctx->ctx_state); - goto report_spurious; - } + if (ctx->ctx_fl_system == 0 && (task->thread.flags & IA64_THREAD_PM_VALID) == 0) + goto report_spurious2; PROTECT_CTX_NOPRINT(ctx, flags); @@ -5400,14 +5434,20 @@ return retval; -report_spurious: +report_spurious1: printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d has no PFM context\n", this_cpu, task->pid); pfm_unfreeze_pmu(); return -1; +report_spurious2: + printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d, invalid flag\n", + this_cpu, + task->pid); + pfm_unfreeze_pmu(); + return -1; } -static pfm_irq_handler_t +static irqreturn_t pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs) { unsigned long start_cycles, total_cycles; @@ -5436,7 +5476,8 @@ pfm_stats[this_cpu].pfm_ovfl_intr_cycles += total_cycles; } - PFM_IRQ_HANDLER_RET(); + put_cpu_no_resched(); + return IRQ_HANDLED; } @@ -5445,10 +5486,13 @@ pfm_proc_info(char *page) { char *p = page; - pfm_buffer_fmt_t *b; + struct list_head * pos; + pfm_buffer_fmt_t * entry; unsigned long psr; + int online_cpus = 0; int i; + p += sprintf(p, "perfmon version : %u.%u\n", PFM_VERSION_MAJ, PFM_VERSION_MIN); p += sprintf(p, "model : %s\n", pmu_conf.pmu_name); p += sprintf(p, "fastctxsw : %s\n", pfm_sysctl.fastctxsw > 0 ? "Yes": "No"); p += sprintf(p, "ovfl_mask : 0x%lx\n", pmu_conf.ovfl_val); @@ -5462,17 +5506,17 @@ p += sprintf(p, "CPU%-2d smpl handler calls : %lu\n", i, pfm_stats[i].pfm_smpl_handler_calls); p += sprintf(p, "CPU%-2d smpl handler cycles : %lu\n", i, pfm_stats[i].pfm_smpl_handler_cycles); p += sprintf(p, "CPU%-2d spurious intrs : %lu\n", i, pfm_stats[i].pfm_spurious_ovfl_intr_count); - p += sprintf(p, "CPU%-2d sysupdt count : %lu\n", i, pfm_stats[i].pfm_sysupdt_count); - p += sprintf(p, "CPU%-2d sysupdt cycles : %lu\n", i, pfm_stats[i].pfm_sysupdt_cycles); + p += sprintf(p, "CPU%-2d replay intrs : %lu\n", i, pfm_stats[i].pfm_replay_ovfl_intr_count); p += sprintf(p, "CPU%-2d syst_wide : %d\n" , i, pfm_get_cpu_data(pfm_syst_info, i) & PFM_CPUINFO_SYST_WIDE ? 1 : 0); p += sprintf(p, "CPU%-2d dcr_pp : %d\n" , i, pfm_get_cpu_data(pfm_syst_info, i) & PFM_CPUINFO_DCR_PP ? 1 : 0); p += sprintf(p, "CPU%-2d exclude idle : %d\n" , i, pfm_get_cpu_data(pfm_syst_info, i) & PFM_CPUINFO_EXCL_IDLE ? 
1 : 0); p += sprintf(p, "CPU%-2d owner : %d\n" , i, pfm_get_cpu_data(pmu_owner, i) ? pfm_get_cpu_data(pmu_owner, i)->pid: -1); p += sprintf(p, "CPU%-2d context : %p\n" , i, pfm_get_cpu_data(pmu_ctx, i)); p += sprintf(p, "CPU%-2d activations : %lu\n", i, pfm_get_cpu_data(pmu_activation_number,i)); + online_cpus++; } - if (num_online_cpus() == 1) + if (online_cpus == 1) { psr = pfm_get_psr(); ia64_srlz_d(); @@ -5485,7 +5529,7 @@ } LOCK_PFS(); - p += sprintf(p, "proc_sessions : %u\n" + p += sprintf(p, "proc_sessions : %u\n" "sys_sessions : %u\n" "sys_use_dbregs : %u\n" "ptrace_use_dbregs : %u\n", @@ -5495,29 +5539,30 @@ pfm_sessions.pfs_ptrace_use_dbregs); UNLOCK_PFS(); - LOCK_BUF_FMT_LIST(); + spin_lock(&pfm_buffer_fmt_lock); - for (b = pfm_buffer_fmt_list; b ; b = b->fmt_next) { + list_for_each(pos, &pfm_buffer_fmt_list) { + entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list); p += sprintf(p, "format : %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x %s\n", - b->fmt_uuid[0], - b->fmt_uuid[1], - b->fmt_uuid[2], - b->fmt_uuid[3], - b->fmt_uuid[4], - b->fmt_uuid[5], - b->fmt_uuid[6], - b->fmt_uuid[7], - b->fmt_uuid[8], - b->fmt_uuid[9], - b->fmt_uuid[10], - b->fmt_uuid[11], - b->fmt_uuid[12], - b->fmt_uuid[13], - b->fmt_uuid[14], - b->fmt_uuid[15], - b->fmt_name); + entry->fmt_uuid[0], + entry->fmt_uuid[1], + entry->fmt_uuid[2], + entry->fmt_uuid[3], + entry->fmt_uuid[4], + entry->fmt_uuid[5], + entry->fmt_uuid[6], + entry->fmt_uuid[7], + entry->fmt_uuid[8], + entry->fmt_uuid[9], + entry->fmt_uuid[10], + entry->fmt_uuid[11], + entry->fmt_uuid[12], + entry->fmt_uuid[13], + entry->fmt_uuid[14], + entry->fmt_uuid[15], + entry->fmt_name); } - UNLOCK_BUF_FMT_LIST(); + spin_unlock(&pfm_buffer_fmt_lock); return p - page; } @@ -5546,7 +5591,7 @@ * local_cpu_data->pfm_syst_info */ void -pfm_do_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin) +pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin) { struct pt_regs *regs; unsigned long dcr; @@ -5591,21 +5636,10 @@ } } -void -pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin) -{ - unsigned long start, end; - - pfm_stats[smp_processor_id()].pfm_sysupdt_count++; - start = ia64_get_itc(); - - pfm_do_syst_wide_update_task(task, info, is_ctxswin); - - end = ia64_get_itc(); - pfm_stats[smp_processor_id()].pfm_sysupdt_cycles += end-start; -} - #ifdef CONFIG_SMP +/* + * in 2.6, interrupts are masked when we come here and the runqueue lock is held + */ void pfm_save_regs(struct task_struct *task) { @@ -5706,14 +5740,11 @@ /* * unfreeze PMU if had pending overflows */ - if (t->pmcs[0] & ~1UL) pfm_unfreeze_pmu(); + if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu(); /* - * finally, unmask interrupts and allow context - * access. - * Any pended overflow interrupt may be delivered - * here and will be treated as spurious because we - * have have no PMU owner anymore. + * finally, allow context access. + * interrupts will still be masked after this call. */ pfm_unprotect_ctx_ctxsw(ctx, flags); @@ -5726,10 +5757,6 @@ } #else /* !CONFIG_SMP */ - -/* - * in 2.5, interrupts are masked when we come here - */ void pfm_save_regs(struct task_struct *task) { @@ -5836,6 +5863,9 @@ #endif /* CONFIG_SMP */ #ifdef CONFIG_SMP +/* + * in 2.6, interrupts are masked when we come here and the runqueue lock is held + */ void pfm_load_regs (struct task_struct *task) { @@ -5959,20 +5989,24 @@ * was saved. 
*/ if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) { - struct pt_regs *regs = ia64_task_regs(task); - pfm_overflow_handler(task, ctx, t->pmcs[0], regs); + /* + * reload pmc0 with the overflow information + * On McKinley PMU, this will trigger a PMU interrupt + */ + ia64_set_pmc(0, t->pmcs[0]); + ia64_srlz_d(); + t->pmcs[0] = 0UL; +#ifndef CONFIG_MCKINLEY + /* + * will replay the PMU interrupt + */ + DPRINT(("perfmon: resend irq for [%d]\n", task->pid)); + hw_resend_irq(NULL, IA64_PERFMON_VECTOR); +#endif + pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++; } /* - * we clear PMC0, to ensure that any in flight interrupt - * will not be attributed to the new context we are installing - * because the actual overflow has been processed above already. - * No real effect until we unmask interrupts at the end of the - * function. - */ - pfm_unfreeze_pmu(); - - /* * we just did a reload, so we reset the partial reload fields */ ctx->ctx_reload_pmcs[0] = 0UL; @@ -5990,13 +6024,15 @@ SET_ACTIVATION(ctx); /* - * establish new ownership. Interrupts - * are still masked at this point. + * establish new ownership. */ SET_PMU_OWNER(task, ctx); /* - * restore the psr.up bit + * restore the psr.up bit. measurement + * is active again. + * no PMU interrupt can happen at this point + * because we still have interrupts disabled. */ if (likely(psr_up)) pfm_set_psr_up(); @@ -6091,42 +6127,39 @@ pfm_restore_pmcs(t->pmcs, pmc_mask); /* - * Check for pending overflow when state was last saved. - * invoked handler is overflow status bits set. - * - * Any PMU overflow in flight at this point, will still - * be treated as spurious because we have no declared - * owner. Note that the first level interrupt handler - * DOES NOT TOUCH any PMC except PMC0 for which we have - * a copy already. + * check for pending overflow at the time the state + * was saved. */ if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) { - struct pt_regs *regs = ia64_task_regs(task); - pfm_overflow_handler(task, ctx, t->pmcs[0], regs); - } + /* + * reload pmc0 with the overflow information + * On McKinley PMU, this will trigger a PMU interrupt + */ + ia64_set_pmc(0, t->pmcs[0]); + ia64_srlz_d(); - /* - * we clear PMC0, to ensure that any in flight interrupt - * will not be attributed to the new context we are installing - * because the actual overflow has been processed above already. - * - * This is an atomic operation. - */ - pfm_unfreeze_pmu(); + t->pmcs[0] = 0UL; + +#ifndef CONFIG_MCKINLEY + /* + * will replay the PMU interrupt + */ + DPRINT(("perfmon: resend irq for [%d]\n", task->pid)); + hw_resend_irq(NULL, IA64_PERFMON_VECTOR); +#endif + pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++; + } /* - * establish new ownership. If there was an in-flight - * overflow interrupt, it will be treated as spurious - * before and after the call, because no overflow - * status bit can possibly be set. No new overflow - * can be generated because, at this point, psr.up - * is still cleared. + * establish new ownership. */ SET_PMU_OWNER(task, ctx); /* - * restore the psr. This is the point at which - * new overflow interrupts can be generated again. + * restore the psr.up bit. measurement + * is active again. + * no PMU interrupt can happen at this point + * because we still have interrupts disabled. 
*/ if (likely(psr_up)) pfm_set_psr_up(); } @@ -6139,7 +6172,7 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) { u64 pmc0; - unsigned long mask2, val, pmd_val; + unsigned long mask2, val, pmd_val, ovfl_val; int i, can_access_pmu = 0; int is_self; @@ -6187,7 +6220,7 @@ */ task->thread.pmcs[0] &= ~0x1; } - + ovfl_val = pmu_conf.ovfl_val; /* * we save all the used pmds * we take care of overflows for counting PMDs @@ -6210,12 +6243,12 @@ task->pid, i, ctx->ctx_pmds[i].val, - val & pmu_conf.ovfl_val)); + val & ovfl_val)); /* * we rebuild the full 64 bit value of the counter */ - val = ctx->ctx_pmds[i].val + (val & pmu_conf.ovfl_val); + val = ctx->ctx_pmds[i].val + (val & ovfl_val); /* * now everything is in ctx_pmds[] and we need @@ -6228,7 +6261,7 @@ * take care of overflow inline */ if (pmc0 & (1UL << i)) { - val += 1 + pmu_conf.ovfl_val; + val += 1 + ovfl_val; DPRINT(("[%d] pmd[%d] overflowed\n", task->pid, i)); } } @@ -6338,7 +6371,7 @@ * initialize all our spinlocks */ spin_lock_init(&pfm_sessions.pfs_lock); - spin_lock_init(&pfm_smpl_fmt_lock); + spin_lock_init(&pfm_buffer_fmt_lock); init_pfm_fs(); @@ -6352,6 +6385,9 @@ __initcall(pfm_init); +/* + * this function is called before pfm_init() + */ void pfm_init_percpu (void) { @@ -6363,7 +6399,6 @@ */ pfm_clear_psr_pp(); pfm_clear_psr_up(); - if (smp_processor_id() == 0) register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); diff -Nru a/arch/ia64/kernel/perfmon_itanium.h b/arch/ia64/kernel/perfmon_itanium.h --- a/arch/ia64/kernel/perfmon_itanium.h Fri Oct 17 23:12:59 2003 +++ b/arch/ia64/kernel/perfmon_itanium.h Fri Oct 17 23:12:59 2003 @@ -81,6 +81,8 @@ */ if (cnum == 13 && ((*val & 0x1) == 0UL) && ctx->ctx_fl_using_dbreg == 0) { + DPRINT(("pmc[%d]=0x%lx has active pmc13.ta cleared, clearing ibr\n", cnum, *val)); + /* don't mix debug with perfmon */ if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; @@ -97,6 +99,8 @@ * before they are written (fl_using_dbreg==0) to avoid picking up stale information. */ if (cnum == 11 && ((*val >> 28)& 0x1) == 0 && ctx->ctx_fl_using_dbreg == 0) { + + DPRINT(("pmc[%d]=0x%lx has active pmc11.pt cleared, clearing dbr\n", cnum, *val)); /* don't mix debug with perfmon */ if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; diff -Nru a/arch/ia64/kernel/perfmon_mckinley.h b/arch/ia64/kernel/perfmon_mckinley.h --- a/arch/ia64/kernel/perfmon_mckinley.h Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/perfmon_mckinley.h Fri Oct 17 23:12:58 2003 @@ -109,10 +109,20 @@ if (ctx == NULL) return -EINVAL; /* - * we must clear the debug registers if any pmc13.ena_dbrpX bit is enabled - * before they are written (fl_using_dbreg==0) to avoid picking up stale information. + * we must clear the debug registers if pmc13 has a value which enables + * memory pipeline event constraints. In this case we need to clear the + * debug registers if they have not yet been accessed. This is required + * to avoid picking up stale state. + * PMC13 is "active" if: + * one of the pmc13.cfg_dbrpXX fields is different from 0x3 + * AND + * the corresponding pmc13.ena_dbrpXX is set. + * + * For now, we just check on cfg_dbrXX != 0x3.
*/ - if (cnum == 13 && (*val & (0xfUL << 45)) && ctx->ctx_fl_using_dbreg == 0) { + if (cnum == 13 && ((*val & 0x18181818UL) != 0x18181818UL) && ctx->ctx_fl_using_dbreg == 0) { + + DPRINT(("pmc[%d]=0x%lx has active pmc13 settings, clearing dbr\n", cnum, *val)); /* don't mix debug with perfmon */ if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; @@ -128,7 +138,9 @@ * we must clear the (instruction) debug registers if any pmc14.ibrpX bit is enabled * before they are (fl_using_dbreg==0) to avoid picking up stale information. */ - if (cnum == 14 && ((*val & 0x2222) != 0x2222) && ctx->ctx_fl_using_dbreg == 0) { + if (cnum == 14 && ((*val & 0x2222UL) != 0x2222UL) && ctx->ctx_fl_using_dbreg == 0) { + + DPRINT(("pmc[%d]=0x%lx has active pmc14 settings, clearing ibr\n", cnum, *val)); /* don't mix debug with perfmon */ if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; @@ -170,7 +182,7 @@ && ((((val14>>1) & 0x3) == 0x2 || ((val14>>1) & 0x3) == 0x0) ||(((val14>>4) & 0x3) == 0x2 || ((val14>>4) & 0x3) == 0x0)); - if (ret) printk("perfmon: failure check_case1\n"); + if (ret) DPRINT((KERN_DEBUG "perfmon: failure check_case1\n")); } return ret ? -EINVAL : 0; diff -Nru a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c --- a/arch/ia64/kernel/process.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/process.c Fri Oct 17 23:12:58 2003 @@ -685,12 +685,16 @@ (*efi.reset_system)(EFI_RESET_WARM, 0, 0, 0); } +EXPORT_SYMBOL(machine_restart); + void machine_halt (void) { cpu_halt(); } +EXPORT_SYMBOL(machine_halt); + void machine_power_off (void) { @@ -698,3 +702,5 @@ pm_power_off(); machine_halt(); } + +EXPORT_SYMBOL(machine_power_off); diff -Nru a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c --- a/arch/ia64/kernel/setup.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/setup.c Fri Oct 17 23:12:58 2003 @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include #include @@ -43,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -101,7 +104,7 @@ filter_rsvd_memory (unsigned long start, unsigned long end, void *arg) { unsigned long range_start, range_end, prev_start; - void (*func)(unsigned long, unsigned long); + void (*func)(unsigned long, unsigned long, int); int i; #if IGNORE_PFN0 @@ -122,11 +125,7 @@ range_end = min(end, rsvd_region[i].start); if (range_start < range_end) -#ifdef CONFIG_DISCONTIGMEM - call_pernode_memory(__pa(range_start), __pa(range_end), func); -#else - (*func)(__pa(range_start), range_end - range_start); -#endif + call_pernode_memory(__pa(range_start), range_end - range_start, func); /* nothing more available in this segment */ if (range_end == end) return 0; @@ -225,6 +224,25 @@ #endif } +#ifdef CONFIG_SERIAL_8250_CONSOLE +static void __init +setup_serial_legacy (void) +{ + struct uart_port port; + unsigned int i, iobase[] = {0x3f8, 0x2f8}; + + printk(KERN_INFO "Registering legacy COM ports for serial console\n"); + memset(&port, 0, sizeof(port)); + port.iotype = SERIAL_IO_PORT; + port.uartclk = BASE_BAUD * 16; + for (i = 0; i < ARRAY_SIZE(iobase); i++) { + port.line = i; + port.iobase = iobase[i]; + early_serial_setup(&port); + } +} +#endif + void __init setup_arch (char **cmdline_p) { @@ -239,7 +257,6 @@ strlcpy(saved_command_line, *cmdline_p, sizeof(saved_command_line)); efi_init(); - find_memory(); #ifdef CONFIG_ACPI_BOOT /* Initialize the ACPI boot-time table parser */ @@ -253,6 +270,8 @@ # endif #endif /* CONFIG_APCI_BOOT */ + find_memory(); + /* process SAL system table: */ 
ia64_sal_init(efi.sal_systab); @@ -297,11 +316,24 @@ #ifdef CONFIG_SERIAL_8250_HCDP if (efi.hcdp) { void setup_serial_hcdp(void *); - - /* Setup the serial ports described by HCDP */ setup_serial_hcdp(efi.hcdp); } #endif +#ifdef CONFIG_SERIAL_8250_CONSOLE + /* + * Without HCDP, we won't discover any serial ports until the serial driver looks + * in the ACPI namespace. If ACPI claims there are some legacy devices, register + * the legacy COM ports so serial console works earlier. This is slightly dangerous + * because we don't *really* know whether there's anything there, but we hope that + * all new boxes will implement HCDP. + */ + { + extern unsigned char acpi_legacy_devices; + if (!efi.hcdp && acpi_legacy_devices) + setup_serial_legacy(); + } +#endif + #ifdef CONFIG_VT # if defined(CONFIG_DUMMY_CONSOLE) conswitchp = &dummy_con; @@ -544,28 +576,7 @@ struct cpuinfo_ia64 *cpu_info; void *cpu_data; -#ifdef CONFIG_SMP - int cpu; - - /* - * get_free_pages() cannot be used before cpu_init() done. BSP allocates - * "NR_CPUS" pages for all CPUs to avoid that AP calls get_zeroed_page(). - */ - if (smp_processor_id() == 0) { - cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS, PERCPU_PAGE_SIZE, - __pa(MAX_DMA_ADDRESS)); - for (cpu = 0; cpu < NR_CPUS; cpu++) { - memcpy(cpu_data, __phys_per_cpu_start, __per_cpu_end - __per_cpu_start); - __per_cpu_offset[cpu] = (char *) cpu_data - __per_cpu_start; - cpu_data += PERCPU_PAGE_SIZE; - - per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; - } - } - cpu_data = __per_cpu_start + __per_cpu_offset[smp_processor_id()]; -#else /* !CONFIG_SMP */ - cpu_data = __phys_per_cpu_start; -#endif /* !CONFIG_SMP */ + cpu_data = per_cpu_init(); get_max_cacheline_size(); @@ -576,9 +587,6 @@ * accessing cpu_data() through the canonical per-CPU address. */ cpu_info = cpu_data + ((char *) &__ia64_per_cpu_var(cpu_info) - __per_cpu_start); -#ifdef CONFIG_NUMA - cpu_info->node_data = get_node_data_ptr(); -#endif identify_cpu(cpu_info); #ifdef CONFIG_MCKINLEY diff -Nru a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c --- a/arch/ia64/kernel/time.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/time.c Fri Oct 17 23:12:58 2003 @@ -65,8 +65,12 @@ } /* - * Return the number of nano-seconds that elapsed since the last update to jiffy. The - * xtime_lock must be at least read-locked when calling this routine. + * Return the number of nano-seconds that elapsed since the last + * update to jiffy. It is quite possible that the timer interrupt + * will interrupt this and result in a race for any of jiffies, + * wall_jiffies or itm_next. Thus, the xtime_lock must be at least + * read synchronised when calling this routine (see do_gettimeofday() + * below for an example). 
*/ unsigned long itc_get_offset (void) @@ -77,11 +81,6 @@ last_tick = (cpu_data(TIME_KEEPER_ID)->itm_next - (lost + 1)*cpu_data(TIME_KEEPER_ID)->itm_delta); - if (unlikely((long) (now - last_tick) < 0)) { - printk(KERN_ERR "CPU %d: now < last_tick (now=0x%lx,last_tick=0x%lx)!\n", - smp_processor_id(), now, last_tick); - return last_nsec_offset; - } elapsed_cycles = now - last_tick; return (elapsed_cycles*local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT; } diff -Nru a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c --- a/arch/ia64/lib/checksum.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/lib/checksum.c Fri Oct 17 23:12:58 2003 @@ -1,8 +1,8 @@ /* * Network checksum routines * - * Copyright (C) 1999 Hewlett-Packard Co - * Copyright (C) 1999 Stephane Eranian + * Copyright (C) 1999, 2003 Hewlett-Packard Co + * Stephane Eranian * * Most of the code coming from arch/alpha/lib/checksum.c * @@ -10,6 +10,7 @@ * in an architecture-specific manner due to speed.. */ +#include #include #include @@ -40,6 +41,8 @@ ((unsigned long) proto << 8)); } +EXPORT_SYMBOL(csum_tcpudp_magic); + unsigned int csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len, unsigned short proto, unsigned int sum) @@ -84,6 +87,7 @@ return result; } +EXPORT_SYMBOL(csum_partial); /* * this routine is used for miscellaneous IP-like checksums, mainly @@ -94,3 +98,5 @@ { return ~do_csum(buff,len); } + +EXPORT_SYMBOL(ip_compute_csum); diff -Nru a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c --- a/arch/ia64/lib/csum_partial_copy.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/lib/csum_partial_copy.c Fri Oct 17 23:12:58 2003 @@ -1,12 +1,13 @@ /* * Network Checksum & Copy routine * - * Copyright (C) 1999 Hewlett-Packard Co - * Copyright (C) 1999 Stephane Eranian + * Copyright (C) 1999, 2003 Hewlett-Packard Co + * Stephane Eranian * * Most of the code has been imported from Linux/Alpha */ +#include #include #include @@ -146,3 +147,4 @@ return do_csum_partial_copy_from_user(src, dst, len, sum, NULL); } +EXPORT_SYMBOL(csum_partial_copy_nocheck); diff -Nru a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c --- a/arch/ia64/mm/contig.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/mm/contig.c Fri Oct 17 23:12:58 2003 @@ -25,6 +25,10 @@ #include #include +#ifdef CONFIG_VIRTUAL_MEM_MAP +static unsigned long num_dma_physpages; +#endif + /** * show_mem - display a memory statistics summary * @@ -160,4 +164,134 @@ reserve_bootmem(bootmap_start, bootmap_size); find_initrd(); +} + +#ifdef CONFIG_SMP +/** + * per_cpu_init - setup per-cpu variables + * + * Allocate and setup per-cpu data areas. + */ +void * +per_cpu_init (void) +{ + void *cpu_data; + int cpu; + + /* + * get_free_pages() cannot be used before cpu_init() done. BSP + * allocates "NR_CPUS" pages for all CPUs to avoid that AP calls + * get_zeroed_page(). 
+ */ + if (smp_processor_id() == 0) { + cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS, + PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + for (cpu = 0; cpu < NR_CPUS; cpu++) { + memcpy(cpu_data, __phys_per_cpu_start, __per_cpu_end - __per_cpu_start); + __per_cpu_offset[cpu] = (char *) cpu_data - __per_cpu_start; + cpu_data += PERCPU_PAGE_SIZE; + per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; + } + } + return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; +} +#endif /* CONFIG_SMP */ + +static int +count_pages (u64 start, u64 end, void *arg) +{ + unsigned long *count = arg; + + *count += (end - start) >> PAGE_SHIFT; + return 0; +} + +#ifdef CONFIG_VIRTUAL_MEM_MAP +static int +count_dma_pages (u64 start, u64 end, void *arg) +{ + unsigned long *count = arg; + + if (end <= MAX_DMA_ADDRESS) + *count += (end - start) >> PAGE_SHIFT; + return 0; +} +#endif + +/* + * Set up the page tables. + */ + +void +paging_init (void) +{ + unsigned long max_dma; + unsigned long zones_size[MAX_NR_ZONES]; +#ifdef CONFIG_VIRTUAL_MEM_MAP + unsigned long zholes_size[MAX_NR_ZONES]; + unsigned long max_gap; +#endif + + /* initialize mem_map[] */ + + memset(zones_size, 0, sizeof(zones_size)); + + num_physpages = 0; + efi_memmap_walk(count_pages, &num_physpages); + + max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; + +#ifdef CONFIG_VIRTUAL_MEM_MAP + memset(zholes_size, 0, sizeof(zholes_size)); + + num_dma_physpages = 0; + efi_memmap_walk(count_dma_pages, &num_dma_physpages); + + if (max_low_pfn < max_dma) { + zones_size[ZONE_DMA] = max_low_pfn; + zholes_size[ZONE_DMA] = max_low_pfn - num_dma_physpages; + } else { + zones_size[ZONE_DMA] = max_dma; + zholes_size[ZONE_DMA] = max_dma - num_dma_physpages; + if (num_physpages > num_dma_physpages) { + zones_size[ZONE_NORMAL] = max_low_pfn - max_dma; + zholes_size[ZONE_NORMAL] = + ((max_low_pfn - max_dma) - + (num_physpages - num_dma_physpages)); + } + } + + max_gap = 0; + efi_memmap_walk(find_largest_hole, (u64 *)&max_gap); + if (max_gap < LARGE_GAP) { + vmem_map = (struct page *) 0; + free_area_init_node(0, &contig_page_data, NULL, zones_size, 0, + zholes_size); + mem_map = contig_page_data.node_mem_map; + } else { + unsigned long map_size; + + /* allocate virtual_mem_map */ + + map_size = PAGE_ALIGN(max_low_pfn * sizeof(struct page)); + vmalloc_end -= map_size; + vmem_map = (struct page *) vmalloc_end; + efi_memmap_walk(create_mem_map_page_table, 0); + + free_area_init_node(0, &contig_page_data, vmem_map, zones_size, + 0, zholes_size); + + mem_map = contig_page_data.node_mem_map; + printk("Virtual mem_map starts at 0x%p\n", mem_map); + } +#else /* !CONFIG_VIRTUAL_MEM_MAP */ + if (max_low_pfn < max_dma) + zones_size[ZONE_DMA] = max_low_pfn; + else { + zones_size[ZONE_DMA] = max_dma; + zones_size[ZONE_NORMAL] = max_low_pfn - max_dma; + } + free_area_init(zones_size); +#endif /* !CONFIG_VIRTUAL_MEM_MAP */ + zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } diff -Nru a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c --- a/arch/ia64/mm/discontig.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/mm/discontig.c Fri Oct 17 23:12:58 2003 @@ -17,72 +17,57 @@ #include #include #include +#include #include - - -/* - * Round an address upward to the next multiple of GRANULE size. 
- */ -#define GRANULEROUNDUP(n) (((n)+IA64_GRANULE_SIZE-1) & ~(IA64_GRANULE_SIZE-1)) - -static struct ia64_node_data *node_data[MAX_NUMNODES]; -static long boot_pg_data[8*MAX_NUMNODES+sizeof(pg_data_t)] __initdata; -static pg_data_t *pg_data_ptr[MAX_NUMNODES] __initdata; -static bootmem_data_t bdata[MAX_NUMNODES][NR_BANKS_PER_NODE+1] __initdata; -/* - * Return the compact node number of this cpu. Used prior to - * setting up the cpu_data area. - * Note - not fast, intended for boot use only!! - */ -int -boot_get_local_nodeid(void) -{ - int i; - - for (i = 0; i < NR_CPUS; i++) - if (node_cpuid[i].phys_id == hard_smp_processor_id()) - return node_cpuid[i].nid; - - /* node info missing, so nid should be 0.. */ - return 0; -} +#include +#include /* - * Return a pointer to the pg_data structure for a node. - * This function is used ONLY in early boot before the cpu_data - * structure is available. + * Track per-node information needed to setup the boot memory allocator, the + * per-node areas, and the real VM. */ -pg_data_t* __init -boot_get_pg_data_ptr(long node) -{ - return pg_data_ptr[node]; -} - - -/* - * Return a pointer to the node data for the current node. - * (boottime initialization only) +struct early_node_data { + struct ia64_node_data *node_data; + pg_data_t *pgdat; + unsigned long pernode_addr; + unsigned long pernode_size; + struct bootmem_data bootmem_data; + unsigned long num_physpages; + unsigned long num_dma_physpages; + unsigned long min_pfn; + unsigned long max_pfn; +}; + +static struct early_node_data mem_data[NR_NODES] __initdata; + +/* + * To prevent cache aliasing effects, align per-node structures so that they + * start at addresses that are strided by node number. + */ +#define NODEDATA_ALIGN(addr, node) \ + ((((addr) + 1024*1024-1) & ~(1024*1024-1)) + (node)*PERCPU_PAGE_SIZE) + +/** + * build_node_maps - callback to setup bootmem structs for each node + * @start: physical start of range + * @len: length of range + * @node: node where this range resides + * + * We allocate a struct bootmem_data for each piece of memory that we wish to + * treat as a virtually contiguous block (i.e. each node). Each such block + * must start on an %IA64_GRANULE_SIZE boundary, so we round the address down + * if necessary. Any non-existent pages will simply be part of the virtual + * memmap. We also update min_low_pfn and max_low_pfn here as we receive + * memory ranges from the caller. */ -struct ia64_node_data * -get_node_data_ptr(void) +static int __init build_node_maps(unsigned long start, unsigned long len, + int node) { - return node_data[boot_get_local_nodeid()]; -} + unsigned long cstart, epfn, end = start + len; + struct bootmem_data *bdp = &mem_data[node].bootmem_data; -/* - * We allocate one of the bootmem_data_t structs for each piece of memory - * that we wish to treat as a contiguous block. Each such block must start - * on a BANKSIZE boundary. Multiple banks per node is not supported. - */ -static int __init -build_maps(unsigned long pstart, unsigned long length, int node) -{ - bootmem_data_t *bdp; - unsigned long cstart, epfn; - - bdp = pg_data_ptr[node]->bdata; - epfn = GRANULEROUNDUP(pstart + length) >> PAGE_SHIFT; - cstart = pstart & ~(BANKSIZE - 1); + epfn = GRANULEROUNDUP(end) >> PAGE_SHIFT; + cstart = GRANULEROUNDDOWN(start); if (!bdp->node_low_pfn) { bdp->node_boot_start = cstart; @@ -98,34 +83,143 @@ return 0; } -/* - * Find space on each node for the bootmem map. 
+/** + * early_nr_cpus_node - return number of cpus on a given node + * @node: node to check * - * Called by efi_memmap_walk to find boot memory on each node. Note that - * only blocks that are free are passed to this routine (currently filtered by - * free_available_memory). + * Count the number of cpus on @node. We can't use nr_cpus_node() yet because + * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been + * called yet. */ -static int __init -find_bootmap_space(unsigned long pstart, unsigned long length, int node) +static int early_nr_cpus_node(int node) { - unsigned long mapsize, pages, epfn; - bootmem_data_t *bdp; + int cpu, n = 0; - epfn = (pstart + length) >> PAGE_SHIFT; - bdp = &pg_data_ptr[node]->bdata[0]; + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (node == node_cpuid[cpu].nid) + n++; + + return n; +} - if (pstart < bdp->node_boot_start || epfn > bdp->node_low_pfn) +/** + * find_pernode_space - allocate memory for memory map and per-node structures + * @start: physical start of range + * @len: length of range + * @node: node where this range resides + * + * This routine reserves space for the per-cpu data struct, the list of + * pg_data_ts and the per-node data struct. Each node will have something like + * the following in the first chunk of addr. space large enough to hold it. + * + * ________________________ + * | | + * |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first + * | PERCPU_PAGE_SIZE * | start and length big enough + * | NR_CPUS | + * |------------------------| + * | local pg_data_t * | + * |------------------------| + * | local ia64_node_data | + * |------------------------| + * | ??? | + * |________________________| + * + * Once this space has been set aside, the bootmem maps are initialized. We + * could probably move the allocation of the per-cpu and ia64_node_data space + * outside of this function and use alloc_bootmem_node(), but doing it here + * is straightforward and we get the alignments we want so... + */ +static int __init find_pernode_space(unsigned long start, unsigned long len, + int node) +{ + unsigned long epfn, cpu, cpus; + unsigned long pernodesize = 0, pernode; + void *cpu_data; + struct bootmem_data *bdp = &mem_data[node].bootmem_data; + + epfn = (start + len) >> PAGE_SHIFT; + + /* + * Make sure this memory falls within this node's usable memory + * since we may have thrown some away in build_maps(). + */ + if (start < bdp->node_boot_start || + epfn > bdp->node_low_pfn) return 0; - if (!bdp->node_bootmem_map) { - pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT); + /* Don't setup this node's local space twice... */ + if (!mem_data[node].pernode_addr) { + /* + * Calculate total size needed, incl. what's necessary + * for good alignment and alias prevention. + */ + cpus = early_nr_cpus_node(node); + pernodesize += PERCPU_PAGE_SIZE * cpus; + pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t)); + pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data)); + pernodesize = PAGE_ALIGN(pernodesize); + pernode = NODEDATA_ALIGN(start, node); + + /* Is this range big enough for what we want to store here? 
*/ + if (start + len > (pernode + pernodesize)) { + mem_data[node].pernode_addr = pernode; + mem_data[node].pernode_size = pernodesize; + memset(__va(pernode), 0, pernodesize); + + cpu_data = (void *)pernode; + pernode += PERCPU_PAGE_SIZE * cpus; + + mem_data[node].pgdat = __va(pernode); + pernode += L1_CACHE_ALIGN(sizeof(pg_data_t)); + + mem_data[node].node_data = __va(pernode); + pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data)); + + mem_data[node].pgdat->bdata = bdp; + pernode += L1_CACHE_ALIGN(sizeof(pg_data_t)); + + /* + * Copy the static per-cpu data into the region we + * just set aside and then setup __per_cpu_offset + * for each CPU on this node. + */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + if (node == node_cpuid[cpu].nid) { + memcpy(cpu_data, __phys_per_cpu_start, + __per_cpu_end-__per_cpu_start); + __per_cpu_offset[cpu] = + (char*)__va(cpu_data) - + __per_cpu_start; + cpu_data += PERCPU_PAGE_SIZE; + } + } + } + } + + pernode = mem_data[node].pernode_addr; + pernodesize = mem_data[node].pernode_size; + if (pernode && !bdp->node_bootmem_map) { + unsigned long pages, mapsize, map = 0; + + pages = bdp->node_low_pfn - + (bdp->node_boot_start >> PAGE_SHIFT); mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT; - if (length > mapsize) { - init_bootmem_node( - BOOT_NODE_DATA(node), - pstart>>PAGE_SHIFT, - bdp->node_boot_start>>PAGE_SHIFT, - bdp->node_low_pfn); + + /* + * The map will either contain the pernode area or begin + * after it. + */ + if (pernode - start > mapsize) + map = start; + else if (start + len - pernode - pernodesize > mapsize) + map = pernode + pernodesize; + + if (map) { + init_bootmem_node(mem_data[node].pgdat, + map>>PAGE_SHIFT, + bdp->node_boot_start>>PAGE_SHIFT, + bdp->node_low_pfn); } } @@ -133,85 +227,93 @@ return 0; } - -/* - * Free available memory to the bootmem allocator. - * - * Note that only blocks that are free are passed to this routine (currently - * filtered by free_available_memory). +/** + * free_node_bootmem - free bootmem allocator memory for use + * @start: physical start of range + * @len: length of range + * @node: node where this range resides * + * Simply calls the bootmem allocator to free the specified range from + * the given pg_data_t's bdata struct. After this function has been called + * for all the entries in the EFI memory map, the bootmem allocator will + * be ready to service allocation requests. */ -static int __init -discontig_free_bootmem_node(unsigned long pstart, unsigned long length, int node) +static int __init free_node_bootmem(unsigned long start, unsigned long len, + int node) { - free_bootmem_node(BOOT_NODE_DATA(node), pstart, length); + free_bootmem_node(mem_data[node].pgdat, start, len); return 0; } - -/* - * Reserve the space used by the bootmem maps. - */ -static void __init -discontig_reserve_bootmem(void) -{ - int node; - unsigned long mapbase, mapsize, pages; - bootmem_data_t *bdp; +/** + * reserve_pernode_space - reserve memory for per-node space + * + * Reserve the space used by the bootmem maps & per-node space in the boot + * allocator so that when we actually create the real mem maps we don't + * use their memory.
+ */ +static void __init reserve_pernode_space(void) +{ + unsigned long base, size, pages; + struct bootmem_data *bdp; + int node; for (node = 0; node < numnodes; node++) { - bdp = BOOT_NODE_DATA(node)->bdata; + pg_data_t *pdp = mem_data[node].pgdat; + bdp = pdp->bdata; + + /* First the bootmem_map itself */ pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT); - mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT; - mapbase = __pa(bdp->node_bootmem_map); - reserve_bootmem_node(BOOT_NODE_DATA(node), mapbase, mapsize); + size = bootmem_bootmap_pages(pages) << PAGE_SHIFT; + base = __pa(bdp->node_bootmem_map); + reserve_bootmem_node(pdp, base, size); + + /* Now the per-node space */ + size = mem_data[node].pernode_size; + base = __pa(mem_data[node].pernode_addr); + reserve_bootmem_node(pdp, base, size); } } -/* - * Allocate per node tables. - * - the pg_data structure is allocated on each node. This minimizes offnode - * memory references - * - the node data is allocated & initialized. Portions of this structure is read-only (after - * boot) and contains node-local pointers to usefuls data structures located on - * other nodes. - * - * We also switch to using the "real" pg_data structures at this point. Earlier in boot, we - * use a different structure. The only use for pg_data prior to the point in boot is to get - * the pointer to the bdata for the node. - */ -static void __init -allocate_pernode_structures(void) -{ - pg_data_t *pgdat=0, *new_pgdat_list=0; - int node, mynode; - - mynode = boot_get_local_nodeid(); - for (node = numnodes - 1; node >= 0 ; node--) { - node_data[node] = alloc_bootmem_node(BOOT_NODE_DATA(node), sizeof (struct ia64_node_data)); - pgdat = __alloc_bootmem_node(BOOT_NODE_DATA(node), sizeof(pg_data_t), SMP_CACHE_BYTES, 0); - pgdat->bdata = &(bdata[node][0]); - pg_data_ptr[node] = pgdat; - pgdat->pgdat_next = new_pgdat_list; - new_pgdat_list = pgdat; - } +/** + * initialize_pernode_data - fixup per-cpu & per-node pointers + * + * Each node's per-node area has a copy of the global pg_data_t list, so + * we copy that to each node here, as well as setting the per-cpu pointer + * to the local node data structure. The active_cpus field of the per-node + * structure gets setup by the platform_cpu_init() function later. + */ +static void __init initialize_pernode_data(void) +{ + int cpu, node; + pg_data_t *pgdat_list[NR_NODES]; - memcpy(node_data[mynode]->pg_data_ptrs, pg_data_ptr, sizeof(pg_data_ptr)); - memcpy(node_data[mynode]->node_data_ptrs, node_data, sizeof(node_data)); + for (node = 0; node < numnodes; node++) + pgdat_list[node] = mem_data[node].pgdat; - pgdat_list = new_pgdat_list; + /* Copy the pg_data_t list to each node and init the node field */ + for (node = 0; node < numnodes; node++) { + memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list, + sizeof(pgdat_list)); + } + + /* Set the node_data pointer for each per-cpu struct */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + node = node_cpuid[cpu].nid; + per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data; + } } -/* - * Called early in boot to setup the boot memory allocator, and to - * allocate the node-local pg_data & node-directory data structures.. +/** + * find_memory - walk the EFI memory map and setup the bootmem allocator + * + * Called early in boot to setup the bootmem allocator, and to + * allocate the per-cpu and per-node structures. 
*/ void __init find_memory(void) { - int node; - reserve_memory(); if (numnodes == 0) { @@ -219,94 +321,48 @@ numnodes = 1; } - for (node = 0; node < numnodes; node++) { - pg_data_ptr[node] = (pg_data_t*) &boot_pg_data[node]; - pg_data_ptr[node]->bdata = &bdata[node][0]; - } - min_low_pfn = -1; max_low_pfn = 0; - efi_memmap_walk(filter_rsvd_memory, build_maps); - efi_memmap_walk(filter_rsvd_memory, find_bootmap_space); - efi_memmap_walk(filter_rsvd_memory, discontig_free_bootmem_node); - discontig_reserve_bootmem(); - allocate_pernode_structures(); - - find_initrd(); -} - -/* - * Initialize the paging system. - * - determine sizes of each node - * - initialize the paging system for the node - * - build the nodedir for the node. This contains pointers to - * the per-bank mem_map entries. - * - fix the page struct "virtual" pointers. These are bank specific - * values that the paging system doesn't understand. - * - replicate the nodedir structure to other nodes - */ - -void __init -discontig_paging_init(void) -{ - int node, mynode; - unsigned long max_dma, zones_size[MAX_NR_ZONES]; - unsigned long kaddr, ekaddr, bid; - struct page *page; - bootmem_data_t *bdp; - - max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; + /* These actually end up getting called by call_pernode_memory() */ + efi_memmap_walk(filter_rsvd_memory, build_node_maps); + efi_memmap_walk(filter_rsvd_memory, find_pernode_space); + efi_memmap_walk(filter_rsvd_memory, free_node_bootmem); - mynode = boot_get_local_nodeid(); - for (node = 0; node < numnodes; node++) { - long pfn, startpfn; + reserve_pernode_space(); + initialize_pernode_data(); - memset(zones_size, 0, sizeof(zones_size)); + max_pfn = max_low_pfn; - startpfn = -1; - bdp = BOOT_NODE_DATA(node)->bdata; - pfn = bdp->node_boot_start >> PAGE_SHIFT; - if (startpfn == -1) - startpfn = pfn; - if (pfn > max_dma) - zones_size[ZONE_NORMAL] += (bdp->node_low_pfn - pfn); - else if (bdp->node_low_pfn < max_dma) - zones_size[ZONE_DMA] += (bdp->node_low_pfn - pfn); - else { - zones_size[ZONE_DMA] += (max_dma - pfn); - zones_size[ZONE_NORMAL] += (bdp->node_low_pfn - max_dma); - } - - free_area_init_node(node, NODE_DATA(node), NULL, zones_size, startpfn, 0); - - page = NODE_DATA(node)->node_mem_map; + find_initrd(); +} - bdp = BOOT_NODE_DATA(node)->bdata; +/** + * per_cpu_init - setup per-cpu variables + * + * find_pernode_space() does most of this already, we just need to set + * local_per_cpu_offset + */ +void *per_cpu_init(void) +{ + int cpu; - kaddr = (unsigned long)__va(bdp->node_boot_start); - ekaddr = (unsigned long)__va(bdp->node_low_pfn << PAGE_SHIFT); - while (kaddr < ekaddr) { - if (paddr_to_nid(__pa(kaddr)) == node) { - bid = BANK_MEM_MAP_INDEX(kaddr); - node_data[mynode]->node_id_map[bid] = node; - node_data[mynode]->bank_mem_map_base[bid] = page; - } - kaddr += BANKSIZE; - page += BANKSIZE/PAGE_SIZE; + if (smp_processor_id() == 0) { + for (cpu = 0; cpu < NR_CPUS; cpu++) { + per_cpu(local_per_cpu_offset, cpu) = + __per_cpu_offset[cpu]; } } - /* - * Finish setting up the node data for this node, then copy it to the other nodes. - */ - for (node=0; node < numnodes; node++) - if (mynode != node) { - memcpy(node_data[node], node_data[mynode], sizeof(struct ia64_node_data)); - node_data[node]->node = node; - } + return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; } +/** + * show_mem - give short summary of memory stats + * + * Shows a simple page count of reserved and used pages in the system. 
+ * For discontig machines, it does this on a per-pgdat basis. + */ void show_mem(void) { int i, reserved = 0; @@ -335,7 +391,12 @@ printk("%d free buffer pages\n", nr_free_buffer_pages()); } -/* +/** + * call_pernode_memory - use SRAT to call callback functions with node info + * @start: physical start of range + * @len: length of range + * @arg: function to call for each range + * * efi_memmap_walk() knows nothing about layout of memory across nodes. Find * out to which node a block of memory belongs. Ignore memory that we cannot * identify, and split blocks that run across multiple nodes. @@ -343,10 +404,10 @@ * Take this opportunity to round the start address up and the end address * down to page boundaries. */ -void call_pernode_memory(unsigned long start, unsigned long end, void *arg) +void call_pernode_memory(unsigned long start, unsigned long len, void *arg) { - unsigned long rs, re; - void (*func)(unsigned long, unsigned long, int, int); + unsigned long rs, re, end = start + len; + void (*func)(unsigned long, unsigned long, int); int i; start = PAGE_ALIGN(start); @@ -357,21 +418,127 @@ func = arg; if (!num_memblks) { - /* - * This machine doesn't have SRAT, so call func with - * nid=0, bank=0. - */ + /* No SRAT table, so assume one node (node 0) */ if (start < end) - (*func)(start, end - start, 0, 0); + (*func)(start, len, 0); return; } for (i = 0; i < num_memblks; i++) { rs = max(start, node_memblk[i].start_paddr); - re = min(end, node_memblk[i].start_paddr+node_memblk[i].size); + re = min(end, node_memblk[i].start_paddr + + node_memblk[i].size); if (rs < re) - (*func)(rs, re-rs, node_memblk[i].nid, - node_memblk[i].bank); + (*func)(rs, re - rs, node_memblk[i].nid); + + if (re == end) + break; + } +} + +/** + * count_node_pages - callback to build per-node memory info structures + * @start: physical start of range + * @len: length of range + * @node: node where this range resides + * + * Each node has its own number of physical pages, DMAable pages, start, and + * end page frame number. This routine will be called by call_pernode_memory() + * for each piece of usable memory and will set up these values for each node. + * Very similar to build_maps(). + */ +static int count_node_pages(unsigned long start, unsigned long len, int node) +{ + unsigned long end = start + len; + + mem_data[node].num_physpages += len >> PAGE_SHIFT; + if (start <= __pa(MAX_DMA_ADDRESS)) + mem_data[node].num_dma_physpages += + (min(end, __pa(MAX_DMA_ADDRESS)) - start) >>PAGE_SHIFT; + start = GRANULEROUNDDOWN(start); + start = ORDERROUNDDOWN(start); + end = GRANULEROUNDUP(end); + mem_data[node].max_pfn = max(mem_data[node].max_pfn, + end >> PAGE_SHIFT); + mem_data[node].min_pfn = min(mem_data[node].min_pfn, + start >> PAGE_SHIFT); + + return 0; +} + +/** + * paging_init - setup page tables + * + * paging_init() sets up the page tables for each node of the system and frees + * the bootmem allocator memory for general use.
+ */ +void paging_init(void) +{ + unsigned long max_dma; + unsigned long zones_size[MAX_NR_ZONES]; + unsigned long zholes_size[MAX_NR_ZONES]; + unsigned long max_gap, pfn_offset = 0; + int node; + + max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; + max_gap = 0; + efi_memmap_walk(find_largest_hole, &max_gap); + + /* so min() will work in count_node_pages */ + for (node = 0; node < numnodes; node++) + mem_data[node].min_pfn = ~0UL; + + efi_memmap_walk(filter_rsvd_memory, count_node_pages); + + for (node = 0; node < numnodes; node++) { + memset(zones_size, 0, sizeof(zones_size)); + memset(zholes_size, 0, sizeof(zholes_size)); + + num_physpages += mem_data[node].num_physpages; + + if (mem_data[node].min_pfn >= max_dma) { + /* All of this node's memory is above ZONE_DMA */ + zones_size[ZONE_NORMAL] = mem_data[node].max_pfn - + mem_data[node].min_pfn; + zholes_size[ZONE_NORMAL] = mem_data[node].max_pfn - + mem_data[node].min_pfn - + mem_data[node].num_physpages; + } else if (mem_data[node].max_pfn < max_dma) { + /* All of this node's memory is in ZONE_DMA */ + zones_size[ZONE_DMA] = mem_data[node].max_pfn - + mem_data[node].min_pfn; + zholes_size[ZONE_DMA] = mem_data[node].max_pfn - + mem_data[node].min_pfn - + mem_data[node].num_dma_physpages; + } else { + /* This node has memory in both zones */ + zones_size[ZONE_DMA] = max_dma - + mem_data[node].min_pfn; + zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - + mem_data[node].num_dma_physpages; + zones_size[ZONE_NORMAL] = mem_data[node].max_pfn - + max_dma; + zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] - + (mem_data[node].num_physpages - + mem_data[node].num_dma_physpages); + } + + if (node == 0) { + vmalloc_end -= + PAGE_ALIGN(max_low_pfn * sizeof(struct page)); + vmem_map = (struct page *) vmalloc_end; + + efi_memmap_walk(create_mem_map_page_table, 0); + printk("Virtual mem_map starts at 0x%p\n", vmem_map); + } + + pfn_offset = mem_data[node].min_pfn; + + free_area_init_node(node, NODE_DATA(node), + vmem_map + pfn_offset, zones_size, + pfn_offset, zholes_size); } + + zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } diff -Nru a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c --- a/arch/ia64/mm/hugetlbpage.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/mm/hugetlbpage.c Fri Oct 17 23:12:58 2003 @@ -20,13 +20,46 @@ #define TASK_HPAGE_BASE (REGION_HPAGE << REGION_SHIFT) -static long htlbpagemem; -int htlbpage_max; -static long htlbzone_pages; +static long htlbpagemem; +int htlbpage_max; +static long htlbzone_pages; -static LIST_HEAD(htlbpage_freelist); +static struct list_head hugepage_freelists[MAX_NUMNODES]; static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED; +static void enqueue_huge_page(struct page *page) +{ + list_add(&page->list, + &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]); +} + +static struct page *dequeue_huge_page(void) +{ + int nid = numa_node_id(); + struct page *page = NULL; + + if (list_empty(&hugepage_freelists[nid])) { + for (nid = 0; nid < MAX_NUMNODES; ++nid) + if (!list_empty(&hugepage_freelists[nid])) + break; + } + if (nid >= 0 && nid < MAX_NUMNODES && + !list_empty(&hugepage_freelists[nid])) { + page = list_entry(hugepage_freelists[nid].next, struct page, list); + list_del(&page->list); + } + return page; +} + +static struct page *alloc_fresh_huge_page(void) +{ + static int nid = 0; + struct page *page; + page = alloc_pages_node(nid, GFP_HIGHUSER, HUGETLB_PAGE_ORDER); + nid = (nid + 1) % numnodes; + return page; +} + void free_huge_page(struct page *page); static 
struct page *alloc_hugetlb_page(void) @@ -35,13 +68,11 @@ struct page *page; spin_lock(&htlbpage_lock); - if (list_empty(&htlbpage_freelist)) { + page = dequeue_huge_page(); + if (!page) { spin_unlock(&htlbpage_lock); return NULL; } - - page = list_entry(htlbpage_freelist.next, struct page, list); - list_del(&page->list); htlbpagemem--; spin_unlock(&htlbpage_lock); set_page_count(page, 1); @@ -228,7 +259,7 @@ INIT_LIST_HEAD(&page->list); spin_lock(&htlbpage_lock); - list_add(&page->list, &htlbpage_freelist); + enqueue_huge_page(page); htlbpagemem++; spin_unlock(&htlbpage_lock); } @@ -371,7 +402,7 @@ map = NULL; spin_lock(&htlbpage_lock); - list_for_each(p, &htlbpage_freelist) { + list_for_each(p, &hugepage_freelists[0]) { if (map) { list_del(&map->list); update_and_free_page(map); @@ -408,11 +439,11 @@ return (int)htlbzone_pages; if (lcount > 0) { /* Increase the mem size. */ while (lcount--) { - page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER); + page = alloc_fresh_huge_page(); if (page == NULL) break; spin_lock(&htlbpage_lock); - list_add(&page->list, &htlbpage_freelist); + enqueue_huge_page(page); htlbpagemem++; htlbzone_pages++; spin_unlock(&htlbpage_lock); @@ -449,17 +480,18 @@ static int __init hugetlb_init(void) { - int i, j; + int i; struct page *page; + for (i = 0; i < MAX_NUMNODES; ++i) + INIT_LIST_HEAD(&hugepage_freelists[i]); + for (i = 0; i < htlbpage_max; ++i) { - page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER); + page = alloc_fresh_huge_page(); if (!page) break; - for (j = 0; j < HPAGE_SIZE/PAGE_SIZE; ++j) - SetPageReserved(&page[j]); spin_lock(&htlbpage_lock); - list_add(&page->list, &htlbpage_freelist); + enqueue_huge_page(page); spin_unlock(&htlbpage_lock); } htlbpage_max = htlbpagemem = htlbzone_pages = i; diff -Nru a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c --- a/arch/ia64/mm/init.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/mm/init.c Fri Oct 17 23:12:58 2003 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -40,10 +42,10 @@ unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL; #ifdef CONFIG_VIRTUAL_MEM_MAP -# define LARGE_GAP 0x40000000 /* Use virtual mem map if hole is > than this */ unsigned long vmalloc_end = VMALLOC_END_INIT; - static struct page *vmem_map; - static unsigned long num_dma_physpages; + struct page *vmem_map; + + EXPORT_SYMBOL(vmem_map); #endif static int pgt_cache_water[2] = { 25, 50 }; @@ -337,11 +339,12 @@ #ifdef CONFIG_VIRTUAL_MEM_MAP -static int +int create_mem_map_page_table (u64 start, u64 end, void *arg) { unsigned long address, start_page, end_page; struct page *map_start, *map_end; + int node; pgd_t *pgd; pmd_t *pmd; pte_t *pte; @@ -351,19 +354,20 @@ start_page = (unsigned long) map_start & PAGE_MASK; end_page = PAGE_ALIGN((unsigned long) map_end); + node = paddr_to_nid(__pa(start)); for (address = start_page; address < end_page; address += PAGE_SIZE) { pgd = pgd_offset_k(address); if (pgd_none(*pgd)) - pgd_populate(&init_mm, pgd, alloc_bootmem_pages(PAGE_SIZE)); + pgd_populate(&init_mm, pgd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)); pmd = pmd_offset(pgd, address); if (pmd_none(*pmd)) - pmd_populate_kernel(&init_mm, pmd, alloc_bootmem_pages(PAGE_SIZE)); + pmd_populate_kernel(&init_mm, pmd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)); pte = pte_offset_kernel(pmd, address); if (pte_none(*pte)) - set_pte(pte, pfn_pte(__pa(alloc_bootmem_pages(PAGE_SIZE)) >> PAGE_SHIFT, + set_pte(pte, 
pfn_pte(__pa(alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)) >> PAGE_SHIFT, PAGE_KERNEL)); } return 0; @@ -433,17 +437,7 @@ return __get_user(byte, (char *) pfn_to_page(pfn)) == 0; } -static int -count_dma_pages (u64 start, u64 end, void *arg) -{ - unsigned long *count = arg; - - if (end <= MAX_DMA_ADDRESS) - *count += (end - start) >> PAGE_SHIFT; - return 0; -} - -static int +int find_largest_hole (u64 start, u64 end, void *arg) { u64 *max_gap = arg; @@ -458,103 +452,6 @@ return 0; } #endif /* CONFIG_VIRTUAL_MEM_MAP */ - -static int -count_pages (u64 start, u64 end, void *arg) -{ - unsigned long *count = arg; - - *count += (end - start) >> PAGE_SHIFT; - return 0; -} - -/* - * Set up the page tables. - */ - -#ifdef CONFIG_DISCONTIGMEM -void -paging_init (void) -{ - extern void discontig_paging_init(void); - - discontig_paging_init(); - efi_memmap_walk(count_pages, &num_physpages); - zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); -} -#else /* !CONFIG_DISCONTIGMEM */ -void -paging_init (void) -{ - unsigned long max_dma; - unsigned long zones_size[MAX_NR_ZONES]; -# ifdef CONFIG_VIRTUAL_MEM_MAP - unsigned long zholes_size[MAX_NR_ZONES]; - unsigned long max_gap; -# endif - - /* initialize mem_map[] */ - - memset(zones_size, 0, sizeof(zones_size)); - - num_physpages = 0; - efi_memmap_walk(count_pages, &num_physpages); - - max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; - -# ifdef CONFIG_VIRTUAL_MEM_MAP - memset(zholes_size, 0, sizeof(zholes_size)); - - num_dma_physpages = 0; - efi_memmap_walk(count_dma_pages, &num_dma_physpages); - - if (max_low_pfn < max_dma) { - zones_size[ZONE_DMA] = max_low_pfn; - zholes_size[ZONE_DMA] = max_low_pfn - num_dma_physpages; - } else { - zones_size[ZONE_DMA] = max_dma; - zholes_size[ZONE_DMA] = max_dma - num_dma_physpages; - if (num_physpages > num_dma_physpages) { - zones_size[ZONE_NORMAL] = max_low_pfn - max_dma; - zholes_size[ZONE_NORMAL] = ((max_low_pfn - max_dma) - - (num_physpages - num_dma_physpages)); - } - } - - max_gap = 0; - efi_memmap_walk(find_largest_hole, (u64 *)&max_gap); - if (max_gap < LARGE_GAP) { - vmem_map = (struct page *) 0; - free_area_init_node(0, &contig_page_data, NULL, zones_size, 0, zholes_size); - mem_map = contig_page_data.node_mem_map; - } - else { - unsigned long map_size; - - /* allocate virtual_mem_map */ - - map_size = PAGE_ALIGN(max_low_pfn * sizeof(struct page)); - vmalloc_end -= map_size; - vmem_map = (struct page *) vmalloc_end; - efi_memmap_walk(create_mem_map_page_table, 0); - - free_area_init_node(0, &contig_page_data, vmem_map, zones_size, 0, zholes_size); - - mem_map = contig_page_data.node_mem_map; - printk("Virtual mem_map starts at 0x%p\n", mem_map); - } -# else /* !CONFIG_VIRTUAL_MEM_MAP */ - if (max_low_pfn < max_dma) - zones_size[ZONE_DMA] = max_low_pfn; - else { - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = max_low_pfn - max_dma; - } - free_area_init(zones_size); -# endif /* !CONFIG_VIRTUAL_MEM_MAP */ - zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); -} -#endif /* !CONFIG_DISCONTIGMEM */ static int count_reserved_pages (u64 start, u64 end, void *arg) diff -Nru a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c --- a/arch/ia64/mm/numa.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/mm/numa.c Fri Oct 17 23:12:58 2003 @@ -11,12 +11,19 @@ */ #include +#include #include +#include #include +#include #include #include #include +static struct memblk *sysfs_memblks; +static struct node *sysfs_nodes; +static struct cpu *sysfs_cpus; + /* * The following 
structures are usually initialized by ACPI or * similar mechanisms and describe the NUMA characteristics of the machine. @@ -43,3 +50,49 @@ return (i < num_memblks) ? node_memblk[i].nid : (num_memblks ? -1 : 0); } + +static int __init topology_init(void) +{ + int i, err = 0; + + sysfs_nodes = kmalloc(sizeof(struct node) * numnodes, GFP_KERNEL); + if (!sysfs_nodes) { + err = -ENOMEM; + goto out; + } + + sysfs_memblks = kmalloc(sizeof(struct memblk) * num_memblks, + GFP_KERNEL); + if (!sysfs_memblks) { + kfree(sysfs_nodes); + err = -ENOMEM; + goto out; + } + + sysfs_cpus = kmalloc(sizeof(struct cpu) * NR_CPUS, GFP_KERNEL); + if (!sysfs_cpus) { + kfree(sysfs_memblks); + kfree(sysfs_nodes); + err = -ENOMEM; + goto out; + } + + for (i = 0; i < numnodes; i++) + if ((err = register_node(&sysfs_nodes[i], i, 0))) + goto out; + + for (i = 0; i < num_memblks; i++) + if ((err = register_memblk(&sysfs_memblks[i], i, + &sysfs_nodes[memblk_to_node(i)]))) + goto out; + + for (i = 0; i < NR_CPUS; i++) + if (cpu_online(i)) + if((err = register_cpu(&sysfs_cpus[i], i, + &sysfs_nodes[cpu_to_node(i)]))) + goto out; + out: + return err; +} + +__initcall(topology_init); diff -Nru a/arch/ia64/sn/io/machvec/pci_bus_cvlink.c b/arch/ia64/sn/io/machvec/pci_bus_cvlink.c --- a/arch/ia64/sn/io/machvec/pci_bus_cvlink.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/sn/io/machvec/pci_bus_cvlink.c Fri Oct 17 23:12:58 2003 @@ -867,6 +867,9 @@ int i = 0; struct pci_controller *controller; + if (!ia64_platform_is("sn2")) + return 0; + /* * set pci_raw_ops, etc. */ diff -Nru a/arch/ia64/sn/io/sn2/ml_SN_intr.c b/arch/ia64/sn/io/sn2/ml_SN_intr.c --- a/arch/ia64/sn/io/sn2/ml_SN_intr.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/sn/io/sn2/ml_SN_intr.c Fri Oct 17 23:12:58 2003 @@ -285,7 +285,6 @@ cpuid_t intr_heuristic(vertex_hdl_t dev, int req_bit, int *resp_bit) { cpuid_t cpuid; - cpuid_t candidate = CPU_NONE; vertex_hdl_t pconn_vhdl; pcibr_soft_t pcibr_soft; int bit; @@ -293,30 +292,32 @@ /* XXX: gross layering violation.. */ if (hwgraph_edge_get(dev, EDGE_LBL_PCI, &pconn_vhdl) == GRAPH_SUCCESS) { pcibr_soft = pcibr_soft_get(pconn_vhdl); - if (pcibr_soft && pcibr_soft->bsi_err_intr) - candidate = ((hub_intr_t)pcibr_soft->bsi_err_intr)->i_cpuid; - } - - if (candidate != CPU_NONE) { - /* - * The cpu was chosen already when we assigned - * the error interrupt. - */ - bit = intr_reserve_level(candidate, req_bit); - if (bit >= 0) { - *resp_bit = bit; - return candidate; + if (pcibr_soft && pcibr_soft->bsi_err_intr) { + /* + * The cpu was chosen already when we assigned + * the error interrupt. + */ + cpuid = ((hub_intr_t)pcibr_soft->bsi_err_intr)->i_cpuid; + goto done; } - - printk("Cannot target interrupt to target node (%ld).\n",candidate); - return CPU_NONE; } /* * Need to choose one. Try the controlling c-brick first. */ cpuid = intr_cpu_choose_from_node(master_node_get(dev)); - if (cpuid != CPU_NONE) - return cpuid; - return intr_cpu_choose_node(); + if (cpuid == CPU_NONE) + cpuid = intr_cpu_choose_node(); + + done: + if (cpuid != CPU_NONE) { + bit = intr_reserve_level(cpuid, req_bit); + if (bit >= 0) { + *resp_bit = bit; + return cpuid; + } + } + + printk("Cannot target interrupt to target cpu (%ld).\n", cpuid); + return CPU_NONE; } diff -Nru a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c --- a/arch/ia64/sn/kernel/setup.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/sn/kernel/setup.c Fri Oct 17 23:12:58 2003 @@ -147,7 +147,6 @@ * Sets up an initial console to aid debugging. Intended primarily * for bringup. 
See start_kernel() in init/main.c. */ -#if defined(CONFIG_IA64_EARLY_PRINTK_SGI_SN) || defined(CONFIG_IA64_SGI_SN_SIM) void __init early_sn_setup(void) @@ -189,7 +188,6 @@ printk(KERN_DEBUG "early_sn_setup: setting master_node_bedrock_address to 0x%lx\n", master_node_bedrock_address); } } -#endif /* CONFIG_IA64_EARLY_PRINTK_SGI_SN */ #ifdef CONFIG_IA64_MCA extern int platform_intr_list[]; diff -Nru a/arch/ia64/sn/kernel/sn2/io.c b/arch/ia64/sn/kernel/sn2/io.c --- a/arch/ia64/sn/kernel/sn2/io.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/sn/kernel/sn2/io.c Fri Oct 17 23:12:58 2003 @@ -11,6 +11,8 @@ #include +#ifdef CONFIG_IA64_GENERIC + #undef __sn_inb #undef __sn_inw #undef __sn_inl @@ -81,3 +83,5 @@ { return ___sn_readq (addr); } + +#endif diff -Nru a/drivers/acpi/tables.c b/drivers/acpi/tables.c --- a/drivers/acpi/tables.c Fri Oct 17 23:12:58 2003 +++ b/drivers/acpi/tables.c Fri Oct 17 23:12:58 2003 @@ -262,10 +262,17 @@ /* Map the DSDT header via the pointer in the FADT */ if (id == ACPI_DSDT) { - struct acpi_table_fadt *fadt = (struct acpi_table_fadt *) *header; + struct fadt_descriptor_rev2 *fadt = (struct fadt_descriptor_rev2 *) *header; + + if (fadt->header.revision == 3 && fadt->Xdsdt) { + *header = (void *) __acpi_map_table(fadt->Xdsdt, + sizeof(struct acpi_table_header)); + } else if (fadt->V1_dsdt) { + *header = (void *) __acpi_map_table(fadt->V1_dsdt, + sizeof(struct acpi_table_header)); + } else + *header = 0; - *header = (void *) __acpi_map_table(fadt->dsdt_addr, - sizeof(struct acpi_table_header)); if (!*header) { printk(KERN_WARNING PREFIX "Unable to map DSDT\n"); return -ENODEV; diff -Nru a/drivers/media/radio/Makefile b/drivers/media/radio/Makefile --- a/drivers/media/radio/Makefile Fri Oct 17 23:12:58 2003 +++ b/drivers/media/radio/Makefile Fri Oct 17 23:12:58 2003 @@ -2,6 +2,8 @@ # Makefile for the kernel character device drivers. # +obj-y := dummy.o + miropcm20-objs := miropcm20-rds-core.o miropcm20-radio.o obj-$(CONFIG_RADIO_AZTECH) += radio-aztech.o diff -Nru a/drivers/media/radio/dummy.c b/drivers/media/radio/dummy.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/drivers/media/radio/dummy.c Fri Oct 17 23:12:59 2003 @@ -0,0 +1 @@ +/* just so the linker knows what kind of object files it's dealing with... */ diff -Nru a/drivers/media/video/Makefile b/drivers/media/video/Makefile --- a/drivers/media/video/Makefile Fri Oct 17 23:12:58 2003 +++ b/drivers/media/video/Makefile Fri Oct 17 23:12:58 2003 @@ -7,6 +7,7 @@ zoran-objs := zr36120.o zr36120_i2c.o zr36120_mem.o zr36067-objs := zoran_procfs.o zoran_device.o \ zoran_driver.o zoran_card.o +obj-y := dummy.o obj-$(CONFIG_VIDEO_DEV) += videodev.o v4l2-common.o v4l1-compat.o diff -Nru a/drivers/media/video/dummy.c b/drivers/media/video/dummy.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/drivers/media/video/dummy.c Fri Oct 17 23:12:59 2003 @@ -0,0 +1 @@ +/* just so the linker knows what kind of object files it's dealing with... 
*/ diff -Nru a/drivers/net/tulip/media.c b/drivers/net/tulip/media.c --- a/drivers/net/tulip/media.c Fri Oct 17 23:12:58 2003 +++ b/drivers/net/tulip/media.c Fri Oct 17 23:12:58 2003 @@ -278,6 +278,10 @@ for (i = 0; i < init_length; i++) outl(init_sequence[i], ioaddr + CSR12); } + + (void) inl(ioaddr + CSR6); /* flush CSR12 writes */ + udelay(500); /* Give MII time to recover */ + tmp_info = get_u16(&misc_info[1]); if (tmp_info) tp->advertising[phy_num] = tmp_info | 1; diff -Nru a/drivers/serial/8250.c b/drivers/serial/8250.c --- a/drivers/serial/8250.c Fri Oct 17 23:12:58 2003 +++ b/drivers/serial/8250.c Fri Oct 17 23:12:58 2003 @@ -2086,6 +2086,9 @@ int __init early_serial_setup(struct uart_port *port) { + if (port->line >= ARRAY_SIZE(serial8250_ports)) + return -ENODEV; + serial8250_isa_init_ports(); serial8250_ports[port->line].port = *port; serial8250_ports[port->line].port.ops = &serial8250_pops; diff -Nru a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c --- a/drivers/serial/serial_core.c Fri Oct 17 23:12:58 2003 +++ b/drivers/serial/serial_core.c Fri Oct 17 23:12:58 2003 @@ -1859,6 +1859,9 @@ if (flow == 'r') termios.c_cflag |= CRTSCTS; + if (!port->ops) + return 0; + port->ops->set_termios(port, &termios, NULL); co->cflag = termios.c_cflag; diff -Nru a/include/asm-ia64/asmmacro.h b/include/asm-ia64/asmmacro.h --- a/include/asm-ia64/asmmacro.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/asmmacro.h Fri Oct 17 23:12:58 2003 @@ -68,20 +68,25 @@ * we'll patch out the work-around bundles with NOPs, so their impact is minimal. */ #define DO_MCKINLEY_E9_WORKAROUND + #ifdef DO_MCKINLEY_E9_WORKAROUND .section ".data.patch.mckinley_e9", "a" .previous /* workaround for Itanium 2 Errata 9: */ -# define MCKINLEY_E9_WORKAROUND \ - .xdata4 ".data.patch.mckinley_e9", 1f-.;\ -1:{ .mib; \ - nop.m 0; \ - nop.i 0; \ - br.call.sptk.many b7=1f;; \ - }; \ -1: +# define FSYS_RETURN \ + .xdata4 ".data.patch.mckinley_e9", 1f-.; \ +1:{ .mib; \ + nop.m 0; \ + mov r16=ar.pfs; \ + br.call.sptk.many b7=2f;; \ + }; \ +2:{ .mib; \ + nop.m 0; \ + mov ar.pfs=r16; \ + br.ret.sptk.many b6;; \ + } #else -# define MCKINLEY_E9_WORKAROUND +# define FSYS_RETURN br.ret.sptk.many b6 #endif #endif /* _ASM_IA64_ASMMACRO_H */ diff -Nru a/include/asm-ia64/delay.h b/include/asm-ia64/delay.h --- a/include/asm-ia64/delay.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/delay.h Fri Oct 17 23:12:58 2003 @@ -67,14 +67,15 @@ return result; } +extern void ia64_delay_loop (unsigned long loops); + static __inline__ void __delay (unsigned long loops) { - if (loops < 1) + if (unlikely(loops < 1)) return; - while (loops--) - ia64_nop(0); + ia64_delay_loop (loops - 1); } static __inline__ void diff -Nru a/include/asm-ia64/machvec_sn2.h b/include/asm-ia64/machvec_sn2.h --- a/include/asm-ia64/machvec_sn2.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/machvec_sn2.h Fri Oct 17 23:12:58 2003 @@ -99,4 +99,6 @@ #define platform_dma_sync_sg sn_dma_sync_sg #define platform_dma_supported sn_dma_supported +#include + #endif /* _ASM_IA64_MACHVEC_SN2_H */ diff -Nru a/include/asm-ia64/mca.h b/include/asm-ia64/mca.h --- a/include/asm-ia64/mca.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/mca.h Fri Oct 17 23:12:58 2003 @@ -108,8 +108,6 @@ IA64_MCA_NEW_CONTEXT = -1 /* SAL to return to new context */ }; -#define MIN_STATE_AREA_SIZE 57 - typedef struct ia64_mca_os_to_sal_state_s { u64 imots_os_status; /* OS status to SAL as to what happened * with the MCA handling. 
diff -Nru a/include/asm-ia64/mca_asm.h b/include/asm-ia64/mca_asm.h --- a/include/asm-ia64/mca_asm.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/mca_asm.h Fri Oct 17 23:12:58 2003 @@ -110,10 +110,9 @@ ;; \ dep temp1 = -1, temp1, PSR_MC, 1; \ ;; \ - movl temp2 = start_addr; \ mov cr.ipsr = temp1; \ ;; \ - INST_VA_TO_PA(temp2); \ + LOAD_PHYSICAL(p0, temp2, start_addr); \ ;; \ mov cr.iip = temp2; \ mov cr.ifs = r0; \ diff -Nru a/include/asm-ia64/meminit.h b/include/asm-ia64/meminit.h --- a/include/asm-ia64/meminit.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/meminit.h Fri Oct 17 23:12:58 2003 @@ -7,6 +7,8 @@ * for more details. */ +#include + /* * Entries defined so far: * - boot param structure itself @@ -32,10 +34,27 @@ extern void find_initrd (void); extern int filter_rsvd_memory (unsigned long start, unsigned long end, void *arg); +/* + * For rounding an address to the next IA64_GRANULE_SIZE or order + */ +#define GRANULEROUNDDOWN(n) ((n) & ~(IA64_GRANULE_SIZE-1)) +#define GRANULEROUNDUP(n) (((n)+IA64_GRANULE_SIZE-1) & ~(IA64_GRANULE_SIZE-1)) +#define ORDERROUNDDOWN(n) ((n) & ~((PAGE_SIZE< than this */ + extern unsigned long vmalloc_end; + extern struct page *vmem_map; + extern int find_largest_hole (u64 start, u64 end, void *arg); + extern int create_mem_map_page_table (u64 start, u64 end, void *arg); +#endif #endif /* meminit_h */ diff -Nru a/include/asm-ia64/mmzone.h b/include/asm-ia64/mmzone.h --- a/include/asm-ia64/mmzone.h Fri Oct 17 23:12:59 2003 +++ b/include/asm-ia64/mmzone.h Fri Oct 17 23:12:59 2003 @@ -3,7 +3,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (c) 2000 Silicon Graphics, Inc. All rights reserved. + * Copyright (c) 2000,2003 Silicon Graphics, Inc. All rights reserved. * Copyright (c) 2002 NEC Corp. * Copyright (c) 2002 Erich Focht * Copyright (c) 2002 Kimio Suganuma @@ -12,148 +12,26 @@ #define _ASM_IA64_MMZONE_H #include -#include - -/* - * Given a kaddr, find the base mem_map address for the start of the mem_map - * entries for the bank containing the kaddr. - */ -#define BANK_MEM_MAP_BASE(kaddr) local_node_data->bank_mem_map_base[BANK_MEM_MAP_INDEX(kaddr)] - -/* - * Given a kaddr, this macro return the relative map number - * within the bank. - */ -#define BANK_MAP_NR(kaddr) (BANK_OFFSET(kaddr) >> PAGE_SHIFT) - -/* - * Given a pte, this macro returns a pointer to the page struct for the pte. - */ -#define pte_page(pte) virt_to_page(PAGE_OFFSET | (pte_val(pte)&_PFN_MASK)) - -/* - * Determine if a kaddr is a valid memory address of memory that - * actually exists. - * - * The check consists of 2 parts: - * - verify that the address is a region 7 address & does not - * contain any bits that preclude it from being a valid platform - * memory address - * - verify that the chunk actually exists. - * - * Note that IO addresses are NOT considered valid addresses. - * - * Note, many platforms can simply check if kaddr exceeds a specific size. - * (However, this won't work on SGI platforms since IO space is embedded - * within the range of valid memory addresses & nodes have holes in the - * address range between banks). - */ -#define kern_addr_valid(kaddr) ({long _kav=(long)(kaddr); \ - VALID_MEM_KADDR(_kav);}) - -/* - * Given a kaddr, return a pointer to the page struct for the page. - * If the kaddr does not represent RAM memory that potentially exists, return - * a pointer the page struct for max_mapnr. IO addresses will - * return the page for max_nr. 
Addresses in unpopulated RAM banks may - * return undefined results OR may panic the system. - * - */ -#define virt_to_page(kaddr) ({long _kvtp=(long)(kaddr); \ - (VALID_MEM_KADDR(_kvtp)) \ - ? BANK_MEM_MAP_BASE(_kvtp) + BANK_MAP_NR(_kvtp) \ - : NULL;}) - -/* - * Given a page struct entry, return the physical address that the page struct represents. - * Since IA64 has all memory in the DMA zone, the following works: - */ -#define page_to_phys(page) __pa(page_address(page)) - -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) - -#define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) - -#define pfn_to_page(pfn) (struct page *)(node_mem_map(pfn_to_nid(pfn)) + node_localnr(pfn, pfn_to_nid(pfn))) - -#define pfn_to_nid(pfn) local_node_data->node_id_map[(pfn << PAGE_SHIFT) >> BANKSHIFT] - -#define page_to_pfn(page) (long)((page - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn) +#include +#include +#ifdef CONFIG_DISCONTIGMEM -/* - * pfn_valid should be made as fast as possible, and the current definition - * is valid for machines that are NUMA, but still contiguous, which is what - * is currently supported. A more generalised, but slower definition would - * be something like this - mbligh: - * ( pfn_to_pgdat(pfn) && (pfn < node_end_pfn(pfn_to_nid(pfn))) ) - */ -#define pfn_valid(pfn) (pfn < max_low_pfn) -extern unsigned long max_low_pfn; - - -#if defined(CONFIG_IA64_DIG) - -/* - * Platform definitions for DIG platform with contiguous memory. - */ -#define MAX_PHYSNODE_ID 8 /* Maximum node number +1 */ -#define MAX_PHYS_MEMORY (1UL << 40) /* 1 TB */ - -/* - * Bank definitions. - * Configurable settings for DIG: 512MB/bank: 16GB/node, - * 2048MB/bank: 64GB/node, - * 8192MB/bank: 256GB/node. - */ -#define NR_BANKS_PER_NODE 32 -#if defined(CONFIG_IA64_NODESIZE_16GB) -# define BANKSHIFT 29 -#elif defined(CONFIG_IA64_NODESIZE_64GB) -# define BANKSHIFT 31 -#elif defined(CONFIG_IA64_NODESIZE_256GB) -# define BANKSHIFT 33 -#else -# error Unsupported bank and nodesize! +#ifdef CONFIG_IA64_DIG /* DIG systems are small */ +# define MAX_PHYSNODE_ID 8 +# define NR_NODES 8 +# define NR_MEMBLKS (NR_NODES * 32) +#else /* sn2 is the biggest case, so we use that if !DIG */ +# define MAX_PHYSNODE_ID 2048 +# define NR_NODES 256 +# define NR_MEMBLKS (NR_NODES) #endif -#define BANKSIZE (1UL << BANKSHIFT) -#elif defined(CONFIG_IA64_SGI_SN2) - -/* - * SGI SN2 discontig definitions - */ -#define MAX_PHYSNODE_ID 2048 /* 2048 node ids (also called nasid) */ -#define MAX_PHYS_MEMORY (1UL << 49) - -#define NR_BANKS_PER_NODE 4 -#define BANKSHIFT 38 -#define SN2_NODE_SIZE (64UL*1024*1024*1024) /* 64GB per node */ -#define BANKSIZE (SN2_NODE_SIZE/NR_BANKS_PER_NODE) - -#endif /* CONFIG_IA64_DIG */ - -#if defined(CONFIG_IA64_DIG) || defined (CONFIG_IA64_SGI_SN2) -/* Common defines for both platforms */ -#include -#define BANK_OFFSET(addr) ((unsigned long)(addr) & (BANKSIZE-1)) -#define NR_BANKS (NR_BANKS_PER_NODE * (1 << NODES_SHIFT)) -#define NR_MEMBLKS (NR_BANKS) - -/* - * VALID_MEM_KADDR returns a boolean to indicate if a kaddr is - * potentially a valid cacheable identity mapped RAM memory address. - * Note that the RAM may or may not actually be present!! - */ -#define VALID_MEM_KADDR(kaddr) 1 - -/* - * Given a nodeid & a bank number, find the address of the mem_map - * entry for the first page of the bank. 
- */ -#define BANK_MEM_MAP_INDEX(kaddr) \ - (((unsigned long)(kaddr) & (MAX_PHYS_MEMORY-1)) >> BANKSHIFT) +extern unsigned long max_low_pfn; -#endif /* CONFIG_IA64_DIG || CONFIG_IA64_SGI_SN2 */ +#define pfn_valid(pfn) (((pfn) < max_low_pfn) && ia64_pfn_valid(pfn)) +#define page_to_pfn(page) ((unsigned long) (page - vmem_map)) +#define pfn_to_page(pfn) (vmem_map + (pfn)) +#endif /* CONFIG_DISCONTIGMEM */ #endif /* _ASM_IA64_MMZONE_H */ diff -Nru a/include/asm-ia64/nodedata.h b/include/asm-ia64/nodedata.h --- a/include/asm-ia64/nodedata.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/nodedata.h Fri Oct 17 23:12:58 2003 @@ -11,9 +11,14 @@ #ifndef _ASM_IA64_NODEDATA_H #define _ASM_IA64_NODEDATA_H +#include #include + +#include #include +#ifdef CONFIG_DISCONTIGMEM + /* * Node Data. One of these structures is located on each node of a NUMA system. */ @@ -22,10 +27,7 @@ struct ia64_node_data { short active_cpu_count; short node; - struct pglist_data *pg_data_ptrs[MAX_NUMNODES]; - struct page *bank_mem_map_base[NR_BANKS]; - struct ia64_node_data *node_data_ptrs[MAX_NUMNODES]; - short node_id_map[NR_BANKS]; + struct pglist_data *pg_data_ptrs[NR_NODES]; }; @@ -34,41 +36,17 @@ */ #define local_node_data (local_cpu_data->node_data) - -/* - * Return a pointer to the node_data structure for the specified node. - */ -#define node_data(node) (local_node_data->node_data_ptrs[node]) - -/* - * Get a pointer to the node_id/node_data for the current cpu. - * (boot time only) - */ -extern int boot_get_local_nodeid(void); -extern struct ia64_node_data *get_node_data_ptr(void); - /* * Given a node id, return a pointer to the pg_data_t for the node. - * The following 2 macros are similar. * * NODE_DATA - should be used in all code not related to system * initialization. It uses pernode data structures to minimize * offnode memory references. However, these structure are not * present during boot. This macro can be used once cpu_init * completes. - * - * BOOT_NODE_DATA - * - should be used during system initialization - * prior to freeing __initdata. It does not depend on the percpu - * area being present. - * - * NOTE: The names of these macros are misleading but are difficult to change - * since they are used in generic linux & on other architecures. */ #define NODE_DATA(nid) (local_node_data->pg_data_ptrs[nid]) -#define BOOT_NODE_DATA(nid) boot_get_pg_data_ptr((long)(nid)) -struct pglist_data; -extern struct pglist_data * __init boot_get_pg_data_ptr(long); +#endif /* CONFIG_DISCONTIGMEM */ #endif /* _ASM_IA64_NODEDATA_H */ diff -Nru a/include/asm-ia64/numa.h b/include/asm-ia64/numa.h --- a/include/asm-ia64/numa.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/numa.h Fri Oct 17 23:12:58 2003 @@ -4,7 +4,7 @@ * for more details. * * This file contains NUMA specific prototypes and definitions. 
- * + * * 2002/08/05 Erich Focht * */ @@ -12,12 +12,17 @@ #define _ASM_IA64_NUMA_H #include -#include #ifdef CONFIG_NUMA -#include #include +#include +#include +#include +#include +#include + +#include extern volatile char cpu_to_node_map[NR_CPUS] __cacheline_aligned; extern volatile cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; @@ -60,6 +65,10 @@ extern int paddr_to_nid(unsigned long paddr); #define local_nodeid (cpu_to_node_map[smp_processor_id()]) + +#else /* !CONFIG_NUMA */ + +#define paddr_to_nid(addr) 0 #endif /* CONFIG_NUMA */ diff -Nru a/include/asm-ia64/page.h b/include/asm-ia64/page.h --- a/include/asm-ia64/page.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/page.h Fri Oct 17 23:12:58 2003 @@ -94,18 +94,20 @@ #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#ifdef CONFIG_VIRTUAL_MEM_MAP +extern int ia64_pfn_valid (unsigned long pfn); +#else +# define ia64_pfn_valid(pfn) 1 +#endif + #ifndef CONFIG_DISCONTIGMEM -# ifdef CONFIG_VIRTUAL_MEM_MAP - extern int ia64_pfn_valid (unsigned long pfn); -# define pfn_valid(pfn) (((pfn) < max_mapnr) && ia64_pfn_valid(pfn)) -# else -# define pfn_valid(pfn) ((pfn) < max_mapnr) -# endif -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) +#define pfn_valid(pfn) (((pfn) < max_mapnr) && ia64_pfn_valid(pfn)) #define page_to_pfn(page) ((unsigned long) (page - mem_map)) #define pfn_to_page(pfn) (mem_map + (pfn)) +#endif /* CONFIG_DISCONTIGMEM */ + #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) -#endif +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) typedef union ia64_va { struct { diff -Nru a/include/asm-ia64/pal.h b/include/asm-ia64/pal.h --- a/include/asm-ia64/pal.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/pal.h Fri Oct 17 23:12:58 2003 @@ -405,10 +405,11 @@ * generated. * (Trap Lost ) */ - op : 3, /* Operation that - * caused the machine - * check + mi : 1, /* More information available + * call PAL_MC_ERROR_INFO */ + pi : 1, /* Precise instruction pointer */ + pm : 1, /* Precise min-state save area */ dy : 1, /* Processor dynamic * state valid @@ -450,11 +451,12 @@ * by the processor */ - reserved2 : 12, + reserved2 : 11, cc : 1, /* Cache check */ tc : 1, /* TLB check */ bc : 1, /* Bus check */ - uc : 1; /* Unknown check */ + rc : 1, /* Register file check */ + uc : 1; /* Uarch check */ } pal_processor_state_info_t; diff -Nru a/include/asm-ia64/percpu.h b/include/asm-ia64/percpu.h --- a/include/asm-ia64/percpu.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/percpu.h Fri Oct 17 23:12:58 2003 @@ -46,11 +46,13 @@ extern void percpu_modcopy(void *pcpudst, const void *src, unsigned long size); extern void setup_per_cpu_areas (void); +extern void *per_cpu_init(void); #else /* ! 
SMP */ #define per_cpu(var, cpu) ((void)cpu, per_cpu__##var) #define __get_cpu_var(var) per_cpu__##var +#define per_cpu_init() (__phys_per_cpu_start) #endif /* SMP */ diff -Nru a/include/asm-ia64/perfmon.h b/include/asm-ia64/perfmon.h --- a/include/asm-ia64/perfmon.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/perfmon.h Fri Oct 17 23:12:58 2003 @@ -38,7 +38,6 @@ */ #define PFM_FL_NOTIFY_BLOCK 0x01 /* block task on user level notifications */ #define PFM_FL_SYSTEM_WIDE 0x02 /* create a system wide context */ -#define PFM_FL_UNSECURE 0x04 /* allow unsecure monitoring for non self-monitoring task */ #define PFM_FL_OVFL_NO_MSG 0x80 /* do not post overflow/end messages for notification */ /* @@ -162,8 +161,6 @@ */ #define PFM_VERSION_MAJ 2U #define PFM_VERSION_MIN 0U -#define PFM_SMPL_HDR_VERSION_MAJ 2U -#define PFM_SMPL_HDR_VERSION_MIN 0U #define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|(PFM_VERSION_MIN & 0xffff)) #define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff) #define PFM_VERSION_MINOR(x) ((x) & 0xffff) @@ -194,9 +191,8 @@ /* * Reset PMD register flags */ -#define PFM_PMD_NO_RESET 0 +#define PFM_PMD_SHORT_RESET 0 #define PFM_PMD_LONG_RESET 1 -#define PFM_PMD_SHORT_RESET 2 typedef union { unsigned int val; @@ -223,7 +219,7 @@ } pfm_ovfl_arg_t; -typedef struct _pfm_buffer_fmt_t { +typedef struct { char *fmt_name; pfm_uuid_t fmt_uuid; size_t fmt_arg_size; @@ -237,8 +233,7 @@ int (*fmt_restart_active)(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs); int (*fmt_exit)(struct task_struct *task, void *buf, struct pt_regs *regs); - struct _pfm_buffer_fmt_t *fmt_next; - struct _pfm_buffer_fmt_t *fmt_prev; + struct list_head fmt_list; } pfm_buffer_fmt_t; extern int pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt); diff -Nru a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h --- a/include/asm-ia64/pgtable.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/pgtable.h Fri Oct 17 23:12:58 2003 @@ -174,7 +174,6 @@ return (addr & (local_cpu_data->unimpl_pa_mask)) == 0; } -#ifndef CONFIG_DISCONTIGMEM /* * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel * memory. For the return value to be meaningful, ADDR must be >= @@ -190,7 +189,6 @@ */ #define kern_addr_valid(addr) (1) -#endif /* * Now come the defines and routines to manage and access the three-level @@ -240,10 +238,8 @@ #define pte_none(pte) (!pte_val(pte)) #define pte_present(pte) (pte_val(pte) & (_PAGE_P | _PAGE_PROTNONE)) #define pte_clear(pte) (pte_val(*(pte)) = 0UL) -#ifndef CONFIG_DISCONTIGMEM /* pte_page() returns the "struct page *" corresponding to the PTE: */ #define pte_page(pte) virt_to_page(((pte_val(pte) & _PFN_MASK) + PAGE_OFFSET)) -#endif #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_bad(pmd) (!ia64_phys_addr_valid(pmd_val(pmd))) diff -Nru a/include/asm-ia64/posix_types.h b/include/asm-ia64/posix_types.h --- a/include/asm-ia64/posix_types.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/posix_types.h Fri Oct 17 23:12:58 2003 @@ -10,7 +10,7 @@ * David Mosberger-Tang */ -typedef unsigned int __kernel_ino_t; +typedef unsigned long __kernel_ino_t; typedef unsigned int __kernel_mode_t; typedef unsigned int __kernel_nlink_t; typedef long __kernel_off_t; diff -Nru a/include/asm-ia64/serial.h b/include/asm-ia64/serial.h --- a/include/asm-ia64/serial.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/serial.h Fri Oct 17 23:12:58 2003 @@ -4,8 +4,6 @@ * Derived from the i386 version. */ -#include - /* * This assumes you have a 1.8432 MHz clock for your UART. 
* @@ -15,107 +13,7 @@ */ #define BASE_BAUD ( 1843200 / 16 ) -#define CONFIG_SERIAL_DETECT_IRQ /* on IA-64, we always want to autodetect irqs */ - -/* Standard COM flags (except for COM4, because of the 8514 problem) */ -#ifdef CONFIG_SERIAL_DETECT_IRQ -#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ) -#define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_AUTO_IRQ) -#else -#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST) -#define STD_COM4_FLAGS ASYNC_BOOT_AUTOCONF -#endif - -#ifdef CONFIG_SERIAL_MANY_PORTS -#define FOURPORT_FLAGS ASYNC_FOURPORT -#define ACCENT_FLAGS 0 -#define BOCA_FLAGS 0 -#define HUB6_FLAGS 0 -#define RS_TABLE_SIZE 64 -#else -#define RS_TABLE_SIZE -#endif - /* - * The following define the access methods for the HUB6 card. All - * access is through two ports for all 24 possible chips. The card is - * selected through the high 2 bits, the port on that card with the - * "middle" 3 bits, and the register on that port with the bottom - * 3 bits. - * - * While the access port and interrupt is configurable, the default - * port locations are 0x302 for the port control register, and 0x303 - * for the data read/write register. Normally, the interrupt is at irq3 - * but can be anything from 3 to 7 inclusive. Note that using 3 will - * require disabling com2. - */ - -#define C_P(card,port) (((card)<<6|(port)<<3) + 1) - -#define STD_SERIAL_PORT_DEFNS \ - /* UART CLK PORT IRQ FLAGS */ \ - { 0, BASE_BAUD, 0x3F8, 4, STD_COM_FLAGS }, /* ttyS0 */ \ - { 0, BASE_BAUD, 0x2F8, 3, STD_COM_FLAGS }, /* ttyS1 */ \ - { 0, BASE_BAUD, 0x3E8, 4, STD_COM_FLAGS }, /* ttyS2 */ \ - { 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS }, /* ttyS3 */ - -#ifdef CONFIG_SERIAL_MANY_PORTS -#define EXTRA_SERIAL_PORT_DEFNS \ - { 0, BASE_BAUD, 0x1A0, 9, FOURPORT_FLAGS }, /* ttyS4 */ \ - { 0, BASE_BAUD, 0x1A8, 9, FOURPORT_FLAGS }, /* ttyS5 */ \ - { 0, BASE_BAUD, 0x1B0, 9, FOURPORT_FLAGS }, /* ttyS6 */ \ - { 0, BASE_BAUD, 0x1B8, 9, FOURPORT_FLAGS }, /* ttyS7 */ \ - { 0, BASE_BAUD, 0x2A0, 5, FOURPORT_FLAGS }, /* ttyS8 */ \ - { 0, BASE_BAUD, 0x2A8, 5, FOURPORT_FLAGS }, /* ttyS9 */ \ - { 0, BASE_BAUD, 0x2B0, 5, FOURPORT_FLAGS }, /* ttyS10 */ \ - { 0, BASE_BAUD, 0x2B8, 5, FOURPORT_FLAGS }, /* ttyS11 */ \ - { 0, BASE_BAUD, 0x330, 4, ACCENT_FLAGS }, /* ttyS12 */ \ - { 0, BASE_BAUD, 0x338, 4, ACCENT_FLAGS }, /* ttyS13 */ \ - { 0, BASE_BAUD, 0x000, 0, 0 }, /* ttyS14 (spare) */ \ - { 0, BASE_BAUD, 0x000, 0, 0 }, /* ttyS15 (spare) */ \ - { 0, BASE_BAUD, 0x100, 12, BOCA_FLAGS }, /* ttyS16 */ \ - { 0, BASE_BAUD, 0x108, 12, BOCA_FLAGS }, /* ttyS17 */ \ - { 0, BASE_BAUD, 0x110, 12, BOCA_FLAGS }, /* ttyS18 */ \ - { 0, BASE_BAUD, 0x118, 12, BOCA_FLAGS }, /* ttyS19 */ \ - { 0, BASE_BAUD, 0x120, 12, BOCA_FLAGS }, /* ttyS20 */ \ - { 0, BASE_BAUD, 0x128, 12, BOCA_FLAGS }, /* ttyS21 */ \ - { 0, BASE_BAUD, 0x130, 12, BOCA_FLAGS }, /* ttyS22 */ \ - { 0, BASE_BAUD, 0x138, 12, BOCA_FLAGS }, /* ttyS23 */ \ - { 0, BASE_BAUD, 0x140, 12, BOCA_FLAGS }, /* ttyS24 */ \ - { 0, BASE_BAUD, 0x148, 12, BOCA_FLAGS }, /* ttyS25 */ \ - { 0, BASE_BAUD, 0x150, 12, BOCA_FLAGS }, /* ttyS26 */ \ - { 0, BASE_BAUD, 0x158, 12, BOCA_FLAGS }, /* ttyS27 */ \ - { 0, BASE_BAUD, 0x160, 12, BOCA_FLAGS }, /* ttyS28 */ \ - { 0, BASE_BAUD, 0x168, 12, BOCA_FLAGS }, /* ttyS29 */ \ - { 0, BASE_BAUD, 0x170, 12, BOCA_FLAGS }, /* ttyS30 */ \ - { 0, BASE_BAUD, 0x178, 12, BOCA_FLAGS }, /* ttyS31 */ -#else -#define EXTRA_SERIAL_PORT_DEFNS -#endif - -/* You can have up to four HUB6's in the system, but I've only - * included two cards here for a total 
of twelve ports. + * All legacy serial ports should be enumerated via ACPI namespace, so + * we need not list them here. */ -#if (defined(CONFIG_HUB6) && defined(CONFIG_SERIAL_MANY_PORTS)) -#define HUB6_SERIAL_PORT_DFNS \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(0,0) }, /* ttyS32 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(0,1) }, /* ttyS33 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(0,2) }, /* ttyS34 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(0,3) }, /* ttyS35 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(0,4) }, /* ttyS36 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(0,5) }, /* ttyS37 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(1,0) }, /* ttyS38 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(1,1) }, /* ttyS39 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(1,2) }, /* ttyS40 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(1,3) }, /* ttyS41 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(1,4) }, /* ttyS42 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(1,5) }, /* ttyS43 */ -#else -#define HUB6_SERIAL_PORT_DFNS -#endif - -#define SERIAL_PORT_DFNS \ - STD_SERIAL_PORT_DEFNS \ - EXTRA_SERIAL_PORT_DEFNS \ - HUB6_SERIAL_PORT_DFNS - diff -Nru a/include/asm-ia64/sn/nodepda.h b/include/asm-ia64/sn/nodepda.h --- a/include/asm-ia64/sn/nodepda.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/sn/nodepda.h Fri Oct 17 23:12:58 2003 @@ -128,7 +128,7 @@ * Check if given a compact node id the corresponding node has all the * cpus disabled. */ -#define is_headless_node(cnode) (!any_online_cpu(node_to_cpumask(cnode))) +#define is_headless_node(cnode) (!node_to_cpu_mask[cnode]) /* * Check if given a node vertex handle the corresponding node has all the diff -Nru a/include/asm-ia64/uaccess.h b/include/asm-ia64/uaccess.h --- a/include/asm-ia64/uaccess.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/uaccess.h Fri Oct 17 23:12:58 2003 @@ -408,11 +408,7 @@ extern void handle_exception (struct pt_regs *regs, const struct exception_table_entry *e); extern const struct exception_table_entry *search_exception_tables (unsigned long addr); -#ifdef GAS_HAS_LOCAL_TAGS # define SEARCH_EXCEPTION_TABLE(regs) search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri) -#else -# define SEARCH_EXCEPTION_TABLE(regs) search_exception_tables(regs->cr_iip) -#endif static inline int done_with_exception (struct pt_regs *regs) diff -Nru a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h --- a/include/asm-ia64/unistd.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/unistd.h Fri Oct 17 23:12:58 2003 @@ -237,17 +237,17 @@ #define __NR_epoll_wait 1245 #define __NR_restart_syscall 1246 #define __NR_semtimedop 1247 -#define __NR_sys_timer_create 1248 -#define __NR_sys_timer_settime 1249 -#define __NR_sys_timer_gettime 1250 -#define __NR_sys_timer_getoverrun 1251 -#define __NR_sys_timer_delete 1252 -#define __NR_sys_clock_settime 1253 -#define __NR_sys_clock_gettime 1254 -#define __NR_sys_clock_getres 1255 -#define __NR_sys_clock_nanosleep 1256 -#define __NR_sys_fstatfs64 1257 -#define __NR_sys_statfs64 1258 +#define __NR_timer_create 1248 +#define __NR_timer_settime 1249 +#define __NR_timer_gettime 1250 +#define __NR_timer_getoverrun 1251 +#define __NR_timer_delete 1252 +#define __NR_clock_settime 1253 +#define __NR_clock_gettime 1254 +#define __NR_clock_getres 1255 +#define __NR_clock_nanosleep 1256 +#define __NR_fstatfs64 1257 +#define __NR_statfs64 1258 #ifdef __KERNEL__ diff -Nru a/include/linux/module.h b/include/linux/module.h --- a/include/linux/module.h Fri Oct 17 23:12:58 2003 +++ 
b/include/linux/module.h Fri Oct 17 23:12:58 2003 @@ -60,10 +60,11 @@ #define __module_cat(a,b) ___module_cat(a,b) #define __MODULE_INFO(tag, name, info) \ static const char __module_cat(name,__LINE__)[] \ + __attribute_used__ \ __attribute__((section(".modinfo"),unused)) = __stringify(tag) "=" info -#define MODULE_GENERIC_TABLE(gtype,name) \ -extern const struct gtype##_id __mod_##gtype##_table \ +#define MODULE_GENERIC_TABLE(gtype,name) \ +extern const struct gtype##_id __mod_##gtype##_table \ __attribute__ ((unused, alias(__stringify(name)))) #define THIS_MODULE (&__this_module) @@ -142,6 +143,7 @@ #define __CRC_SYMBOL(sym, sec) \ extern void *__crc_##sym __attribute__((weak)); \ static const unsigned long __kcrctab_##sym \ + __attribute_used__ \ __attribute__((section("__kcrctab" sec), unused)) \ = (unsigned long) &__crc_##sym; #else @@ -155,6 +157,7 @@ __attribute__((section("__ksymtab_strings"))) \ = MODULE_SYMBOL_PREFIX #sym; \ static const struct kernel_symbol __ksymtab_##sym \ + __attribute_used__ \ __attribute__((section("__ksymtab" sec), unused)) \ = { (unsigned long)&sym, __kstrtab_##sym } diff -Nru a/include/linux/moduleparam.h b/include/linux/moduleparam.h --- a/include/linux/moduleparam.h Fri Oct 17 23:12:58 2003 +++ b/include/linux/moduleparam.h Fri Oct 17 23:12:58 2003 @@ -52,6 +52,7 @@ #define __module_param_call(prefix, name, set, get, arg, perm) \ static char __param_str_##name[] __initdata = prefix #name; \ static struct kernel_param const __param_##name \ + __attribute_used__ \ __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \ = { __param_str_##name, perm, set, get, arg } diff -Nru a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h --- a/include/linux/nfs_fs.h Fri Oct 17 23:12:58 2003 +++ b/include/linux/nfs_fs.h Fri Oct 17 23:12:58 2003 @@ -403,7 +403,7 @@ nfs_size_to_loff_t(__u64 size) { loff_t maxsz = (((loff_t) ULONG_MAX) << PAGE_CACHE_SHIFT) + PAGE_CACHE_SIZE - 1; - if (size > maxsz) + if (size > (__u64) maxsz) return maxsz; return (loff_t) size; } diff -Nru a/include/linux/sysctl.h b/include/linux/sysctl.h --- a/include/linux/sysctl.h Fri Oct 17 23:12:58 2003 +++ b/include/linux/sysctl.h Fri Oct 17 23:12:58 2003 @@ -127,6 +127,7 @@ KERN_PANIC_ON_OOPS=57, /* int: whether we will panic on an oops */ KERN_HPPA_PWRSW=58, /* int: hppa soft-power enable */ KERN_HPPA_UNALIGNED=59, /* int: hppa unaligned-trap enable */ + KERN_CACHEDECAYTICKS=60,/* ulong: value for cache_decay_ticks (EXPERIMENTAL!) */ }; diff -Nru a/kernel/printk.c b/kernel/printk.c --- a/kernel/printk.c Fri Oct 17 23:12:59 2003 +++ b/kernel/printk.c Fri Oct 17 23:12:59 2003 @@ -361,6 +361,12 @@ __call_console_drivers(start, end); } } +#ifdef CONFIG_IA64_EARLY_PRINTK + if (!console_drivers) { + void early_printk (const char *str, size_t len); + early_printk(&LOG_BUF(start), end - start); + } +#endif } /* @@ -678,7 +684,11 @@ * for us. 
*/ spin_lock_irqsave(&logbuf_lock, flags); +#ifdef CONFIG_IA64_EARLY_PRINTK + con_start = log_end; +#else con_start = log_start; +#endif spin_unlock_irqrestore(&logbuf_lock, flags); } release_console_sem(); @@ -731,3 +741,117 @@ tty->driver->write(tty, 0, msg, strlen(msg)); return; } + +#ifdef CONFIG_IA64_EARLY_PRINTK + +#include + +# ifdef CONFIG_IA64_EARLY_PRINTK_VGA + + +#define VGABASE ((char *)0xc0000000000b8000) +#define VGALINES 24 +#define VGACOLS 80 + +static int current_ypos = VGALINES, current_xpos = 0; + +static void +early_printk_vga (const char *str, size_t len) +{ + char c; + int i, k, j; + + while (len-- > 0) { + c = *str++; + if (current_ypos >= VGALINES) { + /* scroll 1 line up */ + for (k = 1, j = 0; k < VGALINES; k++, j++) { + for (i = 0; i < VGACOLS; i++) { + writew(readw(VGABASE + 2*(VGACOLS*k + i)), + VGABASE + 2*(VGACOLS*j + i)); + } + } + for (i = 0; i < VGACOLS; i++) { + writew(0x720, VGABASE + 2*(VGACOLS*j + i)); + } + current_ypos = VGALINES-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(VGACOLS*current_ypos + current_xpos++)); + if (current_xpos >= VGACOLS) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +# endif /* CONFIG_IA64_EARLY_PRINTK_VGA */ + +# ifdef CONFIG_IA64_EARLY_PRINTK_UART + +#include +#include + +static void early_printk_uart(const char *str, size_t len) +{ + static char *uart = NULL; + unsigned long uart_base; + char c; + + if (!uart) { + uart_base = 0; +# ifdef CONFIG_SERIAL_8250_HCDP + { + extern unsigned long hcdp_early_uart(void); + uart_base = hcdp_early_uart(); + } +# endif +# if CONFIG_IA64_EARLY_PRINTK_UART_BASE + if (!uart_base) + uart_base = CONFIG_IA64_EARLY_PRINTK_UART_BASE; +# endif + if (!uart_base) + return; + + uart = ioremap(uart_base, 64); + if (!uart) + return; + } + + while (len-- > 0) { + c = *str++; + while ((readb(uart + UART_LSR) & UART_LSR_TEMT) == 0) + cpu_relax(); /* spin */ + + writeb(c, uart + UART_TX); + + if (c == '\n') + writeb('\r', uart + UART_TX); + } +} + +# endif /* CONFIG_IA64_EARLY_PRINTK_UART */ + +#ifdef CONFIG_IA64_EARLY_PRINTK_SGI_SN +extern int early_printk_sn_sal(const char *str, int len); +#endif + +void early_printk(const char *str, size_t len) +{ +#ifdef CONFIG_IA64_EARLY_PRINTK_UART + early_printk_uart(str, len); +#endif +#ifdef CONFIG_IA64_EARLY_PRINTK_VGA + early_printk_vga(str, len); +#endif +#ifdef CONFIG_IA64_EARLY_PRINTK_SGI_SN + early_printk_sn_sal(str, len); +#endif +} + +#endif /* CONFIG_IA64_EARLY_PRINTK */ diff -Nru a/kernel/sysctl.c b/kernel/sysctl.c --- a/kernel/sysctl.c Fri Oct 17 23:12:58 2003 +++ b/kernel/sysctl.c Fri Oct 17 23:12:58 2003 @@ -579,6 +579,16 @@ .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_SMP + { + .ctl_name = KERN_CACHEDECAYTICKS, + .procname = "cache_decay_ticks", + .data = &cache_decay_ticks, + .maxlen = sizeof(cache_decay_ticks), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, +#endif { .ctl_name = 0 } }; diff -Nru a/mm/memory.c b/mm/memory.c --- a/mm/memory.c Fri Oct 17 23:12:58 2003 +++ b/mm/memory.c Fri Oct 17 23:12:58 2003 @@ -121,8 +121,10 @@ } pmd = pmd_offset(dir, 0); pgd_clear(dir); - for (j = 0; j < PTRS_PER_PMD ; j++) + for (j = 0; j < PTRS_PER_PMD ; j++) { + prefetchw(pmd + j + PREFETCH_STRIDE/sizeof(*pmd)); free_one_pmd(tlb, pmd+j); + } pmd_free_tlb(tlb, pmd); }