diff -Nru a/Documentation/ia64/fsys.txt b/Documentation/ia64/fsys.txt --- a/Documentation/ia64/fsys.txt Fri Oct 17 23:12:58 2003 +++ b/Documentation/ia64/fsys.txt Fri Oct 17 23:12:58 2003 @@ -4,7 +4,7 @@ ----------------------------------- Started: 13-Jan-2003 - Last update: 11-Feb-2003 + Last update: 27-Sep-2003 David Mosberger-Tang @@ -146,6 +146,12 @@ task pointer is not considered sensitive: it's already exposed through ar.k6). + o Fsyscall-handlers MUST NOT access user-memory without first + validating access-permission (this can be done typically via + probe.r.fault and/or probe.w.fault) and without guarding against + memory access exceptions (this can be done with the EX() macros + defined by asmmacro.h). + The above restrictions may seem draconian, but remember that it's possible to trade off some of the restrictions by paying a slightly higher overhead. For example, if an fsyscall-handler could benefit @@ -229,3 +235,52 @@ PSR.bn Unchanged. Note: fsys-mode handlers may clear the bit, if needed. Doing so requires clearing PSR.i and PSR.ic as well. PSR.ia Unchanged. Note: the ia64 linux kernel never sets this bit. + +* Using fast system calls + +To use fast system calls, userspace applications need simply call +__kernel_syscall_via_epc(). For example + +-- example fgettimeofday() call -- +-- fgettimeofday.S -- + +#include + +GLOBAL_ENTRY(fgettimeofday) +.prologue +.save ar.pfs, r11 +mov r11 = ar.pfs +.body + +mov r2 = 0xa000000000020660;; // gate address + // found by inspection of System.map for the + // __kernel_syscall_via_epc() function. See + // below for how to do this for real. + +mov b7 = r2 +mov r15 = 1087 // gettimeofday syscall +;; +br.call.sptk.many b6 = b7 +;; + +.restore sp + +mov ar.pfs = r11 +br.ret.sptk.many rp;; // return to caller +END(fgettimeofday) + +-- end fgettimeofday.S -- + +In reality, getting the gate address is accomplished by two extra +values passed via the ELF auxiliary vector (include/asm-ia64/elf.h) + + o AT_SYSINFO : is the address of __kernel_syscall_via_epc() + o AT_SYSINFO_EHDR : is the address of the kernel gate ELF DSO + +The ELF DSO is a pre-linked library that is mapped in by the kernel at +the gate page. It is a proper ELF shared object so, with a dynamic +loader that recognises the library, you should be able to make calls to +the exported functions within it as with any other shared library. +AT_SYSINFO points into the kernel DSO at the +__kernel_syscall_via_epc() function for historical reasons (it was +used before the kernel DSO) and as a convenience. diff -Nru a/arch/ia64/Kconfig b/arch/ia64/Kconfig --- a/arch/ia64/Kconfig Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/Kconfig Fri Oct 17 23:12:58 2003 @@ -57,6 +57,10 @@ config IA64_GENERIC bool "generic" + select NUMA + select ACPI_NUMA + select VIRTUAL_MEM_MAP + select DISCONTIGMEM ---help--- This selects the system type of your hardware. A "generic" kernel will run on any supported IA-64 system. However, if you configure @@ -220,24 +224,8 @@ Access). This option is for configuring high-end multiprocessor server systems. If in doubt, say N. 
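A minimal sketch of the AT_SYSINFO/AT_SYSINFO_EHDR lookup described in the fsys.txt hunk above (this program is illustrative only and is not part of the patch; it assumes a glibc new enough to provide getauxval()):

-- auxv lookup example --
-- gate_addrs.c --

#include <stdio.h>
#include <elf.h>        /* AT_SYSINFO, AT_SYSINFO_EHDR */
#include <sys/auxv.h>   /* getauxval() */

int main (void)
{
	/* getauxval() returns 0 if the requested entry is absent */
	unsigned long epc  = getauxval(AT_SYSINFO);      /* __kernel_syscall_via_epc() */
	unsigned long ehdr = getauxval(AT_SYSINFO_EHDR); /* kernel gate ELF DSO */

	printf("__kernel_syscall_via_epc() at 0x%lx\n", epc);
	printf("gate DSO ELF header at 0x%lx\n", ehdr);
	return 0;
}

-- end gate_addrs.c --

With the gate address obtained this way, the branch target used in the fgettimeofday.S example above no longer needs to be hard-coded from System.map.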
-choice - prompt "Maximum Memory per NUMA Node" if NUMA && IA64_DIG - depends on NUMA && IA64_DIG - default IA64_NODESIZE_16GB - -config IA64_NODESIZE_16GB - bool "16GB" - -config IA64_NODESIZE_64GB - bool "64GB" - -config IA64_NODESIZE_256GB - bool "256GB" - -endchoice - config DISCONTIGMEM - bool "Discontiguous memory support" if (IA64_DIG || IA64_SGI_SN2 || IA64_GENERIC) && NUMA + bool "Discontiguous memory support" if (IA64_DIG || IA64_SGI_SN2 || IA64_GENERIC) && NUMA && VIRTUAL_MEM_MAP default y if (IA64_SGI_SN2 || IA64_GENERIC) && NUMA help Say Y to support efficient handling of discontiguous physical memory, @@ -250,14 +238,10 @@ default y if !IA64_HP_SIM help Say Y to compile the kernel with support for a virtual mem map. - This is an alternate method of supporting large holes in the - physical address space on non NUMA machines. Since the DISCONTIGMEM - option is not supported on machines with the ZX1 chipset, this is - the only way of supporting more than 1 Gb of memory on those - machines. This code also only takes effect if a memory hole of - greater than 1 Gb is found during boot, so it is safe to enable - unless you require the DISCONTIGMEM option for your machine. If you - are unsure, say Y. + This code also only takes effect if a memory hole of greater than + 1 Gb is found during boot. You must turn this option on if you + require the DISCONTIGMEM option for your machine. If you are + unsure, say Y. config IA64_MCA bool "Enable IA-64 Machine Check Abort" @@ -636,6 +620,33 @@ send a BREAK and then within 5 seconds a command keypress. The keys are documented in . Don't say Y unless you really know what this hack does. + +config IA64_EARLY_PRINTK + bool "Early printk support" + depends on DEBUG_KERNEL && !IA64_GENERIC + help + Selecting this option uses the VGA screen or serial console for + printk() output before the consoles are initialised. It is useful + for debugging problems early in the boot process, but only if you + have a suitable VGA/serial console attached. If you're unsure, + select N. 
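A rough sketch of what the early printk support amounts to on an MMIO serial port (hypothetical helper, not taken from this patch; it assumes an 8250/16550-compatible UART at the configured MMIO base, accessed through ia64's uncached identity mapping):

-- illustrative sketch: early_putc() over an MMIO 16550-style UART --

#define EARLY_UART_BASE   0xff5e0000UL            /* CONFIG_IA64_EARLY_PRINTK_UART_BASE default */
#define IA64_UNCACHED_OFF 0xc000000000000000UL    /* uncached kernel identity mapping (region 6) */
#define UART_THR          0x00                    /* transmit holding register */
#define UART_LSR          0x05                    /* line status register */
#define UART_LSR_THRE     0x20                    /* transmitter holding register empty */

static void early_putc (char c)
{
	volatile unsigned char *uart =
		(volatile unsigned char *) (IA64_UNCACHED_OFF | EARLY_UART_BASE);

	while (!(uart[UART_LSR] & UART_LSR_THRE))
		;                                 /* wait until the UART can accept a byte */
	uart[UART_THR] = c;
}

-- end sketch --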
+ +config IA64_EARLY_PRINTK_UART + bool "Early printk on MMIO serial port" + depends on IA64_EARLY_PRINTK + +config IA64_EARLY_PRINTK_UART_BASE + hex "UART MMIO base address" + depends on IA64_EARLY_PRINTK_UART + default "ff5e0000" + +config IA64_EARLY_PRINTK_VGA + bool "Early printk on VGA" + depends on IA64_EARLY_PRINTK + +config IA64_EARLY_PRINTK_SGI_SN + bool "Early printk on SGI SN serial console" + depends on IA64_EARLY_PRINTK && (IA64_GENERIC || IA64_SGI_SN2) config DEBUG_SLAB bool "Debug memory allocations" diff -Nru a/arch/ia64/Makefile b/arch/ia64/Makefile --- a/arch/ia64/Makefile Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/Makefile Fri Oct 17 23:12:58 2003 @@ -64,7 +64,7 @@ drivers-$(CONFIG_PCI) += arch/ia64/pci/ drivers-$(CONFIG_IA64_HP_SIM) += arch/ia64/hp/sim/ drivers-$(CONFIG_IA64_HP_ZX1) += arch/ia64/hp/common/ arch/ia64/hp/zx1/ -drivers-$(CONFIG_IA64_GENERIC) += arch/ia64/hp/common/ arch/ia64/hp/zx1/ arch/ia64/hp/sim/ +drivers-$(CONFIG_IA64_GENERIC) += arch/ia64/hp/common/ arch/ia64/hp/zx1/ arch/ia64/hp/sim/ arch/ia64/sn/ drivers-$(CONFIG_OPROFILE) += arch/ia64/oprofile/ boot := arch/ia64/hp/sim/boot diff -Nru a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c --- a/arch/ia64/ia32/sys_ia32.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/ia32/sys_ia32.c Fri Oct 17 23:12:58 2003 @@ -2486,11 +2486,14 @@ putstat64 (struct stat64 *ubuf, struct kstat *kbuf) { int err; + u64 hdev; if (clear_user(ubuf, sizeof(*ubuf))) return -EFAULT; - err = __put_user(huge_encode_dev(kbuf->dev), &ubuf->st_dev); + hdev = huge_encode_dev(kbuf->dev); + err = __put_user(hdev, (u32*)&ubuf->st_dev); + err |= __put_user(hdev >> 32, ((u32*)&ubuf->st_dev) + 1); err |= __put_user(kbuf->ino, &ubuf->__st_ino); err |= __put_user(kbuf->ino, &ubuf->st_ino_lo); err |= __put_user(kbuf->ino >> 32, &ubuf->st_ino_hi); @@ -2498,7 +2501,9 @@ err |= __put_user(kbuf->nlink, &ubuf->st_nlink); err |= __put_user(kbuf->uid, &ubuf->st_uid); err |= __put_user(kbuf->gid, &ubuf->st_gid); - err |= __put_user(huge_encode_dev(kbuf->rdev), &ubuf->st_rdev); + hdev = huge_encode_dev(kbuf->rdev); + err = __put_user(hdev, (u32*)&ubuf->st_rdev); + err |= __put_user(hdev >> 32, ((u32*)&ubuf->st_rdev) + 1); err |= __put_user(kbuf->size, &ubuf->st_size_lo); err |= __put_user((kbuf->size >> 32), &ubuf->st_size_hi); err |= __put_user(kbuf->atime.tv_sec, &ubuf->st_atime); @@ -2724,8 +2729,8 @@ struct epoll_event32 { u32 events; - u64 data; -} __attribute__((packed)); + u32 data[2]; +}; asmlinkage long sys32_epoll_ctl(int epfd, int op, int fd, struct epoll_event32 *event) @@ -2740,10 +2745,10 @@ return error; __get_user(event64.events, &event->events); - __get_user(data_halfword, (u32*)(&event->data)); + __get_user(data_halfword, &event->data[0]); event64.data = data_halfword; - __get_user(data_halfword, ((u32*)(&event->data) + 1)); - event64.data |= ((u64)data_halfword) << 32; + __get_user(data_halfword, &event->data[1]); + event64.data |= (u64)data_halfword << 32; set_fs(KERNEL_DS); error = sys_epoll_ctl(epfd, op, fd, &event64); @@ -2758,8 +2763,9 @@ { struct epoll_event *events64 = NULL; mm_segment_t old_fs = get_fs(); - int error; + int error, numevents, size; int evt_idx; + int do_free_pages = 0; if (maxevents <= 0) { return -EINVAL; @@ -2770,43 +2776,45 @@ maxevents * sizeof(struct epoll_event32)))) return error; - /* Allocate the space needed for the intermediate copy */ - events64 = kmalloc(maxevents * sizeof(struct epoll_event), GFP_KERNEL); + /* + * Allocate space for the intermediate copy. 
If the space needed + * is large enough to cause kmalloc to fail, then try again with + * __get_free_pages. + */ + size = maxevents * sizeof(struct epoll_event); + events64 = kmalloc(size, GFP_KERNEL); if (events64 == NULL) { - return -ENOMEM; - } - - /* Expand the 32-bit structures into the 64-bit structures */ - for (evt_idx = 0; evt_idx < maxevents; evt_idx++) { - u32 data_halfword; - __get_user(events64[evt_idx].events, &events[evt_idx].events); - __get_user(data_halfword, (u32*)(&events[evt_idx].data)); - events64[evt_idx].data = data_halfword; - __get_user(data_halfword, ((u32*)(&events[evt_idx].data) + 1)); - events64[evt_idx].data |= ((u64)data_halfword) << 32; + events64 = (struct epoll_event *) + __get_free_pages(GFP_KERNEL, get_order(size)); + if (events64 == NULL) + return -ENOMEM; + do_free_pages = 1; } /* Do the system call */ set_fs(KERNEL_DS); /* copy_to/from_user should work on kernel mem*/ - error = sys_epoll_wait(epfd, events64, maxevents, timeout); + numevents = sys_epoll_wait(epfd, events64, maxevents, timeout); set_fs(old_fs); /* Don't modify userspace memory if we're returning an error */ - if (!error) { + if (numevents > 0) { /* Translate the 64-bit structures back into the 32-bit structures */ - for (evt_idx = 0; evt_idx < maxevents; evt_idx++) { + for (evt_idx = 0; evt_idx < numevents; evt_idx++) { __put_user(events64[evt_idx].events, &events[evt_idx].events); - __put_user((u32)(events64[evt_idx].data), - (u32*)(&events[evt_idx].data)); + __put_user((u32)events64[evt_idx].data, + &events[evt_idx].data[0]); __put_user((u32)(events64[evt_idx].data >> 32), - ((u32*)(&events[evt_idx].data) + 1)); + &events[evt_idx].data[1]); } } - kfree(events64); - return error; + if (do_free_pages) + free_pages((unsigned long) events64, get_order(size)); + else + kfree(events64); + return numevents; } #ifdef NOTYET /* UNTESTED FOR IA64 FROM HERE DOWN */ diff -Nru a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c --- a/arch/ia64/kernel/acpi.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/acpi.c Fri Oct 17 23:12:58 2003 @@ -56,6 +56,7 @@ void (*pm_power_off) (void); unsigned char acpi_kbd_controller_present = 1; +unsigned char acpi_legacy_devices; int acpi_disabled; /* XXX this shouldn't be needed---we can't boot without ACPI! */ @@ -380,7 +381,7 @@ void __init acpi_numa_memory_affinity_init (struct acpi_table_memory_affinity *ma) { - unsigned long paddr, size, hole_size, min_hole_size; + unsigned long paddr, size; u8 pxm; struct node_memblk_s *p, *q, *pend; @@ -402,34 +403,6 @@ if (!ma->flags.enabled) return; - /* - * When the chunk is not the first one in the node, check distance - * from the other chunks. When the hole is too huge ignore the chunk. - * This restriction should be removed when multiple chunks per node - * is supported. - */ - pend = &node_memblk[num_memblks]; - min_hole_size = 0; - for (p = &node_memblk[0]; p < pend; p++) { - if (p->nid != pxm) - continue; - if (p->start_paddr < paddr) - hole_size = paddr - (p->start_paddr + p->size); - else - hole_size = p->start_paddr - (paddr + size); - - if (!min_hole_size || hole_size < min_hole_size) - min_hole_size = hole_size; - } - - if (min_hole_size) { - if (min_hole_size > size) { - printk(KERN_ERR "Too huge memory hole. 
Ignoring %ld MBytes at %lx\n", - size/(1024*1024), paddr); - return; - } - } - /* record this node in proximity bitmap */ pxm_bit_set(pxm); @@ -454,6 +427,12 @@ { int i, j, node_from, node_to; + /* If there's no SRAT, fix the phys_id */ + if (srat_num_cpus == 0) { + node_cpuid[0].phys_id = hard_smp_processor_id(); + return; + } + /* calculate total number of nodes in system from PXM bitmap */ numnodes = 0; /* init total nodes in system */ @@ -531,6 +510,9 @@ if (!(fadt->iapc_boot_arch & BAF_8042_KEYBOARD_CONTROLLER)) acpi_kbd_controller_present = 0; + if (fadt->iapc_boot_arch & BAF_LEGACY_DEVICES) + acpi_legacy_devices = 1; + acpi_register_irq(fadt->sci_int, ACPI_ACTIVE_LOW, ACPI_LEVEL_SENSITIVE); return 0; } @@ -614,6 +596,12 @@ smp_build_cpu_map(); # ifdef CONFIG_NUMA + if (srat_num_cpus == 0) { + int cpu, i = 1; + for (cpu = 0; cpu < smp_boot_data.cpu_count; cpu++) + if (smp_boot_data.cpu_phys_id[cpu] != hard_smp_processor_id()) + node_cpuid[i++].phys_id = smp_boot_data.cpu_phys_id[cpu]; + } build_cpu_to_node_map(); # endif #endif diff -Nru a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c --- a/arch/ia64/kernel/asm-offsets.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/asm-offsets.c Fri Oct 17 23:12:58 2003 @@ -33,16 +33,30 @@ BLANK(); + DEFINE(IA64_TASK_BLOCKED_OFFSET,offsetof (struct task_struct, blocked)); DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid)); DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader)); + DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending)); DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid)); DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent)); + DEFINE(IA64_TASK_SIGHAND_OFFSET,offsetof (struct task_struct, sighand)); + DEFINE(IA64_TASK_SIGNAL_OFFSET,offsetof (struct task_struct, signal)); DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, tgid)); DEFINE(IA64_TASK_THREAD_KSP_OFFSET, offsetof (struct task_struct, thread.ksp)); DEFINE(IA64_TASK_THREAD_ON_USTACK_OFFSET, offsetof (struct task_struct, thread.on_ustack)); BLANK(); + DEFINE(IA64_SIGHAND_SIGLOCK_OFFSET,offsetof (struct sighand_struct, siglock)); + + BLANK(); + + DEFINE(IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,offsetof (struct signal_struct, + group_stop_count)); + DEFINE(IA64_SIGNAL_SHARED_PENDING_OFFSET,offsetof (struct signal_struct, shared_pending)); + + BLANK(); + DEFINE(IA64_PT_REGS_B6_OFFSET, offsetof (struct pt_regs, b6)); DEFINE(IA64_PT_REGS_B7_OFFSET, offsetof (struct pt_regs, b7)); DEFINE(IA64_PT_REGS_AR_CSD_OFFSET, offsetof (struct pt_regs, ar_csd)); @@ -155,6 +169,10 @@ DEFINE(IA64_SIGCONTEXT_R12_OFFSET, offsetof (struct sigcontext, sc_gr[12])); DEFINE(IA64_SIGCONTEXT_RBS_BASE_OFFSET,offsetof (struct sigcontext, sc_rbs_base)); DEFINE(IA64_SIGCONTEXT_LOADRS_OFFSET, offsetof (struct sigcontext, sc_loadrs)); + + BLANK(); + + DEFINE(IA64_SIGPENDING_SIGNAL_OFFSET, offsetof (struct sigpending, signal)); BLANK(); diff -Nru a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S --- a/arch/ia64/kernel/fsys.S Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/fsys.S Fri Oct 17 23:12:58 2003 @@ -4,6 +4,7 @@ * Copyright (C) 2003 Hewlett-Packard Co * David Mosberger-Tang * + * 25-Sep-03 davidm Implement fsys_rt_sigprocmask(). * 18-Feb-03 louisk Implement fsys_gettimeofday(). * 28-Feb-03 davidm Fixed several bugs in fsys_gettimeofday(). Tuned it some more, * probably broke it along the way... 
;-) @@ -15,6 +16,7 @@ #include #include #include +#include #include #include @@ -48,8 +50,7 @@ .body mov r8=ENOSYS mov r10=-1 - MCKINLEY_E9_WORKAROUND - br.ret.sptk.many b6 + FSYS_RETURN END(fsys_ni_syscall) ENTRY(fsys_getpid) @@ -66,8 +67,7 @@ ;; cmp.ne p8,p0=0,r9 (p8) br.spnt.many fsys_fallback_syscall - MCKINLEY_E9_WORKAROUND - br.ret.sptk.many b6 + FSYS_RETURN END(fsys_getpid) ENTRY(fsys_getppid) @@ -114,8 +114,7 @@ mov r18=0 // i must not leak kernel bits... mov r19=0 // i must not leak kernel bits... #endif - MCKINLEY_E9_WORKAROUND - br.ret.sptk.many b6 + FSYS_RETURN END(fsys_getppid) ENTRY(fsys_set_tid_address) @@ -141,8 +140,7 @@ ;; mov r17=0 // i must not leak kernel bits... mov r18=0 // i must not leak kernel bits... - MCKINLEY_E9_WORKAROUND - br.ret.sptk.many b6 + FSYS_RETURN END(fsys_set_tid_address) /* @@ -199,7 +197,7 @@ adds r10=IA64_CPUINFO_ITM_DELTA_OFFSET, r10 (p7) tnat.nz p6,p0=r33 -(p6) br.cond.spnt.few .fail +(p6) br.cond.spnt.few .fail_einval adds r8=IA64_CPUINFO_NSEC_PER_CYC_OFFSET, r3 movl r24=2361183241434822607 // for division hack (only for / 1000) @@ -225,8 +223,8 @@ * to store the result. That's OK as long as the stores are also * protect by EX(). */ -EX(.fail, probe.w.fault r32, 3) // this must come _after_ NaT-check -EX(.fail, probe.w.fault r10, 3) // this must come _after_ NaT-check +EX(.fail_efault, probe.w.fault r32, 3) // this must come _after_ NaT-check +EX(.fail_efault, probe.w.fault r10, 3) // this must come _after_ NaT-check nop 0 ldf8 f10=[r8] // f10 <- local_cpu_data->nsec_per_cyc value @@ -311,14 +309,13 @@ (p7) br.spnt.many 1b // finally: r2 = sec, r3 = usec -EX(.fail, st8 [r32]=r2) +EX(.fail_efault, st8 [r32]=r2) adds r9=8, r32 mov r8=r0 // success ;; -EX(.fail, st8 [r9]=r3) // store them in the timeval struct +EX(.fail_efault, st8 [r9]=r3) // store them in the timeval struct mov r10=0 - MCKINLEY_E9_WORKAROUND - br.ret.sptk.many b6 // return to caller + FSYS_RETURN /* * Note: We are NOT clearing the scratch registers here. Since the only things * in those registers are time-related variables and some addresses (which @@ -326,12 +323,183 @@ * and we should be fine. */ -.fail: adds r8=EINVAL, r0 // r8 = EINVAL - adds r10=-1, r0 // r10 = -1 - MCKINLEY_E9_WORKAROUND - br.ret.spnt.many b6 // return with r8 set to EINVAL +.fail_einval: + mov r8=EINVAL // r8 = EINVAL + mov r10=-1 // r10 = -1 + FSYS_RETURN + +.fail_efault: + mov r8=EFAULT // r8 = EFAULT + mov r10=-1 // r10 = -1 + FSYS_RETURN END(fsys_gettimeofday) +/* + * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize). + */ +#if _NSIG_WORDS != 1 +# error Sorry, fsys_rt_sigprocmask() needs to be updated for _NSIG_WORDS != 1. +#endif +ENTRY(fsys_rt_sigprocmask) + .prologue + .altrp b6 + .body + + mf // ensure reading of current->blocked is ordered + add r2=IA64_TASK_BLOCKED_OFFSET,r16 + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 + ;; + /* + * Since we're only reading a single word, we can do it + * atomically without acquiring current->sighand->siglock. To + * be on the safe side, we need a fully-ordered load, though: + */ + ld8.acq r3=[r2] // read/prefetch current->blocked + ld4 r9=[r9] + add r31=IA64_TASK_SIGHAND_OFFSET,r16 + ;; +#ifdef CONFIG_SMP + ld8 r31=[r31] // r31 <- current->sighand +#endif + and r9=TIF_ALLWORK_MASK,r9 + tnat.nz p6,p0=r32 + ;; + cmp.ne p7,p0=0,r9 + tnat.nz.or p6,p0=r35 + tnat.nz p8,p0=r34 + ;; + cmp.ne p15,p0=r0,r34 // oset != NULL? 
+ cmp.ne.or p6,p0=_NSIG_WORDS*8,r35 + tnat.nz.or p8,p0=r33 + +(p6) br.spnt.few .fail_einval // fail with EINVAL +(p7) br.spnt.many fsys_fallback_syscall // got pending kernel work... +(p8) br.spnt.few .fail_efault // fail with EFAULT + ;; + + cmp.eq p6,p7=r0,r33 // set == NULL? + add r31=IA64_SIGHAND_SIGLOCK_OFFSET,r31 // r31 <- current->sighand->siglock +(p6) br.dpnt.many .store_mask // -> short-circuit to just reading the signal mask + + /* Argh, we actually have to do some work and _update_ the signal mask: */ + +EX(.fail_efault, probe.r.fault r33, 3) // verify user has read-access to *set +EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set + mov r17=(1 << (SIGKILL - 1)) | (1 << (SIGSTOP - 1)) + ;; + + rsm psr.i // mask interrupt delivery + mov ar.ccv=0 + andcm r14=r14,r17 // filter out SIGKILL & SIGSTOP + +#ifdef CONFIG_SMP + mov r17=1 + ;; + cmpxchg4.acq r18=[r31],r17,ar.ccv // try to acquire the lock + mov r8=EINVAL // default to EINVAL + ;; + ld8 r3=[r2] // re-read current->blocked now that we hold the lock + cmp4.ne p6,p0=r18,r0 +(p6) br.cond.spnt.many .lock_contention + ;; +#else + ld8 r3=[r2] // re-read current->blocked now that we hold the lock + mov r8=EINVAL // default to EINVAL +#endif + add r18=IA64_TASK_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r16 + add r19=IA64_TASK_SIGNAL_OFFSET,r16 + cmp4.eq p6,p0=SIG_BLOCK,r32 + ;; + ld8 r19=[r19] // r19 <- current->signal + cmp4.eq p7,p0=SIG_UNBLOCK,r32 + cmp4.eq p8,p0=SIG_SETMASK,r32 + ;; + ld8 r18=[r18] // r18 <- current->pending.signal + .pred.rel.mutex p6,p7,p8 +(p6) or r3=r3,r14 // SIG_BLOCK +(p7) andcm r3=r3,r14 // SIG_UNBLOCK + +(p8) mov r3=r14 // SIG_SETMASK +(p6) mov r8=0 // clear error code + // recalc_sigpending() + add r17=IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,r19 + + add r19=IA64_SIGNAL_SHARED_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r19 + ;; + ld4 r17=[r17] // r17 <- current->signal->group_stop_count +(p7) mov r8=0 // clear error code + + ld8 r19=[r19] // r19 <- current->signal->shared_pending + ;; + cmp4.gt p6,p7=r17,r0 // p6/p7 <- (current->signal->group_stop_count > 0)? +(p8) mov r8=0 // clear error code + + or r18=r18,r19 // r18 <- current->pending | current->signal->shared_pending + ;; + // r18 <- (current->pending | current->signal->shared_pending) & ~current->blocked: + andcm r18=r18,r3 + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 + ;; + +(p7) cmp.ne.or.andcm p6,p7=r18,r0 // p6/p7 <- signal pending + mov r19=0 // i must not leak kernel bits... +(p6) br.cond.dpnt.many .sig_pending + ;; + +1: ld4 r17=[r9] // r17 <- current->thread_info->flags + ;; + mov ar.ccv=r17 + and r18=~_TIF_SIGPENDING,r17 // r18 <- r17 & ~(1 << TIF_SIGPENDING) + ;; + + st8 [r2]=r3 // update current->blocked with new mask + cmpxchg4.acq r14=[r9],r18,ar.ccv // current->thread_info->flags <- r18 + ;; + cmp.ne p6,p0=r17,r14 // update failed? +(p6) br.cond.spnt.few 1b // yes -> retry + +#ifdef CONFIG_SMP + st4.rel [r31]=r0 // release the lock +#endif + ssm psr.i + cmp.ne p9,p0=r8,r0 // check for bad HOW value + ;; + + srlz.d // ensure psr.i is set again + mov r18=0 // i must not leak kernel bits... +(p9) br.spnt.few .fail_einval // bail out for bad HOW value + +.store_mask: +EX(.fail_efault, (p15) probe.w.fault r34, 3) // verify user has write-access to *oset +EX(.fail_efault, (p15) st8 [r34]=r3) + mov r2=0 // i must not leak kernel bits... + mov r3=0 // i must not leak kernel bits... + mov r8=0 // return 0 + mov r9=0 // i must not leak kernel bits... + mov r14=0 // i must not leak kernel bits... + mov r17=0 // i must not leak kernel bits... 
+ mov r31=0 // i must not leak kernel bits... + FSYS_RETURN + +.sig_pending: +#ifdef CONFIG_SMP + st4.rel [r31]=r0 // release the lock +#endif + ssm psr.i + ;; + srlz.d + br.sptk.many fsys_fallback_syscall // with signal pending, do the heavy-weight syscall + +#ifdef CONFIG_SMP +.lock_contention: + /* Rather than spinning here, fall back on doing a heavy-weight syscall. */ + ssm psr.i + ;; + srlz.d + br.sptk.many fsys_fallback_syscall +#endif +END(fsys_rt_sigprocmask) + ENTRY(fsys_fallback_syscall) .prologue .altrp b6 @@ -600,7 +768,7 @@ data8 0 // sigaltstack data8 0 // rt_sigaction data8 0 // rt_sigpending - data8 0 // rt_sigprocmask + data8 fsys_rt_sigprocmask // rt_sigprocmask data8 0 // rt_sigqueueinfo // 1180 data8 0 // rt_sigreturn data8 0 // rt_sigsuspend diff -Nru a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S --- a/arch/ia64/kernel/gate.S Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/gate.S Fri Oct 17 23:12:58 2003 @@ -118,8 +118,7 @@ mov r10=-1 mov r8=ENOSYS - MCKINLEY_E9_WORKAROUND - br.ret.sptk.many b6 + FSYS_RETURN END(__kernel_syscall_via_epc) # define ARG0_OFF (16 + IA64_SIGFRAME_ARG0_OFFSET) diff -Nru a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S --- a/arch/ia64/kernel/head.S Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/head.S Fri Oct 17 23:12:58 2003 @@ -797,6 +797,25 @@ br.ret.sptk.many rp END(ia64_switch_mode_virt) +GLOBAL_ENTRY(ia64_delay_loop) + .prologue +{ nop 0 // work around GAS unwind info generation bug... + .save ar.lc,r2 + mov r2=ar.lc + .body + ;; + mov ar.lc=r32 +} + ;; + // force loop to be 32-byte aligned (GAS bug means we cannot use .align + // inside function body without corrupting unwind info). +{ nop 0 } +1: br.cloop.sptk.few 1b + ;; + mov ar.lc=r2 + br.ret.sptk.many rp +END(ia64_delay_loop) + #ifdef CONFIG_IA64_BRL_EMU /* diff -Nru a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c --- a/arch/ia64/kernel/ia64_ksyms.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/ia64_ksyms.c Fri Oct 17 23:12:58 2003 @@ -34,13 +34,8 @@ #include EXPORT_SYMBOL(probe_irq_mask); -#include #include -/* not coded yet?? EXPORT_SYMBOL(csum_ipv6_magic); */ -EXPORT_SYMBOL(csum_partial_copy_nocheck); -EXPORT_SYMBOL(csum_tcpudp_magic); -EXPORT_SYMBOL(ip_compute_csum); -EXPORT_SYMBOL(ip_fast_csum); +EXPORT_SYMBOL(ip_fast_csum); /* hand-coded assembly */ #include EXPORT_SYMBOL(__ia64_memcpy_fromio); @@ -58,9 +53,11 @@ EXPORT_SYMBOL(clear_page); #ifdef CONFIG_VIRTUAL_MEM_MAP +#include #include EXPORT_SYMBOL(vmalloc_end); EXPORT_SYMBOL(ia64_pfn_valid); +EXPORT_SYMBOL(max_low_pfn); /* defined by bootmem.c, but not exported by generic code */ #endif #include diff -Nru a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c --- a/arch/ia64/kernel/mca.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/mca.c Fri Oct 17 23:12:58 2003 @@ -81,8 +81,6 @@ u64 ia64_mca_sal_data_area[1356]; u64 ia64_tlb_functional; u64 ia64_os_mca_recovery_successful; -/* TODO: need to assign min-state structure to UC memory */ -u64 ia64_mca_min_state_save_info[MIN_STATE_AREA_SIZE] __attribute__((aligned(512))); static void ia64_mca_wakeup_ipi_wait(void); static void ia64_mca_wakeup(int cpu); static void ia64_mca_wakeup_all(void); @@ -466,26 +464,6 @@ #endif /* PLATFORM_MCA_HANDLERS */ /* - * routine to process and prepare to dump min_state_save - * information for debugging purposes. 
- */ -void -ia64_process_min_state_save (pal_min_state_area_t *pmss) -{ - int i, max = MIN_STATE_AREA_SIZE; - u64 *tpmss_ptr = (u64 *)pmss; - u64 *return_min_state_ptr = ia64_mca_min_state_save_info; - - for (i=0;ivalid.psi_static_struct) { spsi = (sal_processor_static_info_t *)p_data; - - /* copy interrupted context PAL min-state info */ - ia64_process_min_state_save(&spsi->min_state_area); /* Print branch register contents if valid */ if (spsi->valid.br) diff -Nru a/arch/ia64/kernel/mca_asm.S b/arch/ia64/kernel/mca_asm.S --- a/arch/ia64/kernel/mca_asm.S Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/mca_asm.S Fri Oct 17 23:12:58 2003 @@ -77,12 +77,11 @@ (p6) movl r10=IA64_MCA_SAME_CONTEXT; \ (p6) add _tmp=0x18,_tmp;; \ (p6) ld8 r9=[_tmp],0x10; \ -(p6) movl r22=ia64_mca_min_state_save_info;; \ +(p6) mov r22=r0;; \ (p7) ld8 r8=[_tmp],0x08;; \ (p7) ld8 r9=[_tmp],0x08;; \ (p7) ld8 r10=[_tmp],0x08;; \ -(p7) ld8 r22=[_tmp],0x08;; \ - DATA_VA_TO_PA(r22) +(p7) ld8 r22=[_tmp],0x08;; // now _tmp is pointing to SAL rtn save location @@ -97,7 +96,6 @@ .global ia64_init_stack .global ia64_mca_sal_data_area .global ia64_tlb_functional - .global ia64_mca_min_state_save_info .text .align 16 @@ -265,15 +263,15 @@ add r4=8,r2 // duplicate r2 in r4 add r6=2*8,r2 // duplicate r2 in r4 - mov r3=cr0 // cr.dcr - mov r5=cr1 // cr.itm - mov r7=cr2;; // cr.iva + mov r3=cr.dcr + mov r5=cr.itm + mov r7=cr.iva;; st8 [r2]=r3,8*8 st8 [r4]=r5,3*8 st8 [r6]=r7,3*8;; // 48 byte rements - mov r3=cr8;; // cr.pta + mov r3=cr.pta;; st8 [r2]=r3,8*8;; // 64 byte rements // if PSR.ic=0, reading interruption registers causes an illegal operation fault @@ -286,23 +284,23 @@ add r4=8,r2 // duplicate r2 in r4 add r6=2*8,r2 // duplicate r2 in r6 - mov r3=cr16 // cr.ipsr - mov r5=cr17 // cr.isr - mov r7=r0;; // cr.ida => cr18 (reserved) + mov r3=cr.ipsr + mov r5=cr.isr + mov r7=r0;; st8 [r2]=r3,3*8 st8 [r4]=r5,3*8 st8 [r6]=r7,3*8;; - mov r3=cr19 // cr.iip - mov r5=cr20 // cr.idtr - mov r7=cr21;; // cr.iitr + mov r3=cr.iip + mov r5=cr.ifa + mov r7=cr.itir;; st8 [r2]=r3,3*8 st8 [r4]=r5,3*8 st8 [r6]=r7,3*8;; - mov r3=cr22 // cr.iipa - mov r5=cr23 // cr.ifs - mov r7=cr24;; // cr.iim + mov r3=cr.iipa + mov r5=cr.ifs + mov r7=cr.iim;; st8 [r2]=r3,3*8 st8 [r4]=r5,3*8 st8 [r6]=r7,3*8;; @@ -311,104 +309,101 @@ st8 [r2]=r3,160;; // 160 byte rement SkipIntrRegs: - st8 [r2]=r0,168 // another 168 byte . + st8 [r2]=r0,152;; // another 152 byte . - mov r3=cr66;; // cr.lid - st8 [r2]=r3,40 // 40 byte rement + add r4=8,r2 // duplicate r2 in r4 + add r6=2*8,r2 // duplicate r2 in r6 - mov r3=cr71;; // cr.ivr - st8 [r2]=r3,8 - - mov r3=cr72;; // cr.tpr - st8 [r2]=r3,24 // 24 byte increment - - mov r3=r0;; // cr.eoi => cr75 - st8 [r2]=r3,168 // 168 byte inc. - - mov r3=r0;; // cr.irr0 => cr96 - st8 [r2]=r3,16 // 16 byte inc. - - mov r3=r0;; // cr.irr1 => cr98 - st8 [r2]=r3,16 // 16 byte inc. - - mov r3=r0;; // cr.irr2 => cr100 - st8 [r2]=r3,16 // 16 byte inc - - mov r3=r0;; // cr.irr3 => cr100 - st8 [r2]=r3,16 // 16b inc. - - mov r3=r0;; // cr.itv => cr114 - st8 [r2]=r3,16 // 16 byte inc. 
+ mov r3=cr.lid +// mov r5=cr.ivr // cr.ivr, don't read it + mov r7=cr.tpr;; + st8 [r2]=r3,3*8 + st8 [r4]=r5,3*8 + st8 [r6]=r7,3*8;; - mov r3=r0;; // cr.pmv => cr116 - st8 [r2]=r3,8 + mov r3=r0 // cr.eoi => cr67 + mov r5=r0 // cr.irr0 => cr68 + mov r7=r0;; // cr.irr1 => cr69 + st8 [r2]=r3,3*8 + st8 [r4]=r5,3*8 + st8 [r6]=r7,3*8;; - mov r3=r0;; // cr.lrr0 => cr117 - st8 [r2]=r3,8 + mov r3=r0 // cr.irr2 => cr70 + mov r5=r0 // cr.irr3 => cr71 + mov r7=cr.itv;; + st8 [r2]=r3,3*8 + st8 [r4]=r5,3*8 + st8 [r6]=r7,3*8;; - mov r3=r0;; // cr.lrr1 => cr118 - st8 [r2]=r3,8 + mov r3=cr.pmv + mov r5=cr.cmcv;; + st8 [r2]=r3,7*8 + st8 [r4]=r5,7*8;; + + mov r3=r0 // cr.lrr0 => cr80 + mov r5=r0;; // cr.lrr1 => cr81 + st8 [r2]=r3,23*8 + st8 [r4]=r5,23*8;; - mov r3=r0;; // cr.cmcv => cr119 - st8 [r2]=r3,8*10;; + adds r2=25*8,r2;; cSaveARs: // save ARs add r4=8,r2 // duplicate r2 in r4 add r6=2*8,r2 // duplicate r2 in r6 - mov r3=ar0 // ar.kro - mov r5=ar1 // ar.kr1 - mov r7=ar2;; // ar.kr2 + mov r3=ar.k0 + mov r5=ar.k1 + mov r7=ar.k2;; st8 [r2]=r3,3*8 st8 [r4]=r5,3*8 st8 [r6]=r7,3*8;; - mov r3=ar3 // ar.kr3 - mov r5=ar4 // ar.kr4 - mov r7=ar5;; // ar.kr5 + mov r3=ar.k3 + mov r5=ar.k4 + mov r7=ar.k5;; st8 [r2]=r3,3*8 st8 [r4]=r5,3*8 st8 [r6]=r7,3*8;; - mov r3=ar6 // ar.kr6 - mov r5=ar7 // ar.kr7 + mov r3=ar.k6 + mov r5=ar.k7 mov r7=r0;; // ar.kr8 st8 [r2]=r3,10*8 st8 [r4]=r5,10*8 st8 [r6]=r7,10*8;; // rement by 72 bytes - mov r3=ar16 // ar.rsc - mov ar16=r0 // put RSE in enforced lazy mode - mov r5=ar17 // ar.bsp + mov r3=ar.rsc + mov ar.rsc=r0 // put RSE in enforced lazy mode + mov r5=ar.bsp ;; - mov r7=ar18;; // ar.bspstore + mov r7=ar.bspstore;; st8 [r2]=r3,3*8 st8 [r4]=r5,3*8 st8 [r6]=r7,3*8;; - mov r3=ar19;; // ar.rnat + mov r3=ar.rnat;; st8 [r2]=r3,8*13 // increment by 13x8 bytes - mov r3=ar32;; // ar.ccv + mov r3=ar.ccv;; st8 [r2]=r3,8*4 - mov r3=ar36;; // ar.unat + mov r3=ar.unat;; st8 [r2]=r3,8*4 - mov r3=ar40;; // ar.fpsr + mov r3=ar.fpsr;; st8 [r2]=r3,8*4 - mov r3=ar44;; // ar.itc + mov r3=ar.itc;; st8 [r2]=r3,160 // 160 - mov r3=ar64;; // ar.pfs + mov r3=ar.pfs;; st8 [r2]=r3,8 - mov r3=ar65;; // ar.lc + mov r3=ar.lc;; st8 [r2]=r3,8 - mov r3=ar66;; // ar.ec + mov r3=ar.ec;; st8 [r2]=r3 add r2=8*62,r2 //padding @@ -417,7 +412,8 @@ movl r4=0x00;; cStRR: - mov r3=rr[r4];; + dep.z r5=r4,61,3;; + mov r3=rr[r5];; st8 [r2]=r3,8 add r4=1,r4 br.cloop.sptk.few cStRR @@ -501,12 +497,12 @@ ld8 r3=[r2],8*8 ld8 r5=[r4],3*8 ld8 r7=[r6],3*8;; // 48 byte increments - mov cr0=r3 // cr.dcr - mov cr1=r5 // cr.itm - mov cr2=r7;; // cr.iva + mov cr.dcr=r3 + mov cr.itm=r5 + mov cr.iva=r7;; ld8 r3=[r2],8*8;; // 64 byte increments -// mov cr8=r3 // cr.pta +// mov cr.pta=r3 // if PSR.ic=1, reading interruption registers causes an illegal operation fault @@ -523,64 +519,66 @@ ld8 r3=[r2],3*8 ld8 r5=[r4],3*8 ld8 r7=[r6],3*8;; - mov cr16=r3 // cr.ipsr - mov cr17=r5 // cr.isr is read only -// mov cr18=r7;; // cr.ida (reserved - don't restore) + mov cr.ipsr=r3 +// mov cr.isr=r5 // cr.isr is read only ld8 r3=[r2],3*8 ld8 r5=[r4],3*8 ld8 r7=[r6],3*8;; - mov cr19=r3 // cr.iip - mov cr20=r5 // cr.idtr - mov cr21=r7;; // cr.iitr + mov cr.iip=r3 + mov cr.ifa=r5 + mov cr.itir=r7;; ld8 r3=[r2],3*8 ld8 r5=[r4],3*8 ld8 r7=[r6],3*8;; - mov cr22=r3 // cr.iipa - mov cr23=r5 // cr.ifs - mov cr24=r7 // cr.iim + mov cr.iipa=r3 + mov cr.ifs=r5 + mov cr.iim=r7 ld8 r3=[r2],160;; // 160 byte increment - mov cr25=r3 // cr.iha + mov cr.iha=r3 rSkipIntrRegs: - ld8 r3=[r2],168;; // another 168 byte inc. 
- - ld8 r3=[r2],40;; // 40 byte increment - mov cr66=r3 // cr.lid - - ld8 r3=[r2],8;; -// mov cr71=r3 // cr.ivr is read only - ld8 r3=[r2],24;; // 24 byte increment - mov cr72=r3 // cr.tpr - - ld8 r3=[r2],168;; // 168 byte inc. -// mov cr75=r3 // cr.eoi + ld8 r3=[r2],152;; // another 152 byte inc. - ld8 r3=[r2],16;; // 16 byte inc. -// mov cr96=r3 // cr.irr0 is read only + add r4=8,r2 // duplicate r2 in r4 + add r6=2*8,r2;; // duplicate r2 in r6 - ld8 r3=[r2],16;; // 16 byte inc. -// mov cr98=r3 // cr.irr1 is read only + ld8 r3=[r2],8*3 + ld8 r5=[r4],8*3 + ld8 r7=[r6],8*3;; + mov cr.lid=r3 +// mov cr.ivr=r5 // cr.ivr is read only + mov cr.tpr=r7;; + + ld8 r3=[r2],8*3 + ld8 r5=[r4],8*3 + ld8 r7=[r6],8*3;; +// mov cr.eoi=r3 +// mov cr.irr0=r5 // cr.irr0 is read only +// mov cr.irr1=r7;; // cr.irr1 is read only + + ld8 r3=[r2],8*3 + ld8 r5=[r4],8*3 + ld8 r7=[r6],8*3;; +// mov cr.irr2=r3 // cr.irr2 is read only +// mov cr.irr3=r5 // cr.irr3 is read only + mov cr.itv=r7;; + + ld8 r3=[r2],8*7 + ld8 r5=[r4],8*7;; + mov cr.pmv=r3 + mov cr.cmcv=r5;; + + ld8 r3=[r2],8*23 + ld8 r5=[r4],8*23;; + adds r2=8*23,r2 + adds r4=8*23,r4;; +// mov cr.lrr0=r3 +// mov cr.lrr1=r5 - ld8 r3=[r2],16;; // 16 byte inc -// mov cr100=r3 // cr.irr2 is read only - - ld8 r3=[r2],16;; // 16b inc. -// mov cr102=r3 // cr.irr3 is read only - - ld8 r3=[r2],16;; // 16 byte inc. -// mov cr114=r3 // cr.itv - - ld8 r3=[r2],8;; -// mov cr116=r3 // cr.pmv - ld8 r3=[r2],8;; -// mov cr117=r3 // cr.lrr0 - ld8 r3=[r2],8;; -// mov cr118=r3 // cr.lrr1 - ld8 r3=[r2],8*10;; -// mov cr119=r3 // cr.cmcv + adds r2=8*2,r2;; restore_ARs: add r4=8,r2 // duplicate r2 in r4 @@ -589,67 +587,67 @@ ld8 r3=[r2],3*8 ld8 r5=[r4],3*8 ld8 r7=[r6],3*8;; - mov ar0=r3 // ar.kro - mov ar1=r5 // ar.kr1 - mov ar2=r7;; // ar.kr2 + mov ar.k0=r3 + mov ar.k1=r5 + mov ar.k2=r7;; ld8 r3=[r2],3*8 ld8 r5=[r4],3*8 ld8 r7=[r6],3*8;; - mov ar3=r3 // ar.kr3 - mov ar4=r5 // ar.kr4 - mov ar5=r7;; // ar.kr5 + mov ar.k3=r3 + mov ar.k4=r5 + mov ar.k5=r7;; ld8 r3=[r2],10*8 ld8 r5=[r4],10*8 ld8 r7=[r6],10*8;; - mov ar6=r3 // ar.kr6 - mov ar7=r5 // ar.kr7 -// mov ar8=r6 // ar.kr8 + mov ar.k6=r3 + mov ar.k7=r5 ;; ld8 r3=[r2],3*8 ld8 r5=[r4],3*8 ld8 r7=[r6],3*8;; -// mov ar16=r3 // ar.rsc -// mov ar17=r5 // ar.bsp is read only - mov ar16=r0 // make sure that RSE is in enforced lazy mode +// mov ar.rsc=r3 +// mov ar.bsp=r5 // ar.bsp is read only + mov ar.rsc=r0 // make sure that RSE is in enforced lazy mode ;; - mov ar18=r7;; // ar.bspstore + mov ar.bspstore=r7;; ld8 r9=[r2],8*13;; - mov ar19=r9 // ar.rnat + mov ar.rnat=r9 - mov ar16=r3 // ar.rsc + mov ar.rsc=r3 ld8 r3=[r2],8*4;; - mov ar32=r3 // ar.ccv + mov ar.ccv=r3 ld8 r3=[r2],8*4;; - mov ar36=r3 // ar.unat + mov ar.unat=r3 ld8 r3=[r2],8*4;; - mov ar40=r3 // ar.fpsr + mov ar.fpsr=r3 ld8 r3=[r2],160;; // 160 -// mov ar44=r3 // ar.itc +// mov ar.itc=r3 ld8 r3=[r2],8;; - mov ar64=r3 // ar.pfs + mov ar.pfs=r3 ld8 r3=[r2],8;; - mov ar65=r3 // ar.lc + mov ar.lc=r3 ld8 r3=[r2];; - mov ar66=r3 // ar.ec + mov ar.ec=r3 add r2=8*62,r2;; // padding restore_RRs: mov r5=ar.lc mov ar.lc=0x08-1 - movl r4=0x00 + movl r4=0x00;; cStRRr: + dep.z r7=r4,61,3 ld8 r3=[r2],8;; -// mov rr[r4]=r3 // what are its access previledges? + mov rr[r7]=r3 // what are its access previledges? 
add r4=1,r4 br.cloop.sptk.few cStRRr ;; diff -Nru a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c --- a/arch/ia64/kernel/patch.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/patch.c Fri Oct 17 23:12:58 2003 @@ -130,9 +130,11 @@ while (offp < (s32 *) end) { wp = (u64 *) ia64_imva((char *) offp + *offp); - wp[0] = 0x0000000100000000; + wp[0] = 0x0000000100000000; /* nop.m 0; nop.i 0; nop.i 0 */ wp[1] = 0x0004000000000200; - ia64_fc(wp); + wp[2] = 0x0000000100000011; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */ + wp[3] = 0x0084006880000200; + ia64_fc(wp); ia64_fc(wp + 2); ++offp; } ia64_sync_i(); diff -Nru a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c --- a/arch/ia64/kernel/perfmon.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/perfmon.c Fri Oct 17 23:12:58 2003 @@ -140,7 +140,7 @@ * in UP: * - we need to protect against PMU overflow interrupts (local_irq_disable) * - * spin_lock_irqsave()/spin_unlock_irqrestore(): + * spin_lock_irqsave()/spin_lock_irqrestore(): * in SMP: local_irq_disable + spin_lock * in UP : local_irq_disable * @@ -254,7 +254,6 @@ unsigned long seed; /* seed for random-number generator */ unsigned long mask; /* mask for random-number generator */ unsigned int flags; /* notify/do not notify */ - int next_reset_type;/* PFM_PMD_NO_RESET, PFM_PMD_LONG_RESET, PFM_PMD_SHORT_RESET */ unsigned long eventid; /* overflow event identifier */ } pfm_counter_t; @@ -267,10 +266,10 @@ unsigned int using_dbreg:1; /* using range restrictions (debug registers) */ unsigned int is_sampling:1; /* true if using a custom format */ unsigned int excl_idle:1; /* exclude idle task in system wide session */ - unsigned int unsecure:1; /* exclude idle task in system wide session */ unsigned int going_zombie:1; /* context is zombie (MASKED+blocking) */ unsigned int trap_reason:2; /* reason for going into pfm_handle_work() */ unsigned int no_msg:1; /* no message sent on overflow */ + unsigned int can_restart:1; /* allowed to issue a PFM_RESTART */ unsigned int reserved:22; } pfm_context_flags_t; @@ -356,10 +355,10 @@ #define ctx_fl_using_dbreg ctx_flags.using_dbreg #define ctx_fl_is_sampling ctx_flags.is_sampling #define ctx_fl_excl_idle ctx_flags.excl_idle -#define ctx_fl_unsecure ctx_flags.unsecure #define ctx_fl_going_zombie ctx_flags.going_zombie #define ctx_fl_trap_reason ctx_flags.trap_reason #define ctx_fl_no_msg ctx_flags.no_msg +#define ctx_fl_can_restart ctx_flags.can_restart #define PFM_SET_WORK_PENDING(t, v) do { (t)->thread.pfm_needs_checking = v; } while(0); #define PFM_GET_WORK_PENDING(t) (t)->thread.pfm_needs_checking @@ -493,12 +492,11 @@ typedef struct { unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */ + unsigned long pfm_replay_ovfl_intr_count; /* keep track of replayed ovfl interrupts */ unsigned long pfm_ovfl_intr_count; /* keep track of ovfl interrupts */ unsigned long pfm_ovfl_intr_cycles; /* cycles spent processing ovfl interrupts */ unsigned long pfm_ovfl_intr_cycles_min; /* min cycles spent processing ovfl interrupts */ unsigned long pfm_ovfl_intr_cycles_max; /* max cycles spent processing ovfl interrupts */ - unsigned long pfm_sysupdt_count; - unsigned long pfm_sysupdt_cycles; unsigned long pfm_smpl_handler_calls; unsigned long pfm_smpl_handler_cycles; char pad[SMP_CACHE_BYTES] ____cacheline_aligned; @@ -513,10 +511,8 @@ static struct proc_dir_entry *perfmon_dir; static pfm_uuid_t pfm_null_uuid = {0,}; -static spinlock_t pfm_smpl_fmt_lock; -static pfm_buffer_fmt_t *pfm_buffer_fmt_list; -#define LOCK_BUF_FMT_LIST() 
spin_lock(&pfm_smpl_fmt_lock) -#define UNLOCK_BUF_FMT_LIST() spin_unlock(&pfm_smpl_fmt_lock) +static spinlock_t pfm_buffer_fmt_lock; +static LIST_HEAD(pfm_buffer_fmt_list); /* sysctl() controls */ static pfm_sysctl_t pfm_sysctl; @@ -544,14 +540,8 @@ close: pfm_vm_close }; -#define pfm_wait_task_inactive(t) wait_task_inactive(t) #define pfm_get_cpu_var(v) __ia64_per_cpu_var(v) #define pfm_get_cpu_data(a,b) per_cpu(a, b) -typedef irqreturn_t pfm_irq_handler_t; -#define PFM_IRQ_HANDLER_RET(v) do { \ - put_cpu_no_resched(); \ - return IRQ_HANDLED; \ - } while(0); static inline void pfm_put_task(struct task_struct *task) @@ -628,7 +618,6 @@ .get_sb = pfmfs_get_sb, .kill_sb = kill_anon_super, }; - DEFINE_PER_CPU(unsigned long, pfm_syst_info); DEFINE_PER_CPU(struct task_struct *, pmu_owner); DEFINE_PER_CPU(pfm_context_t *, pmu_ctx); @@ -734,12 +723,14 @@ static inline void pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val) { - ctx->ctx_pmds[i].val = val & ~pmu_conf.ovfl_val; + unsigned long ovfl_val = pmu_conf.ovfl_val; + + ctx->ctx_pmds[i].val = val & ~ovfl_val; /* * writing to unimplemented part is ignore, so we do not need to * mask off top part */ - ia64_set_pmd(i, val & pmu_conf.ovfl_val); + ia64_set_pmd(i, val & ovfl_val); } static pfm_msg_t * @@ -870,11 +861,12 @@ { pfm_context_t *ctx = PFM_GET_CTX(task); struct thread_struct *th = &task->thread; - unsigned long mask, val; + unsigned long mask, val, ovfl_mask; int i; - DPRINT(("[%d] masking monitoring for [%d]\n", current->pid, task->pid)); + DPRINT_ovfl(("[%d] masking monitoring for [%d]\n", current->pid, task->pid)); + ovfl_mask = pmu_conf.ovfl_val; /* * monitoring can only be masked as a result of a valid * counter overflow. In UP, it means that the PMU still @@ -904,14 +896,14 @@ /* * we rebuild the full 64 bit value of the counter */ - ctx->ctx_pmds[i].val += (val & pmu_conf.ovfl_val); + ctx->ctx_pmds[i].val += (val & ovfl_mask); } else { ctx->ctx_pmds[i].val = val; } - DPRINT(("pmd[%d]=0x%lx hw_pmd=0x%lx\n", + DPRINT_ovfl(("pmd[%d]=0x%lx hw_pmd=0x%lx\n", i, ctx->ctx_pmds[i].val, - val & pmu_conf.ovfl_val)); + val & ovfl_mask)); } /* * mask monitoring by setting the privilege level to 0 @@ -926,6 +918,7 @@ if ((mask & 0x1) == 0UL) continue; ia64_set_pmc(i, th->pmcs[i] & ~0xfUL); th->pmcs[i] &= ~0xfUL; + DPRINT_ovfl(("pmc[%d]=0x%lx\n", i, th->pmcs[i])); } /* * make all of this visible @@ -943,11 +936,12 @@ { pfm_context_t *ctx = PFM_GET_CTX(task); struct thread_struct *th = &task->thread; - unsigned long mask; + unsigned long mask, ovfl_mask; unsigned long psr, val; int i, is_system; is_system = ctx->ctx_fl_system; + ovfl_mask = pmu_conf.ovfl_val; if (task != current) { printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task->pid, current->pid); @@ -989,8 +983,8 @@ * we split the 64bit value according to * counter width */ - val = ctx->ctx_pmds[i].val & pmu_conf.ovfl_val; - ctx->ctx_pmds[i].val &= ~pmu_conf.ovfl_val; + val = ctx->ctx_pmds[i].val & ovfl_mask; + ctx->ctx_pmds[i].val &= ~ovfl_mask; } else { val = ctx->ctx_pmds[i].val; } @@ -1206,12 +1200,36 @@ return ret; } +static pfm_buffer_fmt_t * +__pfm_find_buffer_fmt(pfm_uuid_t uuid) +{ + struct list_head * pos; + pfm_buffer_fmt_t * entry; - + list_for_each(pos, &pfm_buffer_fmt_list) { + entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list); + if (pfm_uuid_cmp(uuid, entry->fmt_uuid) == 0) + return entry; + } + return NULL; +} + +/* + * find a buffer format based on its uuid + */ +static pfm_buffer_fmt_t * +pfm_find_buffer_fmt(pfm_uuid_t uuid) +{ 
+ pfm_buffer_fmt_t * fmt; + spin_lock(&pfm_buffer_fmt_lock); + fmt = __pfm_find_buffer_fmt(uuid); + spin_unlock(&pfm_buffer_fmt_lock); + return fmt; +} + int pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt) { - pfm_buffer_fmt_t *p; int ret = 0; /* some sanity checks */ @@ -1224,80 +1242,44 @@ * XXX: need check validity of fmt_arg_size */ - LOCK_BUF_FMT_LIST(); - p = pfm_buffer_fmt_list; - + spin_lock(&pfm_buffer_fmt_lock); - while (p) { - if (pfm_uuid_cmp(fmt->fmt_uuid, p->fmt_uuid) == 0) break; - p = p->fmt_next; - } - - if (p) { + if (__pfm_find_buffer_fmt(fmt->fmt_uuid)) { printk(KERN_ERR "perfmon: duplicate sampling format: %s\n", fmt->fmt_name); ret = -EBUSY; - } else { - fmt->fmt_prev = NULL; - fmt->fmt_next = pfm_buffer_fmt_list; - pfm_buffer_fmt_list = fmt; - printk(KERN_ERR "perfmon: added sampling format %s\n", fmt->fmt_name); - } - UNLOCK_BUF_FMT_LIST(); + goto out; + } + list_add(&fmt->fmt_list, &pfm_buffer_fmt_list); + printk(KERN_INFO "perfmon: added sampling format %s\n", fmt->fmt_name); - return ret; +out: + spin_unlock(&pfm_buffer_fmt_lock); + return ret; } int pfm_unregister_buffer_fmt(pfm_uuid_t uuid) { - pfm_buffer_fmt_t *p; + pfm_buffer_fmt_t *fmt; int ret = 0; - LOCK_BUF_FMT_LIST(); - p = pfm_buffer_fmt_list; - while (p) { - if (memcmp(uuid, p->fmt_uuid, sizeof(pfm_uuid_t)) == 0) break; - p = p->fmt_next; - } - if (p) { - if (p->fmt_prev) - p->fmt_prev->fmt_next = p->fmt_next; - else - pfm_buffer_fmt_list = p->fmt_next; - - if (p->fmt_next) - p->fmt_next->fmt_prev = p->fmt_prev; + spin_lock(&pfm_buffer_fmt_lock); - printk(KERN_ERR "perfmon: removed sampling format: %s\n", p->fmt_name); - p->fmt_next = p->fmt_prev = NULL; - } else { + fmt = __pfm_find_buffer_fmt(uuid); + if (!fmt) { printk(KERN_ERR "perfmon: cannot unregister format, not found\n"); ret = -EINVAL; + goto out; } - UNLOCK_BUF_FMT_LIST(); + list_del_init(&fmt->fmt_list); + printk(KERN_INFO "perfmon: removed sampling format: %s\n", fmt->fmt_name); +out: + spin_unlock(&pfm_buffer_fmt_lock); return ret; } -/* - * find a buffer format based on its uuid - */ -static pfm_buffer_fmt_t * -pfm_find_buffer_fmt(pfm_uuid_t uuid, int nolock) -{ - pfm_buffer_fmt_t *p; - - LOCK_BUF_FMT_LIST(); - for (p = pfm_buffer_fmt_list; p ; p = p->fmt_next) { - if (pfm_uuid_cmp(uuid, p->fmt_uuid) == 0) break; - } - - UNLOCK_BUF_FMT_LIST(); - - return p; -} - static int pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu) { @@ -2113,7 +2095,7 @@ return 1; } static struct dentry_operations pfmfs_dentry_operations = { - .d_delete = pfmfs_delete_dentry, + .d_delete = pfmfs_delete_dentry, }; @@ -2420,7 +2402,7 @@ #define PFM_CTXARG_BUF_ARG(a) (pfm_buffer_fmt_t *)(a+1) /* invoke and lock buffer format, if found */ - fmt = pfm_find_buffer_fmt(arg->ctx_smpl_buf_id, 0); + fmt = pfm_find_buffer_fmt(arg->ctx_smpl_buf_id); if (fmt == NULL) { DPRINT(("[%d] cannot find buffer format\n", task->pid)); return -EINVAL; @@ -2528,8 +2510,7 @@ if (!pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) return 0; - /* no buffer locking here, will be called again */ - fmt = pfm_find_buffer_fmt(req->ctx_smpl_buf_id, 1); + fmt = pfm_find_buffer_fmt(req->ctx_smpl_buf_id); if (fmt == NULL) { DPRINT(("cannot find buffer format\n")); return -EINVAL; @@ -2588,7 +2569,7 @@ /* * make sure the task is off any CPU */ - pfm_wait_task_inactive(task); + wait_task_inactive(task); /* more to come... */ @@ -2679,7 +2660,6 @@ */ ctx->ctx_fl_block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0; ctx->ctx_fl_system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 
1: 0; - ctx->ctx_fl_unsecure = (ctx_flags & PFM_FL_UNSECURE) ? 1: 0; ctx->ctx_fl_is_sampling = ctx->ctx_buf_fmt ? 1 : 0; /* assume record() is defined */ ctx->ctx_fl_no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0; /* @@ -2705,13 +2685,12 @@ init_waitqueue_head(&ctx->ctx_msgq_wait); init_waitqueue_head(&ctx->ctx_zombieq); - DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d unsecure=%d no_msg=%d ctx_fd=%d \n", + DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d no_msg=%d ctx_fd=%d \n", ctx, ctx_flags, ctx->ctx_fl_system, ctx->ctx_fl_block, ctx->ctx_fl_excl_idle, - ctx->ctx_fl_unsecure, ctx->ctx_fl_no_msg, ctx->ctx_fd)); @@ -2755,14 +2734,12 @@ } static void -pfm_reset_regs_masked(pfm_context_t *ctx, unsigned long *ovfl_regs, int flag) +pfm_reset_regs_masked(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset) { unsigned long mask = ovfl_regs[0]; unsigned long reset_others = 0UL; unsigned long val; - int i, is_long_reset = (flag == PFM_PMD_LONG_RESET); - - DPRINT_ovfl(("ovfl_regs=0x%lx flag=%d\n", ovfl_regs[0], flag)); + int i; /* * now restore reset value on sampling overflowed counters @@ -2793,19 +2770,17 @@ } static void -pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int flag) +pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset) { unsigned long mask = ovfl_regs[0]; unsigned long reset_others = 0UL; unsigned long val; - int i, is_long_reset = (flag == PFM_PMD_LONG_RESET); - - DPRINT_ovfl(("ovfl_regs=0x%lx flag=%d\n", ovfl_regs[0], flag)); + int i; - if (flag == PFM_PMD_NO_RESET) return; + DPRINT_ovfl(("ovfl_regs=0x%lx is_long_reset=%d\n", ovfl_regs[0], is_long_reset)); if (ctx->ctx_state == PFM_CTX_MASKED) { - pfm_reset_regs_masked(ctx, ovfl_regs, flag); + pfm_reset_regs_masked(ctx, ovfl_regs, is_long_reset); return; } @@ -3084,7 +3059,7 @@ { struct thread_struct *thread = NULL; pfarg_reg_t *req = (pfarg_reg_t *)arg; - unsigned long value, hw_value; + unsigned long value, hw_value, ovfl_mask; unsigned int cnum; int i, can_access_pmu = 0, state; int is_counting, is_loaded, is_system; @@ -3094,6 +3069,7 @@ state = ctx->ctx_state; is_loaded = state == PFM_CTX_LOADED ? 
1 : 0; is_system = ctx->ctx_fl_system; + ovfl_mask = pmu_conf.ovfl_val; if (state == PFM_CTX_TERMINATED || state == PFM_CTX_ZOMBIE) return -EINVAL; @@ -3162,22 +3138,21 @@ * when context is load we use the split value */ if (is_loaded) { - hw_value = value & pmu_conf.ovfl_val; - value = value & ~pmu_conf.ovfl_val; + hw_value = value & ovfl_mask; + value = value & ~ovfl_mask; } - - /* - * update sampling periods - */ - ctx->ctx_pmds[cnum].long_reset = req->reg_long_reset; - ctx->ctx_pmds[cnum].short_reset = req->reg_short_reset; - - /* - * update randomization parameters - */ - ctx->ctx_pmds[cnum].seed = req->reg_random_seed; - ctx->ctx_pmds[cnum].mask = req->reg_random_mask; } + /* + * update reset values (not just for counters) + */ + ctx->ctx_pmds[cnum].long_reset = req->reg_long_reset; + ctx->ctx_pmds[cnum].short_reset = req->reg_short_reset; + + /* + * update randomization parameters (not just for counters) + */ + ctx->ctx_pmds[cnum].seed = req->reg_random_seed; + ctx->ctx_pmds[cnum].mask = req->reg_random_mask; /* * update context value @@ -3284,7 +3259,7 @@ pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) { struct thread_struct *thread = NULL; - unsigned long val = 0UL, lval ; + unsigned long val = 0UL, lval, ovfl_mask; pfarg_reg_t *req = (pfarg_reg_t *)arg; unsigned int cnum, reg_flags = 0; int i, can_access_pmu = 0, state; @@ -3299,6 +3274,7 @@ state = ctx->ctx_state; is_loaded = state == PFM_CTX_LOADED ? 1 : 0; is_system = ctx->ctx_fl_system; + ovfl_mask = pmu_conf.ovfl_val; if (state == PFM_CTX_ZOMBIE) return -EINVAL; @@ -3368,7 +3344,7 @@ /* * XXX: need to check for overflow when loaded */ - val &= pmu_conf.ovfl_val; + val &= ovfl_mask; val += ctx->ctx_pmds[cnum].val; lval = ctx->ctx_pmds[cnum].lval; @@ -3672,22 +3648,48 @@ */ ctx->ctx_state = PFM_CTX_LOADED; + /* + * XXX: not really useful for self monitoring + */ + ctx->ctx_fl_can_restart = 0; + return 0; } - /* restart another task */ + + /* + * restart another task + */ + + /* + * When PFM_CTX_MASKED, we cannot issue a restart before the previous + * one is seen by the task. + */ + if (state == PFM_CTX_MASKED) { + if (ctx->ctx_fl_can_restart == 0) return -EINVAL; + /* + * will prevent subsequent restart before this one is + * seen by other task + */ + ctx->ctx_fl_can_restart = 0; + } /* - * if blocking, then post the semaphore. + * if blocking, then post the semaphore is PFM_CTX_MASKED, i.e. + * the task is blocked or on its way to block. That's the normal + * restart path. If the monitoring is not masked, then the task + * can be actively monitoring and we cannot directly intervene. + * Therefore we use the trap mechanism to catch the task and + * force it to reset the buffer/reset PMDs. + * * if non-blocking, then we ensure that the task will go into * pfm_handle_work() before returning to user mode. + * * We cannot explicitely reset another task, it MUST always * be done by the task itself. This works for system wide because - * the tool that is controlling the session is doing "self-monitoring". - * - * XXX: what if the task never goes back to user? - * + * the tool that is controlling the session is logically doing + * "self-monitoring". 
*/ - if (CTX_OVFL_NOBLOCK(ctx) == 0) { + if (CTX_OVFL_NOBLOCK(ctx) == 0 && state == PFM_CTX_MASKED) { DPRINT(("unblocking [%d] \n", task->pid)); up(&ctx->ctx_restart_sem); } else { @@ -3725,6 +3727,9 @@ return 0; } +/* + * arg can be NULL and count can be zero for this function + */ static int pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) { @@ -3783,21 +3788,22 @@ /* * check for debug registers in system wide mode * - * We make the reservation even when context is not loaded - * to make sure we get our slot. Note that the PFM_LOAD_CONTEXT - * may still fail if the task has DBG_VALID set. + * If though a check is done in pfm_context_load(), + * we must repeat it here, in case the registers are + * written after the context is loaded */ - LOCK_PFS(); + if (is_loaded) { + LOCK_PFS(); - if (first_time && is_system) { - if (pfm_sessions.pfs_ptrace_use_dbregs) - ret = -EBUSY; - else - pfm_sessions.pfs_sys_use_dbregs++; + if (first_time && is_system) { + if (pfm_sessions.pfs_ptrace_use_dbregs) + ret = -EBUSY; + else + pfm_sessions.pfs_sys_use_dbregs++; + } + UNLOCK_PFS(); } - UNLOCK_PFS(); - if (ret != 0) return ret; /* @@ -4158,7 +4164,7 @@ unsigned long *pmcs_source, *pmds_source; int the_cpu; int ret = 0; - int state, is_system; + int state, is_system, set_dbregs = 0; state = ctx->ctx_state; is_system = ctx->ctx_fl_system; @@ -4173,7 +4179,7 @@ return -EINVAL; } - DPRINT(("load_pid [%d]\n", req->load_pid)); + DPRINT(("load_pid [%d] using_dbreg=%d\n", req->load_pid, ctx->ctx_fl_using_dbreg)); if (CTX_OVFL_NOBLOCK(ctx) == 0 && req->load_pid == current->pid) { DPRINT(("cannot use blocking mode on self for [%d]\n", current->pid)); @@ -4200,15 +4206,33 @@ thread = &task->thread; - ret = -EBUSY; - + ret = 0; /* * cannot load a context which is using range restrictions, * into a task that is being debugged. */ - if (ctx->ctx_fl_using_dbreg && (thread->flags & IA64_THREAD_DBG_VALID)) { - DPRINT(("load_pid [%d] task is debugged, cannot load range restrictions\n", req->load_pid)); - goto error; + if (ctx->ctx_fl_using_dbreg) { + if (thread->flags & IA64_THREAD_DBG_VALID) { + ret = -EBUSY; + DPRINT(("load_pid [%d] task is debugged, cannot load range restrictions\n", req->load_pid)); + goto error; + } + LOCK_PFS(); + + if (is_system) { + if (pfm_sessions.pfs_ptrace_use_dbregs) { + DPRINT(("cannot load [%d] dbregs in use\n", task->pid)); + ret = -EBUSY; + } else { + pfm_sessions.pfs_sys_use_dbregs++; + DPRINT(("load [%d] increased sys_use_dbreg=%u\n", task->pid, pfm_sessions.pfs_sys_use_dbregs)); + set_dbregs = 1; + } + } + + UNLOCK_PFS(); + + if (ret) goto error; } /* @@ -4228,13 +4252,13 @@ */ the_cpu = ctx->ctx_cpu = smp_processor_id(); + ret = -EBUSY; /* * now reserve the session */ ret = pfm_reserve_session(current, is_system, the_cpu); if (ret) goto error; - ret = -EBUSY; /* * task is necessarily stopped at this point. 
* @@ -4342,11 +4366,6 @@ /* initial saved psr (stopped) */ ctx->ctx_saved_psr_up = 0UL; ia64_psr(regs)->up = ia64_psr(regs)->pp = 0; - - if (ctx->ctx_fl_unsecure) { - ia64_psr(regs)->sp = 0; - DPRINT(("context unsecured for [%d]\n", task->pid)); - } } ret = 0; @@ -4355,6 +4374,14 @@ if (ret) pfm_unreserve_session(ctx, ctx->ctx_fl_system, the_cpu); error: /* + * we must undo the dbregs setting (for system-wide) + */ + if (ret && set_dbregs) { + LOCK_PFS(); + pfm_sessions.pfs_sys_use_dbregs--; + UNLOCK_PFS(); + } + /* * release task, there is now a link with the context */ if (is_system == 0 && task != current) { @@ -4455,7 +4482,7 @@ */ tregs = task == current ? regs : ia64_task_regs(task); - if (task == current || ctx->ctx_fl_unsecure) { + if (task == current) { /* * cancel user level control */ @@ -4493,7 +4520,10 @@ ctx->ctx_task = NULL; PFM_SET_WORK_PENDING(task, 0); - ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE; + + ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE; + ctx->ctx_fl_can_restart = 0; + ctx->ctx_fl_going_zombie = 0; DPRINT(("disconnected [%d] from context\n", task->pid)); @@ -4686,7 +4716,7 @@ UNPROTECT_CTX(ctx, flags); - pfm_wait_task_inactive(task); + wait_task_inactive(task); PROTECT_CTX(ctx, flags); @@ -4725,7 +4755,8 @@ PFM_CMD_IDX(cmd), PFM_CMD_IS_VALID(cmd), PFM_CMD_NARG(cmd), - PFM_CMD_ARG_SIZE(cmd), count)); + PFM_CMD_ARG_SIZE(cmd), + count)); /* * check if number of arguments matches what the command expects @@ -4842,8 +4873,10 @@ { pfm_buffer_fmt_t *fmt = ctx->ctx_buf_fmt; pfm_ovfl_ctrl_t rst_ctrl; + int state; int ret = 0; + state = ctx->ctx_state; /* * Unlock sampling buffer and reset index atomically * XXX: not really needed when blocking @@ -4853,9 +4886,10 @@ rst_ctrl.bits.mask_monitoring = 0; rst_ctrl.bits.reset_ovfl_pmds = 1; - /* XXX: check return value */ - if (fmt->fmt_restart) - ret = (*fmt->fmt_restart)(current, &rst_ctrl, ctx->ctx_smpl_hdr, regs); + if (state == PFM_CTX_LOADED) + ret = pfm_buf_fmt_restart_active(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs); + else + ret = pfm_buf_fmt_restart(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs); } else { rst_ctrl.bits.mask_monitoring = 0; rst_ctrl.bits.reset_ovfl_pmds = 1; @@ -4876,7 +4910,6 @@ } } - /* * context MUST BE LOCKED when calling * can only be called for current @@ -4954,7 +4987,7 @@ reason = ctx->ctx_fl_trap_reason; ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE; - DPRINT(("[%d] reason=%d\n", current->pid, reason)); + DPRINT(("[%d] reason=%d state=%d\n", current->pid, reason, ctx->ctx_state)); /* * must be done before we check non-blocking mode @@ -5085,7 +5118,7 @@ { pfm_ovfl_arg_t ovfl_arg; unsigned long mask; - unsigned long old_val; + unsigned long old_val, ovfl_val; unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL, smpl_pmds = 0UL; unsigned long tstamp; pfm_ovfl_ctrl_t ovfl_ctrl; @@ -5101,7 +5134,8 @@ tstamp = ia64_get_itc(); - mask = pmc0 >> PMU_FIRST_COUNTER; + mask = pmc0 >> PMU_FIRST_COUNTER; + ovfl_val = pmu_conf.ovfl_val; DPRINT_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s " "used_pmds=0x%lx reload_pmcs=0x%lx\n", @@ -5133,7 +5167,7 @@ * pfm_read_pmds(). 
*/ old_val = ctx->ctx_pmds[i].val; - ctx->ctx_pmds[i].val += 1 + pmu_conf.ovfl_val; + ctx->ctx_pmds[i].val += 1 + ovfl_val; /* * check for overflow condition @@ -5145,7 +5179,7 @@ DPRINT_ovfl(("ctx_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx ovfl_pmds=0x%lx ovfl_notify=0x%lx smpl_pmds=0x%lx\n", i, ctx->ctx_pmds[i].val, old_val, - ia64_get_pmd(i) & pmu_conf.ovfl_val, ovfl_pmds, ovfl_notify, smpl_pmds)); + ia64_get_pmd(i) & ovfl_val, ovfl_pmds, ovfl_notify, smpl_pmds)); } /* @@ -5196,6 +5230,7 @@ for(j=0, k=0; smpl_pmds; j++, smpl_pmds >>=1) { if ((smpl_pmds & 0x1) == 0) continue; ovfl_arg.smpl_pmds_values[k++] = PMD_IS_COUNTING(j) ? pfm_read_soft_counter(ctx, j) : ia64_get_pmd(j); + DPRINT_ovfl(("smpl_pmd[%d]=pmd%u=0x%lx\n", k-1, j, ovfl_arg.smpl_pmds_values[k-1])); } } @@ -5294,6 +5329,7 @@ if (ovfl_ctrl.bits.mask_monitoring) { pfm_mask_monitoring(task); ctx->ctx_state = PFM_CTX_MASKED; + ctx->ctx_fl_can_restart = 1; } /* @@ -5376,12 +5412,10 @@ */ /* sanity check */ - if (!ctx) goto report_spurious; + if (!ctx) goto report_spurious1; - if (ctx->ctx_fl_system == 0 && (task->thread.flags & IA64_THREAD_PM_VALID) == 0) { - printk("perfmon: current [%d] owner = [%d] PMVALID=0 state=%d\n", current->pid, task->pid, ctx->ctx_state); - goto report_spurious; - } + if (ctx->ctx_fl_system == 0 && (task->thread.flags & IA64_THREAD_PM_VALID) == 0) + goto report_spurious2; PROTECT_CTX_NOPRINT(ctx, flags); @@ -5400,14 +5434,20 @@ return retval; -report_spurious: +report_spurious1: printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d has no PFM context\n", this_cpu, task->pid); pfm_unfreeze_pmu(); return -1; +report_spurious2: + printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d, invalid flag\n", + this_cpu, + task->pid); + pfm_unfreeze_pmu(); + return -1; } -static pfm_irq_handler_t +static irqreturn_t pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs) { unsigned long start_cycles, total_cycles; @@ -5436,7 +5476,8 @@ pfm_stats[this_cpu].pfm_ovfl_intr_cycles += total_cycles; } - PFM_IRQ_HANDLER_RET(); + put_cpu_no_resched(); + return IRQ_HANDLED; } @@ -5445,10 +5486,13 @@ pfm_proc_info(char *page) { char *p = page; - pfm_buffer_fmt_t *b; + struct list_head * pos; + pfm_buffer_fmt_t * entry; unsigned long psr; + int online_cpus = 0; int i; + p += sprintf(p, "perfmon version : %u.%u\n", PFM_VERSION_MAJ, PFM_VERSION_MIN); p += sprintf(p, "model : %s\n", pmu_conf.pmu_name); p += sprintf(p, "fastctxsw : %s\n", pfm_sysctl.fastctxsw > 0 ? "Yes": "No"); p += sprintf(p, "ovfl_mask : 0x%lx\n", pmu_conf.ovfl_val); @@ -5462,17 +5506,17 @@ p += sprintf(p, "CPU%-2d smpl handler calls : %lu\n", i, pfm_stats[i].pfm_smpl_handler_calls); p += sprintf(p, "CPU%-2d smpl handler cycles : %lu\n", i, pfm_stats[i].pfm_smpl_handler_cycles); p += sprintf(p, "CPU%-2d spurious intrs : %lu\n", i, pfm_stats[i].pfm_spurious_ovfl_intr_count); - p += sprintf(p, "CPU%-2d sysupdt count : %lu\n", i, pfm_stats[i].pfm_sysupdt_count); - p += sprintf(p, "CPU%-2d sysupdt cycles : %lu\n", i, pfm_stats[i].pfm_sysupdt_cycles); + p += sprintf(p, "CPU%-2d replay intrs : %lu\n", i, pfm_stats[i].pfm_replay_ovfl_intr_count); p += sprintf(p, "CPU%-2d syst_wide : %d\n" , i, pfm_get_cpu_data(pfm_syst_info, i) & PFM_CPUINFO_SYST_WIDE ? 1 : 0); p += sprintf(p, "CPU%-2d dcr_pp : %d\n" , i, pfm_get_cpu_data(pfm_syst_info, i) & PFM_CPUINFO_DCR_PP ? 1 : 0); p += sprintf(p, "CPU%-2d exclude idle : %d\n" , i, pfm_get_cpu_data(pfm_syst_info, i) & PFM_CPUINFO_EXCL_IDLE ? 
1 : 0); p += sprintf(p, "CPU%-2d owner : %d\n" , i, pfm_get_cpu_data(pmu_owner, i) ? pfm_get_cpu_data(pmu_owner, i)->pid: -1); p += sprintf(p, "CPU%-2d context : %p\n" , i, pfm_get_cpu_data(pmu_ctx, i)); p += sprintf(p, "CPU%-2d activations : %lu\n", i, pfm_get_cpu_data(pmu_activation_number,i)); + online_cpus++; } - if (num_online_cpus() == 1) + if (online_cpus == 1) { psr = pfm_get_psr(); ia64_srlz_d(); @@ -5485,7 +5529,7 @@ } LOCK_PFS(); - p += sprintf(p, "proc_sessions : %u\n" + p += sprintf(p, "proc_sessions : %u\n" "sys_sessions : %u\n" "sys_use_dbregs : %u\n" "ptrace_use_dbregs : %u\n", @@ -5495,29 +5539,30 @@ pfm_sessions.pfs_ptrace_use_dbregs); UNLOCK_PFS(); - LOCK_BUF_FMT_LIST(); + spin_lock(&pfm_buffer_fmt_lock); - for (b = pfm_buffer_fmt_list; b ; b = b->fmt_next) { + list_for_each(pos, &pfm_buffer_fmt_list) { + entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list); p += sprintf(p, "format : %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x %s\n", - b->fmt_uuid[0], - b->fmt_uuid[1], - b->fmt_uuid[2], - b->fmt_uuid[3], - b->fmt_uuid[4], - b->fmt_uuid[5], - b->fmt_uuid[6], - b->fmt_uuid[7], - b->fmt_uuid[8], - b->fmt_uuid[9], - b->fmt_uuid[10], - b->fmt_uuid[11], - b->fmt_uuid[12], - b->fmt_uuid[13], - b->fmt_uuid[14], - b->fmt_uuid[15], - b->fmt_name); + entry->fmt_uuid[0], + entry->fmt_uuid[1], + entry->fmt_uuid[2], + entry->fmt_uuid[3], + entry->fmt_uuid[4], + entry->fmt_uuid[5], + entry->fmt_uuid[6], + entry->fmt_uuid[7], + entry->fmt_uuid[8], + entry->fmt_uuid[9], + entry->fmt_uuid[10], + entry->fmt_uuid[11], + entry->fmt_uuid[12], + entry->fmt_uuid[13], + entry->fmt_uuid[14], + entry->fmt_uuid[15], + entry->fmt_name); } - UNLOCK_BUF_FMT_LIST(); + spin_unlock(&pfm_buffer_fmt_lock); return p - page; } @@ -5546,7 +5591,7 @@ * local_cpu_data->pfm_syst_info */ void -pfm_do_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin) +pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin) { struct pt_regs *regs; unsigned long dcr; @@ -5591,21 +5636,10 @@ } } -void -pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin) -{ - unsigned long start, end; - - pfm_stats[smp_processor_id()].pfm_sysupdt_count++; - start = ia64_get_itc(); - - pfm_do_syst_wide_update_task(task, info, is_ctxswin); - - end = ia64_get_itc(); - pfm_stats[smp_processor_id()].pfm_sysupdt_cycles += end-start; -} - #ifdef CONFIG_SMP +/* + * in 2.6, interrupts are masked when we come here and the runqueue lock is held + */ void pfm_save_regs(struct task_struct *task) { @@ -5706,14 +5740,11 @@ /* * unfreeze PMU if had pending overflows */ - if (t->pmcs[0] & ~1UL) pfm_unfreeze_pmu(); + if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu(); /* - * finally, unmask interrupts and allow context - * access. - * Any pended overflow interrupt may be delivered - * here and will be treated as spurious because we - * have have no PMU owner anymore. + * finally, allow context access. + * interrupts will still be masked after this call. */ pfm_unprotect_ctx_ctxsw(ctx, flags); @@ -5726,10 +5757,6 @@ } #else /* !CONFIG_SMP */ - -/* - * in 2.5, interrupts are masked when we come here - */ void pfm_save_regs(struct task_struct *task) { @@ -5836,6 +5863,9 @@ #endif /* CONFIG_SMP */ #ifdef CONFIG_SMP +/* + * in 2.6, interrupts are masked when we come here and the runqueue lock is held + */ void pfm_load_regs (struct task_struct *task) { @@ -5959,20 +5989,24 @@ * was saved. 
*/ if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) { - struct pt_regs *regs = ia64_task_regs(task); - pfm_overflow_handler(task, ctx, t->pmcs[0], regs); + /* + * reload pmc0 with the overflow information + * On McKinley PMU, this will trigger a PMU interrupt + */ + ia64_set_pmc(0, t->pmcs[0]); + ia64_srlz_d(); + t->pmcs[0] = 0UL; +#ifndef CONFIG_MCKINLEY + /* + * will replay the PMU interrupt + */ + DPRINT(("perfmon: resend irq for [%d]\n", task->pid)); + hw_resend_irq(NULL, IA64_PERFMON_VECTOR); +#endif + pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++; } /* - * we clear PMC0, to ensure that any in flight interrupt - * will not be attributed to the new context we are installing - * because the actual overflow has been processed above already. - * No real effect until we unmask interrupts at the end of the - * function. - */ - pfm_unfreeze_pmu(); - - /* * we just did a reload, so we reset the partial reload fields */ ctx->ctx_reload_pmcs[0] = 0UL; @@ -5990,13 +6024,15 @@ SET_ACTIVATION(ctx); /* - * establish new ownership. Interrupts - * are still masked at this point. + * establish new ownership. */ SET_PMU_OWNER(task, ctx); /* - * restore the psr.up bit + * restore the psr.up bit. measurement + * is active again. + * no PMU interrupt can happen at this point + * because we still have interrupts disabled. */ if (likely(psr_up)) pfm_set_psr_up(); @@ -6091,42 +6127,39 @@ pfm_restore_pmcs(t->pmcs, pmc_mask); /* - * Check for pending overflow when state was last saved. - * invoked handler is overflow status bits set. - * - * Any PMU overflow in flight at this point, will still - * be treated as spurious because we have no declared - * owner. Note that the first level interrupt handler - * DOES NOT TOUCH any PMC except PMC0 for which we have - * a copy already. + * check for pending overflow at the time the state + * was saved. */ if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) { - struct pt_regs *regs = ia64_task_regs(task); - pfm_overflow_handler(task, ctx, t->pmcs[0], regs); - } + /* + * reload pmc0 with the overflow information + * On McKinley PMU, this will trigger a PMU interrupt + */ + ia64_set_pmc(0, t->pmcs[0]); + ia64_srlz_d(); - /* - * we clear PMC0, to ensure that any in flight interrupt - * will not be attributed to the new context we are installing - * because the actual overflow has been processed above already. - * - * This is an atomic operation. - */ - pfm_unfreeze_pmu(); + t->pmcs[0] = 0UL; + +#ifndef CONFIG_MCKINLEY + /* + * will replay the PMU interrupt + */ + DPRINT(("perfmon: resend irq for [%d]\n", task->pid)); + hw_resend_irq(NULL, IA64_PERFMON_VECTOR); +#endif + pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++; + } /* - * establish new ownership. If there was an in-flight - * overflow interrupt, it will be treated as spurious - * before and after the call, because no overflow - * status bit can possibly be set. No new overflow - * can be generated because, at this point, psr.up - * is still cleared. + * establish new ownership. */ SET_PMU_OWNER(task, ctx); /* - * restore the psr. This is the point at which - * new overflow interrupts can be generated again. + * restore the psr.up bit. measurement + * is active again. + * no PMU interrupt can happen at this point + * because we still have interrupts disabled. 
*/ if (likely(psr_up)) pfm_set_psr_up(); } @@ -6139,7 +6172,7 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) { u64 pmc0; - unsigned long mask2, val, pmd_val; + unsigned long mask2, val, pmd_val, ovfl_val; int i, can_access_pmu = 0; int is_self; @@ -6187,7 +6220,7 @@ */ task->thread.pmcs[0] &= ~0x1; } - + ovfl_val = pmu_conf.ovfl_val; /* * we save all the used pmds * we take care of overflows for counting PMDs @@ -6210,12 +6243,12 @@ task->pid, i, ctx->ctx_pmds[i].val, - val & pmu_conf.ovfl_val)); + val & ovfl_val)); /* * we rebuild the full 64 bit value of the counter */ - val = ctx->ctx_pmds[i].val + (val & pmu_conf.ovfl_val); + val = ctx->ctx_pmds[i].val + (val & ovfl_val); /* * now everything is in ctx_pmds[] and we need @@ -6228,7 +6261,7 @@ * take care of overflow inline */ if (pmc0 & (1UL << i)) { - val += 1 + pmu_conf.ovfl_val; + val += 1 + ovfl_val; DPRINT(("[%d] pmd[%d] overflowed\n", task->pid, i)); } } @@ -6338,7 +6371,7 @@ * initialize all our spinlocks */ spin_lock_init(&pfm_sessions.pfs_lock); - spin_lock_init(&pfm_smpl_fmt_lock); + spin_lock_init(&pfm_buffer_fmt_lock); init_pfm_fs(); @@ -6352,6 +6385,9 @@ __initcall(pfm_init); +/* + * this function is called before pfm_init() + */ void pfm_init_percpu (void) { @@ -6363,7 +6399,6 @@ */ pfm_clear_psr_pp(); pfm_clear_psr_up(); - if (smp_processor_id() == 0) register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); diff -Nru a/arch/ia64/kernel/perfmon_itanium.h b/arch/ia64/kernel/perfmon_itanium.h --- a/arch/ia64/kernel/perfmon_itanium.h Fri Oct 17 23:12:59 2003 +++ b/arch/ia64/kernel/perfmon_itanium.h Fri Oct 17 23:12:59 2003 @@ -81,6 +81,8 @@ */ if (cnum == 13 && ((*val & 0x1) == 0UL) && ctx->ctx_fl_using_dbreg == 0) { + DPRINT(("pmc[%d]=0x%lx has active pmc13.ta cleared, clearing ibr\n", cnum, *val)); + /* don't mix debug with perfmon */ if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; @@ -97,6 +99,8 @@ * before they are written (fl_using_dbreg==0) to avoid picking up stale information. */ if (cnum == 11 && ((*val >> 28)& 0x1) == 0 && ctx->ctx_fl_using_dbreg == 0) { + + DPRINT(("pmc[%d]=0x%lx has active pmc11.pt cleared, clearing dbr\n", cnum, *val)); /* don't mix debug with perfmon */ if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; diff -Nru a/arch/ia64/kernel/perfmon_mckinley.h b/arch/ia64/kernel/perfmon_mckinley.h --- a/arch/ia64/kernel/perfmon_mckinley.h Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/perfmon_mckinley.h Fri Oct 17 23:12:58 2003 @@ -109,10 +109,20 @@ if (ctx == NULL) return -EINVAL; /* - * we must clear the debug registers if any pmc13.ena_dbrpX bit is enabled - * before they are written (fl_using_dbreg==0) to avoid picking up stale information. + * we must clear the debug registers if pmc13 has a value which enables + * memory pipeline event constraints. In this case we need to clear the + * debug registers if they have not yet been accessed. This is required + * to avoid picking up stale state. + * PMC13 is "active" if: + * one of the pmc13.cfg_dbrpXX fields is different from 0x3 + * AND + * the corresponding pmc13.ena_dbrpXX is set. + * + * For now, we just check on cfg_dbrXX != 0x3.
*/ - if (cnum == 13 && (*val & (0xfUL << 45)) && ctx->ctx_fl_using_dbreg == 0) { + if (cnum == 13 && ((*val & 0x18181818UL) != 0x18181818UL) && ctx->ctx_fl_using_dbreg == 0) { + + DPRINT(("pmc[%d]=0x%lx has active pmc13 settings, clearing dbr\n", cnum, *val)); /* don't mix debug with perfmon */ if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; @@ -128,7 +138,9 @@ * we must clear the (instruction) debug registers if any pmc14.ibrpX bit is enabled * before they are (fl_using_dbreg==0) to avoid picking up stale information. */ - if (cnum == 14 && ((*val & 0x2222) != 0x2222) && ctx->ctx_fl_using_dbreg == 0) { + if (cnum == 14 && ((*val & 0x2222UL) != 0x2222UL) && ctx->ctx_fl_using_dbreg == 0) { + + DPRINT(("pmc[%d]=0x%lx has active pmc14 settings, clearing ibr\n", cnum, *val)); /* don't mix debug with perfmon */ if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; @@ -170,7 +182,7 @@ && ((((val14>>1) & 0x3) == 0x2 || ((val14>>1) & 0x3) == 0x0) ||(((val14>>4) & 0x3) == 0x2 || ((val14>>4) & 0x3) == 0x0)); - if (ret) printk("perfmon: failure check_case1\n"); + if (ret) DPRINT((KERN_DEBUG "perfmon: failure check_case1\n")); } return ret ? -EINVAL : 0; diff -Nru a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c --- a/arch/ia64/kernel/process.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/process.c Fri Oct 17 23:12:58 2003 @@ -685,12 +685,16 @@ (*efi.reset_system)(EFI_RESET_WARM, 0, 0, 0); } +EXPORT_SYMBOL(machine_restart); + void machine_halt (void) { cpu_halt(); } +EXPORT_SYMBOL(machine_halt); + void machine_power_off (void) { @@ -698,3 +702,5 @@ pm_power_off(); machine_halt(); } + +EXPORT_SYMBOL(machine_power_off); diff -Nru a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c --- a/arch/ia64/kernel/setup.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/setup.c Fri Oct 17 23:12:58 2003 @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include #include @@ -43,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -101,7 +104,7 @@ filter_rsvd_memory (unsigned long start, unsigned long end, void *arg) { unsigned long range_start, range_end, prev_start; - void (*func)(unsigned long, unsigned long); + void (*func)(unsigned long, unsigned long, int); int i; #if IGNORE_PFN0 @@ -122,11 +125,7 @@ range_end = min(end, rsvd_region[i].start); if (range_start < range_end) -#ifdef CONFIG_DISCONTIGMEM - call_pernode_memory(__pa(range_start), __pa(range_end), func); -#else - (*func)(__pa(range_start), range_end - range_start); -#endif + call_pernode_memory(__pa(range_start), range_end - range_start, func); /* nothing more available in this segment */ if (range_end == end) return 0; @@ -225,6 +224,25 @@ #endif } +#ifdef CONFIG_SERIAL_8250_CONSOLE +static void __init +setup_serial_legacy (void) +{ + struct uart_port port; + unsigned int i, iobase[] = {0x3f8, 0x2f8}; + + printk(KERN_INFO "Registering legacy COM ports for serial console\n"); + memset(&port, 0, sizeof(port)); + port.iotype = SERIAL_IO_PORT; + port.uartclk = BASE_BAUD * 16; + for (i = 0; i < ARRAY_SIZE(iobase); i++) { + port.line = i; + port.iobase = iobase[i]; + early_serial_setup(&port); + } +} +#endif + void __init setup_arch (char **cmdline_p) { @@ -239,7 +257,6 @@ strlcpy(saved_command_line, *cmdline_p, sizeof(saved_command_line)); efi_init(); - find_memory(); #ifdef CONFIG_ACPI_BOOT /* Initialize the ACPI boot-time table parser */ @@ -253,6 +270,8 @@ # endif #endif /* CONFIG_APCI_BOOT */ + find_memory(); + /* process SAL system table: */ 
ia64_sal_init(efi.sal_systab); @@ -297,11 +316,24 @@ #ifdef CONFIG_SERIAL_8250_HCDP if (efi.hcdp) { void setup_serial_hcdp(void *); - - /* Setup the serial ports described by HCDP */ setup_serial_hcdp(efi.hcdp); } #endif +#ifdef CONFIG_SERIAL_8250_CONSOLE + /* + * Without HCDP, we won't discover any serial ports until the serial driver looks + * in the ACPI namespace. If ACPI claims there are some legacy devices, register + * the legacy COM ports so serial console works earlier. This is slightly dangerous + * because we don't *really* know whether there's anything there, but we hope that + * all new boxes will implement HCDP. + */ + { + extern unsigned char acpi_legacy_devices; + if (!efi.hcdp && acpi_legacy_devices) + setup_serial_legacy(); + } +#endif + #ifdef CONFIG_VT # if defined(CONFIG_DUMMY_CONSOLE) conswitchp = &dummy_con; @@ -544,28 +576,7 @@ struct cpuinfo_ia64 *cpu_info; void *cpu_data; -#ifdef CONFIG_SMP - int cpu; - - /* - * get_free_pages() cannot be used before cpu_init() done. BSP allocates - * "NR_CPUS" pages for all CPUs to avoid that AP calls get_zeroed_page(). - */ - if (smp_processor_id() == 0) { - cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS, PERCPU_PAGE_SIZE, - __pa(MAX_DMA_ADDRESS)); - for (cpu = 0; cpu < NR_CPUS; cpu++) { - memcpy(cpu_data, __phys_per_cpu_start, __per_cpu_end - __per_cpu_start); - __per_cpu_offset[cpu] = (char *) cpu_data - __per_cpu_start; - cpu_data += PERCPU_PAGE_SIZE; - - per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; - } - } - cpu_data = __per_cpu_start + __per_cpu_offset[smp_processor_id()]; -#else /* !CONFIG_SMP */ - cpu_data = __phys_per_cpu_start; -#endif /* !CONFIG_SMP */ + cpu_data = per_cpu_init(); get_max_cacheline_size(); @@ -576,9 +587,6 @@ * accessing cpu_data() through the canonical per-CPU address. */ cpu_info = cpu_data + ((char *) &__ia64_per_cpu_var(cpu_info) - __per_cpu_start); -#ifdef CONFIG_NUMA - cpu_info->node_data = get_node_data_ptr(); -#endif identify_cpu(cpu_info); #ifdef CONFIG_MCKINLEY diff -Nru a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c --- a/arch/ia64/kernel/time.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/kernel/time.c Fri Oct 17 23:12:58 2003 @@ -65,8 +65,12 @@ } /* - * Return the number of nano-seconds that elapsed since the last update to jiffy. The - * xtime_lock must be at least read-locked when calling this routine. + * Return the number of nano-seconds that elapsed since the last + * update to jiffy. It is quite possible that the timer interrupt + * will interrupt this and result in a race for any of jiffies, + * wall_jiffies or itm_next. Thus, the xtime_lock must be at least + * read synchronised when calling this routine (see do_gettimeofday() + * below for an example). 
*/ unsigned long itc_get_offset (void) @@ -77,11 +81,6 @@ last_tick = (cpu_data(TIME_KEEPER_ID)->itm_next - (lost + 1)*cpu_data(TIME_KEEPER_ID)->itm_delta); - if (unlikely((long) (now - last_tick) < 0)) { - printk(KERN_ERR "CPU %d: now < last_tick (now=0x%lx,last_tick=0x%lx)!\n", - smp_processor_id(), now, last_tick); - return last_nsec_offset; - } elapsed_cycles = now - last_tick; return (elapsed_cycles*local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT; } diff -Nru a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c --- a/arch/ia64/lib/checksum.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/lib/checksum.c Fri Oct 17 23:12:58 2003 @@ -1,8 +1,8 @@ /* * Network checksum routines * - * Copyright (C) 1999 Hewlett-Packard Co - * Copyright (C) 1999 Stephane Eranian + * Copyright (C) 1999, 2003 Hewlett-Packard Co + * Stephane Eranian * * Most of the code coming from arch/alpha/lib/checksum.c * @@ -10,6 +10,7 @@ * in an architecture-specific manner due to speed.. */ +#include #include #include @@ -40,6 +41,8 @@ ((unsigned long) proto << 8)); } +EXPORT_SYMBOL(csum_tcpudp_magic); + unsigned int csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len, unsigned short proto, unsigned int sum) @@ -84,6 +87,7 @@ return result; } +EXPORT_SYMBOL(csum_partial); /* * this routine is used for miscellaneous IP-like checksums, mainly @@ -94,3 +98,5 @@ { return ~do_csum(buff,len); } + +EXPORT_SYMBOL(ip_compute_csum); diff -Nru a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c --- a/arch/ia64/lib/csum_partial_copy.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/lib/csum_partial_copy.c Fri Oct 17 23:12:58 2003 @@ -1,12 +1,13 @@ /* * Network Checksum & Copy routine * - * Copyright (C) 1999 Hewlett-Packard Co - * Copyright (C) 1999 Stephane Eranian + * Copyright (C) 1999, 2003 Hewlett-Packard Co + * Stephane Eranian * * Most of the code has been imported from Linux/Alpha */ +#include #include #include @@ -146,3 +147,4 @@ return do_csum_partial_copy_from_user(src, dst, len, sum, NULL); } +EXPORT_SYMBOL(csum_partial_copy_nocheck); diff -Nru a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c --- a/arch/ia64/mm/contig.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/mm/contig.c Fri Oct 17 23:12:58 2003 @@ -25,6 +25,10 @@ #include #include +#ifdef CONFIG_VIRTUAL_MEM_MAP +static unsigned long num_dma_physpages; +#endif + /** * show_mem - display a memory statistics summary * @@ -160,4 +164,134 @@ reserve_bootmem(bootmap_start, bootmap_size); find_initrd(); +} + +#ifdef CONFIG_SMP +/** + * per_cpu_init - setup per-cpu variables + * + * Allocate and setup per-cpu data areas. + */ +void * +per_cpu_init (void) +{ + void *cpu_data; + int cpu; + + /* + * get_free_pages() cannot be used before cpu_init() done. BSP + * allocates "NR_CPUS" pages for all CPUs to avoid that AP calls + * get_zeroed_page(). 
+ */ + if (smp_processor_id() == 0) { + cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS, + PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + for (cpu = 0; cpu < NR_CPUS; cpu++) { + memcpy(cpu_data, __phys_per_cpu_start, __per_cpu_end - __per_cpu_start); + __per_cpu_offset[cpu] = (char *) cpu_data - __per_cpu_start; + cpu_data += PERCPU_PAGE_SIZE; + per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; + } + } + return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; +} +#endif /* CONFIG_SMP */ + +static int +count_pages (u64 start, u64 end, void *arg) +{ + unsigned long *count = arg; + + *count += (end - start) >> PAGE_SHIFT; + return 0; +} + +#ifdef CONFIG_VIRTUAL_MEM_MAP +static int +count_dma_pages (u64 start, u64 end, void *arg) +{ + unsigned long *count = arg; + + if (end <= MAX_DMA_ADDRESS) + *count += (end - start) >> PAGE_SHIFT; + return 0; +} +#endif + +/* + * Set up the page tables. + */ + +void +paging_init (void) +{ + unsigned long max_dma; + unsigned long zones_size[MAX_NR_ZONES]; +#ifdef CONFIG_VIRTUAL_MEM_MAP + unsigned long zholes_size[MAX_NR_ZONES]; + unsigned long max_gap; +#endif + + /* initialize mem_map[] */ + + memset(zones_size, 0, sizeof(zones_size)); + + num_physpages = 0; + efi_memmap_walk(count_pages, &num_physpages); + + max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; + +#ifdef CONFIG_VIRTUAL_MEM_MAP + memset(zholes_size, 0, sizeof(zholes_size)); + + num_dma_physpages = 0; + efi_memmap_walk(count_dma_pages, &num_dma_physpages); + + if (max_low_pfn < max_dma) { + zones_size[ZONE_DMA] = max_low_pfn; + zholes_size[ZONE_DMA] = max_low_pfn - num_dma_physpages; + } else { + zones_size[ZONE_DMA] = max_dma; + zholes_size[ZONE_DMA] = max_dma - num_dma_physpages; + if (num_physpages > num_dma_physpages) { + zones_size[ZONE_NORMAL] = max_low_pfn - max_dma; + zholes_size[ZONE_NORMAL] = + ((max_low_pfn - max_dma) - + (num_physpages - num_dma_physpages)); + } + } + + max_gap = 0; + efi_memmap_walk(find_largest_hole, (u64 *)&max_gap); + if (max_gap < LARGE_GAP) { + vmem_map = (struct page *) 0; + free_area_init_node(0, &contig_page_data, NULL, zones_size, 0, + zholes_size); + mem_map = contig_page_data.node_mem_map; + } else { + unsigned long map_size; + + /* allocate virtual_mem_map */ + + map_size = PAGE_ALIGN(max_low_pfn * sizeof(struct page)); + vmalloc_end -= map_size; + vmem_map = (struct page *) vmalloc_end; + efi_memmap_walk(create_mem_map_page_table, 0); + + free_area_init_node(0, &contig_page_data, vmem_map, zones_size, + 0, zholes_size); + + mem_map = contig_page_data.node_mem_map; + printk("Virtual mem_map starts at 0x%p\n", mem_map); + } +#else /* !CONFIG_VIRTUAL_MEM_MAP */ + if (max_low_pfn < max_dma) + zones_size[ZONE_DMA] = max_low_pfn; + else { + zones_size[ZONE_DMA] = max_dma; + zones_size[ZONE_NORMAL] = max_low_pfn - max_dma; + } + free_area_init(zones_size); +#endif /* !CONFIG_VIRTUAL_MEM_MAP */ + zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } diff -Nru a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c --- a/arch/ia64/mm/discontig.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/mm/discontig.c Fri Oct 17 23:12:58 2003 @@ -17,72 +17,57 @@ #include #include #include +#include #include - - -/* - * Round an address upward to the next multiple of GRANULE size. 
- */ -#define GRANULEROUNDUP(n) (((n)+IA64_GRANULE_SIZE-1) & ~(IA64_GRANULE_SIZE-1)) - -static struct ia64_node_data *node_data[MAX_NUMNODES]; -static long boot_pg_data[8*MAX_NUMNODES+sizeof(pg_data_t)] __initdata; -static pg_data_t *pg_data_ptr[MAX_NUMNODES] __initdata; -static bootmem_data_t bdata[MAX_NUMNODES][NR_BANKS_PER_NODE+1] __initdata; -/* - * Return the compact node number of this cpu. Used prior to - * setting up the cpu_data area. - * Note - not fast, intended for boot use only!! - */ -int -boot_get_local_nodeid(void) -{ - int i; - - for (i = 0; i < NR_CPUS; i++) - if (node_cpuid[i].phys_id == hard_smp_processor_id()) - return node_cpuid[i].nid; - - /* node info missing, so nid should be 0.. */ - return 0; -} +#include +#include /* - * Return a pointer to the pg_data structure for a node. - * This function is used ONLY in early boot before the cpu_data - * structure is available. + * Track per-node information needed to setup the boot memory allocator, the + * per-node areas, and the real VM. */ -pg_data_t* __init -boot_get_pg_data_ptr(long node) -{ - return pg_data_ptr[node]; -} - - -/* - * Return a pointer to the node data for the current node. - * (boottime initialization only) +struct early_node_data { + struct ia64_node_data *node_data; + pg_data_t *pgdat; + unsigned long pernode_addr; + unsigned long pernode_size; + struct bootmem_data bootmem_data; + unsigned long num_physpages; + unsigned long num_dma_physpages; + unsigned long min_pfn; + unsigned long max_pfn; +}; + +static struct early_node_data mem_data[NR_NODES] __initdata; + +/* + * To prevent cache aliasing effects, align per-node structures so that they + * start at addresses that are strided by node number. + */ +#define NODEDATA_ALIGN(addr, node) \ + ((((addr) + 1024*1024-1) & ~(1024*1024-1)) + (node)*PERCPU_PAGE_SIZE) + +/** + * build_node_maps - callback to setup bootmem structs for each node + * @start: physical start of range + * @len: length of range + * @node: node where this range resides + * + * We allocate a struct bootmem_data for each piece of memory that we wish to + * treat as a virtually contiguous block (i.e. each node). Each such block + * must start on an %IA64_GRANULE_SIZE boundary, so we round the address down + * if necessary. Any non-existent pages will simply be part of the virtual + * memmap. We also update min_low_pfn and max_low_pfn here as we receive + * memory ranges from the caller. */ -struct ia64_node_data * -get_node_data_ptr(void) +static int __init build_node_maps(unsigned long start, unsigned long len, + int node) { - return node_data[boot_get_local_nodeid()]; -} + unsigned long cstart, epfn, end = start + len; + struct bootmem_data *bdp = &mem_data[node].bootmem_data; -/* - * We allocate one of the bootmem_data_t structs for each piece of memory - * that we wish to treat as a contiguous block. Each such block must start - * on a BANKSIZE boundary. Multiple banks per node is not supported. - */ -static int __init -build_maps(unsigned long pstart, unsigned long length, int node) -{ - bootmem_data_t *bdp; - unsigned long cstart, epfn; - - bdp = pg_data_ptr[node]->bdata; - epfn = GRANULEROUNDUP(pstart + length) >> PAGE_SHIFT; - cstart = pstart & ~(BANKSIZE - 1); + epfn = GRANULEROUNDUP(end) >> PAGE_SHIFT; + cstart = GRANULEROUNDDOWN(start); if (!bdp->node_low_pfn) { bdp->node_boot_start = cstart; @@ -98,34 +83,143 @@ return 0; } -/* - * Find space on each node for the bootmem map. 
+/** + * early_nr_cpus_node - return number of cpus on a given node + * @node: node to check * - * Called by efi_memmap_walk to find boot memory on each node. Note that - * only blocks that are free are passed to this routine (currently filtered by - * free_available_memory). + * Count the number of cpus on @node. We can't use nr_cpus_node() yet because + * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been + * called yet. */ -static int __init -find_bootmap_space(unsigned long pstart, unsigned long length, int node) +static int early_nr_cpus_node(int node) { - unsigned long mapsize, pages, epfn; - bootmem_data_t *bdp; + int cpu, n = 0; - epfn = (pstart + length) >> PAGE_SHIFT; - bdp = &pg_data_ptr[node]->bdata[0]; + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (node == node_cpuid[cpu].nid) + n++; + + return n; +} - if (pstart < bdp->node_boot_start || epfn > bdp->node_low_pfn) +/** + * find_pernode_space - allocate memory for memory map and per-node structures + * @start: physical start of range + * @len: length of range + * @node: node where this range resides + * + * This routine reserves space for the per-cpu data struct, the list of + * pg_data_ts and the per-node data struct. Each node will have something like + * the following in the first chunk of addr. space large enough to hold it. + * + * ________________________ + * | | + * |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first + * | PERCPU_PAGE_SIZE * | start and length big enough + * | NR_CPUS | + * |------------------------| + * | local pg_data_t * | + * |------------------------| + * | local ia64_node_data | + * |------------------------| + * | ??? | + * |________________________| + * + * Once this space has been set aside, the bootmem maps are initialized. We + * could probably move the allocation of the per-cpu and ia64_node_data space + * outside of this function and use alloc_bootmem_node(), but doing it here + * is straightforward and we get the alignments we want so... + */ +static int __init find_pernode_space(unsigned long start, unsigned long len, + int node) +{ + unsigned long epfn, cpu, cpus; + unsigned long pernodesize = 0, pernode; + void *cpu_data; + struct bootmem_data *bdp = &mem_data[node].bootmem_data; + + epfn = (start + len) >> PAGE_SHIFT; + + /* + * Make sure this memory falls within this node's usable memory + * since we may have thrown some away in build_maps(). + */ + if (start < bdp->node_boot_start || + epfn > bdp->node_low_pfn) return 0; - if (!bdp->node_bootmem_map) { - pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT); + /* Don't setup this node's local space twice... */ + if (!mem_data[node].pernode_addr) { + /* + * Calculate total size needed, incl. what's necessary + * for good alignment and alias prevention. + */ + cpus = early_nr_cpus_node(node); + pernodesize += PERCPU_PAGE_SIZE * cpus; + pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t)); + pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data)); + pernodesize = PAGE_ALIGN(pernodesize); + pernode = NODEDATA_ALIGN(start, node); + + /* Is this range big enough for what we want to store here? 
*/ + if (start + len > (pernode + pernodesize)) { + mem_data[node].pernode_addr = pernode; + mem_data[node].pernode_size = pernodesize; + memset(__va(pernode), 0, pernodesize); + + cpu_data = (void *)pernode; + pernode += PERCPU_PAGE_SIZE * cpus; + + mem_data[node].pgdat = __va(pernode); + pernode += L1_CACHE_ALIGN(sizeof(pg_data_t)); + + mem_data[node].node_data = __va(pernode); + pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data)); + + mem_data[node].pgdat->bdata = bdp; + pernode += L1_CACHE_ALIGN(sizeof(pg_data_t)); + + /* + * Copy the static per-cpu data into the region we + * just set aside and then setup __per_cpu_offset + * for each CPU on this node. + */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + if (node == node_cpuid[cpu].nid) { + memcpy(cpu_data, __phys_per_cpu_start, + __per_cpu_end-__per_cpu_start); + __per_cpu_offset[cpu] = + (char*)__va(cpu_data) - + __per_cpu_start; + cpu_data += PERCPU_PAGE_SIZE; + } + } + } + } + + pernode = mem_data[node].pernode_addr; + pernodesize = mem_data[node].pernode_size; + if (pernode && !bdp->node_bootmem_map) { + unsigned long pages, mapsize, map = 0; + + pages = bdp->node_low_pfn - + (bdp->node_boot_start >> PAGE_SHIFT); mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT; - if (length > mapsize) { - init_bootmem_node( - BOOT_NODE_DATA(node), - pstart>>PAGE_SHIFT, - bdp->node_boot_start>>PAGE_SHIFT, - bdp->node_low_pfn); + + /* + * The map will either contain the pernode area or begin + * after it. + */ + if (pernode - start > mapsize) + map = start; + else if (start + len - pernode - pernodesize > mapsize) + map = pernode + pernodesize; + + if (map) { + init_bootmem_node(mem_data[node].pgdat, + map>>PAGE_SHIFT, + bdp->node_boot_start>>PAGE_SHIFT, + bdp->node_low_pfn); } } @@ -133,85 +227,93 @@ return 0; } - -/* - * Free available memory to the bootmem allocator. - * - * Note that only blocks that are free are passed to this routine (currently - * filtered by free_available_memory). +/** + * free_node_bootmem - free bootmem allocator memory for use + * @start: physical start of range + * @len: length of range + * @node: node where this range resides * + * Simply calls the bootmem allocator to free the specified range from + * the given pg_data_t's bdata struct. After this function has been called + * for all the entries in the EFI memory map, the bootmem allocator will + * be ready to service allocation requests. */ -static int __init -discontig_free_bootmem_node(unsigned long pstart, unsigned long length, int node) +static int __init free_node_bootmem(unsigned long start, unsigned long len, + int node) { - free_bootmem_node(BOOT_NODE_DATA(node), pstart, length); + free_bootmem_node(mem_data[node].pgdat, start, len); return 0; } - -/* - * Reserve the space used by the bootmem maps. - */ -static void __init -discontig_reserve_bootmem(void) -{ - int node; - unsigned long mapbase, mapsize, pages; - bootmem_data_t *bdp; +/** + * reserve_pernode_space - reserve memory for per-node space + * + * Reserve the space used by the bootmem maps & per-node space in the boot + * allocator so that when we actually create the real mem maps we don't + * use their memory.
+ */ +static void __init reserve_pernode_space(void) +{ + unsigned long base, size, pages; + struct bootmem_data *bdp; + int node; for (node = 0; node < numnodes; node++) { - bdp = BOOT_NODE_DATA(node)->bdata; + pg_data_t *pdp = mem_data[node].pgdat; + bdp = pdp->bdata; + + /* First the bootmem_map itself */ pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT); - mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT; - mapbase = __pa(bdp->node_bootmem_map); - reserve_bootmem_node(BOOT_NODE_DATA(node), mapbase, mapsize); + size = bootmem_bootmap_pages(pages) << PAGE_SHIFT; + base = __pa(bdp->node_bootmem_map); + reserve_bootmem_node(pdp, base, size); + + /* Now the per-node space */ + size = mem_data[node].pernode_size; + base = __pa(mem_data[node].pernode_addr); + reserve_bootmem_node(pdp, base, size); } } -/* - * Allocate per node tables. - * - the pg_data structure is allocated on each node. This minimizes offnode - * memory references - * - the node data is allocated & initialized. Portions of this structure is read-only (after - * boot) and contains node-local pointers to usefuls data structures located on - * other nodes. - * - * We also switch to using the "real" pg_data structures at this point. Earlier in boot, we - * use a different structure. The only use for pg_data prior to the point in boot is to get - * the pointer to the bdata for the node. - */ -static void __init -allocate_pernode_structures(void) -{ - pg_data_t *pgdat=0, *new_pgdat_list=0; - int node, mynode; - - mynode = boot_get_local_nodeid(); - for (node = numnodes - 1; node >= 0 ; node--) { - node_data[node] = alloc_bootmem_node(BOOT_NODE_DATA(node), sizeof (struct ia64_node_data)); - pgdat = __alloc_bootmem_node(BOOT_NODE_DATA(node), sizeof(pg_data_t), SMP_CACHE_BYTES, 0); - pgdat->bdata = &(bdata[node][0]); - pg_data_ptr[node] = pgdat; - pgdat->pgdat_next = new_pgdat_list; - new_pgdat_list = pgdat; - } +/** + * initialize_pernode_data - fixup per-cpu & per-node pointers + * + * Each node's per-node area has a copy of the global pg_data_t list, so + * we copy that to each node here, as well as setting the per-cpu pointer + * to the local node data structure. The active_cpus field of the per-node + * structure gets setup by the platform_cpu_init() function later. + */ +static void __init initialize_pernode_data(void) +{ + int cpu, node; + pg_data_t *pgdat_list[NR_NODES]; - memcpy(node_data[mynode]->pg_data_ptrs, pg_data_ptr, sizeof(pg_data_ptr)); - memcpy(node_data[mynode]->node_data_ptrs, node_data, sizeof(node_data)); + for (node = 0; node < numnodes; node++) + pgdat_list[node] = mem_data[node].pgdat; - pgdat_list = new_pgdat_list; + /* Copy the pg_data_t list to each node and init the node field */ + for (node = 0; node < numnodes; node++) { + memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list, + sizeof(pgdat_list)); + } + + /* Set the node_data pointer for each per-cpu struct */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + node = node_cpuid[cpu].nid; + per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data; + } } -/* - * Called early in boot to setup the boot memory allocator, and to - * allocate the node-local pg_data & node-directory data structures.. +/** + * find_memory - walk the EFI memory map and setup the bootmem allocator + * + * Called early in boot to setup the bootmem allocator, and to + * allocate the per-cpu and per-node structures. 
*/ void __init find_memory(void) { - int node; - reserve_memory(); if (numnodes == 0) { @@ -219,94 +321,48 @@ numnodes = 1; } - for (node = 0; node < numnodes; node++) { - pg_data_ptr[node] = (pg_data_t*) &boot_pg_data[node]; - pg_data_ptr[node]->bdata = &bdata[node][0]; - } - min_low_pfn = -1; max_low_pfn = 0; - efi_memmap_walk(filter_rsvd_memory, build_maps); - efi_memmap_walk(filter_rsvd_memory, find_bootmap_space); - efi_memmap_walk(filter_rsvd_memory, discontig_free_bootmem_node); - discontig_reserve_bootmem(); - allocate_pernode_structures(); - - find_initrd(); -} - -/* - * Initialize the paging system. - * - determine sizes of each node - * - initialize the paging system for the node - * - build the nodedir for the node. This contains pointers to - * the per-bank mem_map entries. - * - fix the page struct "virtual" pointers. These are bank specific - * values that the paging system doesn't understand. - * - replicate the nodedir structure to other nodes - */ - -void __init -discontig_paging_init(void) -{ - int node, mynode; - unsigned long max_dma, zones_size[MAX_NR_ZONES]; - unsigned long kaddr, ekaddr, bid; - struct page *page; - bootmem_data_t *bdp; - - max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; + /* These actually end up getting called by call_pernode_memory() */ + efi_memmap_walk(filter_rsvd_memory, build_node_maps); + efi_memmap_walk(filter_rsvd_memory, find_pernode_space); + efi_memmap_walk(filter_rsvd_memory, free_node_bootmem); - mynode = boot_get_local_nodeid(); - for (node = 0; node < numnodes; node++) { - long pfn, startpfn; + reserve_pernode_space(); + initialize_pernode_data(); - memset(zones_size, 0, sizeof(zones_size)); + max_pfn = max_low_pfn; - startpfn = -1; - bdp = BOOT_NODE_DATA(node)->bdata; - pfn = bdp->node_boot_start >> PAGE_SHIFT; - if (startpfn == -1) - startpfn = pfn; - if (pfn > max_dma) - zones_size[ZONE_NORMAL] += (bdp->node_low_pfn - pfn); - else if (bdp->node_low_pfn < max_dma) - zones_size[ZONE_DMA] += (bdp->node_low_pfn - pfn); - else { - zones_size[ZONE_DMA] += (max_dma - pfn); - zones_size[ZONE_NORMAL] += (bdp->node_low_pfn - max_dma); - } - - free_area_init_node(node, NODE_DATA(node), NULL, zones_size, startpfn, 0); - - page = NODE_DATA(node)->node_mem_map; + find_initrd(); +} - bdp = BOOT_NODE_DATA(node)->bdata; +/** + * per_cpu_init - setup per-cpu variables + * + * find_pernode_space() does most of this already, we just need to set + * local_per_cpu_offset + */ +void *per_cpu_init(void) +{ + int cpu; - kaddr = (unsigned long)__va(bdp->node_boot_start); - ekaddr = (unsigned long)__va(bdp->node_low_pfn << PAGE_SHIFT); - while (kaddr < ekaddr) { - if (paddr_to_nid(__pa(kaddr)) == node) { - bid = BANK_MEM_MAP_INDEX(kaddr); - node_data[mynode]->node_id_map[bid] = node; - node_data[mynode]->bank_mem_map_base[bid] = page; - } - kaddr += BANKSIZE; - page += BANKSIZE/PAGE_SIZE; + if (smp_processor_id() == 0) { + for (cpu = 0; cpu < NR_CPUS; cpu++) { + per_cpu(local_per_cpu_offset, cpu) = + __per_cpu_offset[cpu]; } } - /* - * Finish setting up the node data for this node, then copy it to the other nodes. - */ - for (node=0; node < numnodes; node++) - if (mynode != node) { - memcpy(node_data[node], node_data[mynode], sizeof(struct ia64_node_data)); - node_data[node]->node = node; - } + return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; } +/** + * show_mem - give short summary of memory stats + * + * Shows a simple page count of reserved and used pages in the system. 
+ * For discontig machines, it does this on a per-pgdat basis. + */ void show_mem(void) { int i, reserved = 0; @@ -335,7 +391,12 @@ printk("%d free buffer pages\n", nr_free_buffer_pages()); } -/* +/** + * call_pernode_memory - use SRAT to call callback functions with node info + * @start: physical start of range + * @len: length of range + * @arg: function to call for each range + * * efi_memmap_walk() knows nothing about layout of memory across nodes. Find * out to which node a block of memory belongs. Ignore memory that we cannot * identify, and split blocks that run across multiple nodes. @@ -343,10 +404,10 @@ * Take this opportunity to round the start address up and the end address * down to page boundaries. */ -void call_pernode_memory(unsigned long start, unsigned long end, void *arg) +void call_pernode_memory(unsigned long start, unsigned long len, void *arg) { - unsigned long rs, re; - void (*func)(unsigned long, unsigned long, int, int); + unsigned long rs, re, end = start + len; + void (*func)(unsigned long, unsigned long, int); int i; start = PAGE_ALIGN(start); @@ -357,21 +418,127 @@ func = arg; if (!num_memblks) { - /* - * This machine doesn't have SRAT, so call func with - * nid=0, bank=0. - */ + /* No SRAT table, so assume one node (node 0) */ if (start < end) - (*func)(start, end - start, 0, 0); + (*func)(start, len, 0); return; } for (i = 0; i < num_memblks; i++) { rs = max(start, node_memblk[i].start_paddr); - re = min(end, node_memblk[i].start_paddr+node_memblk[i].size); + re = min(end, node_memblk[i].start_paddr + + node_memblk[i].size); if (rs < re) - (*func)(rs, re-rs, node_memblk[i].nid, - node_memblk[i].bank); + (*func)(rs, re - rs, node_memblk[i].nid); + + if (re == end) + break; + } +} + +/** + * count_node_pages - callback to build per-node memory info structures + * @start: physical start of range + * @len: length of range + * @node: node where this range resides + * + * Each node has its own number of physical pages, DMAable pages, start, and + * end page frame number. This routine will be called by call_pernode_memory() + * for each piece of usable memory and will set up these values for each node. + * Very similar to build_maps(). + */ +static int count_node_pages(unsigned long start, unsigned long len, int node) +{ + unsigned long end = start + len; + + mem_data[node].num_physpages += len >> PAGE_SHIFT; + if (start <= __pa(MAX_DMA_ADDRESS)) + mem_data[node].num_dma_physpages += + (min(end, __pa(MAX_DMA_ADDRESS)) - start) >>PAGE_SHIFT; + start = GRANULEROUNDDOWN(start); + start = ORDERROUNDDOWN(start); + end = GRANULEROUNDUP(end); + mem_data[node].max_pfn = max(mem_data[node].max_pfn, + end >> PAGE_SHIFT); + mem_data[node].min_pfn = min(mem_data[node].min_pfn, + start >> PAGE_SHIFT); + + return 0; +} + +/** + * paging_init - setup page tables + * + * paging_init() sets up the page tables for each node of the system and frees + * the bootmem allocator memory for general use.
+ */ +void paging_init(void) +{ + unsigned long max_dma; + unsigned long zones_size[MAX_NR_ZONES]; + unsigned long zholes_size[MAX_NR_ZONES]; + unsigned long max_gap, pfn_offset = 0; + int node; + + max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; + max_gap = 0; + efi_memmap_walk(find_largest_hole, &max_gap); + + /* so min() will work in count_node_pages */ + for (node = 0; node < numnodes; node++) + mem_data[node].min_pfn = ~0UL; + + efi_memmap_walk(filter_rsvd_memory, count_node_pages); + + for (node = 0; node < numnodes; node++) { + memset(zones_size, 0, sizeof(zones_size)); + memset(zholes_size, 0, sizeof(zholes_size)); + + num_physpages += mem_data[node].num_physpages; + + if (mem_data[node].min_pfn >= max_dma) { + /* All of this node's memory is above ZONE_DMA */ + zones_size[ZONE_NORMAL] = mem_data[node].max_pfn - + mem_data[node].min_pfn; + zholes_size[ZONE_NORMAL] = mem_data[node].max_pfn - + mem_data[node].min_pfn - + mem_data[node].num_physpages; + } else if (mem_data[node].max_pfn < max_dma) { + /* All of this node's memory is in ZONE_DMA */ + zones_size[ZONE_DMA] = mem_data[node].max_pfn - + mem_data[node].min_pfn; + zholes_size[ZONE_DMA] = mem_data[node].max_pfn - + mem_data[node].min_pfn - + mem_data[node].num_dma_physpages; + } else { + /* This node has memory in both zones */ + zones_size[ZONE_DMA] = max_dma - + mem_data[node].min_pfn; + zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - + mem_data[node].num_dma_physpages; + zones_size[ZONE_NORMAL] = mem_data[node].max_pfn - + max_dma; + zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] - + (mem_data[node].num_physpages - + mem_data[node].num_dma_physpages); + } + + if (node == 0) { + vmalloc_end -= + PAGE_ALIGN(max_low_pfn * sizeof(struct page)); + vmem_map = (struct page *) vmalloc_end; + + efi_memmap_walk(create_mem_map_page_table, 0); + printk("Virtual mem_map starts at 0x%p\n", vmem_map); + } + + pfn_offset = mem_data[node].min_pfn; + + free_area_init_node(node, NODE_DATA(node), + vmem_map + pfn_offset, zones_size, + pfn_offset, zholes_size); } + + zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } diff -Nru a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c --- a/arch/ia64/mm/hugetlbpage.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/mm/hugetlbpage.c Fri Oct 17 23:12:58 2003 @@ -20,13 +20,46 @@ #define TASK_HPAGE_BASE (REGION_HPAGE << REGION_SHIFT) -static long htlbpagemem; -int htlbpage_max; -static long htlbzone_pages; +static long htlbpagemem; +int htlbpage_max; +static long htlbzone_pages; -static LIST_HEAD(htlbpage_freelist); +static struct list_head hugepage_freelists[MAX_NUMNODES]; static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED; +static void enqueue_huge_page(struct page *page) +{ + list_add(&page->list, + &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]); +} + +static struct page *dequeue_huge_page(void) +{ + int nid = numa_node_id(); + struct page *page = NULL; + + if (list_empty(&hugepage_freelists[nid])) { + for (nid = 0; nid < MAX_NUMNODES; ++nid) + if (!list_empty(&hugepage_freelists[nid])) + break; + } + if (nid >= 0 && nid < MAX_NUMNODES && + !list_empty(&hugepage_freelists[nid])) { + page = list_entry(hugepage_freelists[nid].next, struct page, list); + list_del(&page->list); + } + return page; +} + +static struct page *alloc_fresh_huge_page(void) +{ + static int nid = 0; + struct page *page; + page = alloc_pages_node(nid, GFP_HIGHUSER, HUGETLB_PAGE_ORDER); + nid = (nid + 1) % numnodes; + return page; +} + void free_huge_page(struct page *page); static 
struct page *alloc_hugetlb_page(void) @@ -35,13 +68,11 @@ struct page *page; spin_lock(&htlbpage_lock); - if (list_empty(&htlbpage_freelist)) { + page = dequeue_huge_page(); + if (!page) { spin_unlock(&htlbpage_lock); return NULL; } - - page = list_entry(htlbpage_freelist.next, struct page, list); - list_del(&page->list); htlbpagemem--; spin_unlock(&htlbpage_lock); set_page_count(page, 1); @@ -228,7 +259,7 @@ INIT_LIST_HEAD(&page->list); spin_lock(&htlbpage_lock); - list_add(&page->list, &htlbpage_freelist); + enqueue_huge_page(page); htlbpagemem++; spin_unlock(&htlbpage_lock); } @@ -371,7 +402,7 @@ map = NULL; spin_lock(&htlbpage_lock); - list_for_each(p, &htlbpage_freelist) { + list_for_each(p, &hugepage_freelists[0]) { if (map) { list_del(&map->list); update_and_free_page(map); @@ -408,11 +439,11 @@ return (int)htlbzone_pages; if (lcount > 0) { /* Increase the mem size. */ while (lcount--) { - page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER); + page = alloc_fresh_huge_page(); if (page == NULL) break; spin_lock(&htlbpage_lock); - list_add(&page->list, &htlbpage_freelist); + enqueue_huge_page(page); htlbpagemem++; htlbzone_pages++; spin_unlock(&htlbpage_lock); @@ -449,17 +480,18 @@ static int __init hugetlb_init(void) { - int i, j; + int i; struct page *page; + for (i = 0; i < MAX_NUMNODES; ++i) + INIT_LIST_HEAD(&hugepage_freelists[i]); + for (i = 0; i < htlbpage_max; ++i) { - page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER); + page = alloc_fresh_huge_page(); if (!page) break; - for (j = 0; j < HPAGE_SIZE/PAGE_SIZE; ++j) - SetPageReserved(&page[j]); spin_lock(&htlbpage_lock); - list_add(&page->list, &htlbpage_freelist); + enqueue_huge_page(page); spin_unlock(&htlbpage_lock); } htlbpage_max = htlbpagemem = htlbzone_pages = i; diff -Nru a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c --- a/arch/ia64/mm/init.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/mm/init.c Fri Oct 17 23:12:58 2003 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -40,10 +42,10 @@ unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL; #ifdef CONFIG_VIRTUAL_MEM_MAP -# define LARGE_GAP 0x40000000 /* Use virtual mem map if hole is > than this */ unsigned long vmalloc_end = VMALLOC_END_INIT; - static struct page *vmem_map; - static unsigned long num_dma_physpages; + struct page *vmem_map; + + EXPORT_SYMBOL(vmem_map); #endif static int pgt_cache_water[2] = { 25, 50 }; @@ -337,11 +339,12 @@ #ifdef CONFIG_VIRTUAL_MEM_MAP -static int +int create_mem_map_page_table (u64 start, u64 end, void *arg) { unsigned long address, start_page, end_page; struct page *map_start, *map_end; + int node; pgd_t *pgd; pmd_t *pmd; pte_t *pte; @@ -351,19 +354,20 @@ start_page = (unsigned long) map_start & PAGE_MASK; end_page = PAGE_ALIGN((unsigned long) map_end); + node = paddr_to_nid(__pa(start)); for (address = start_page; address < end_page; address += PAGE_SIZE) { pgd = pgd_offset_k(address); if (pgd_none(*pgd)) - pgd_populate(&init_mm, pgd, alloc_bootmem_pages(PAGE_SIZE)); + pgd_populate(&init_mm, pgd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)); pmd = pmd_offset(pgd, address); if (pmd_none(*pmd)) - pmd_populate_kernel(&init_mm, pmd, alloc_bootmem_pages(PAGE_SIZE)); + pmd_populate_kernel(&init_mm, pmd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)); pte = pte_offset_kernel(pmd, address); if (pte_none(*pte)) - set_pte(pte, pfn_pte(__pa(alloc_bootmem_pages(PAGE_SIZE)) >> PAGE_SHIFT, + set_pte(pte, 
pfn_pte(__pa(alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)) >> PAGE_SHIFT, PAGE_KERNEL)); } return 0; @@ -433,17 +437,7 @@ return __get_user(byte, (char *) pfn_to_page(pfn)) == 0; } -static int -count_dma_pages (u64 start, u64 end, void *arg) -{ - unsigned long *count = arg; - - if (end <= MAX_DMA_ADDRESS) - *count += (end - start) >> PAGE_SHIFT; - return 0; -} - -static int +int find_largest_hole (u64 start, u64 end, void *arg) { u64 *max_gap = arg; @@ -458,103 +452,6 @@ return 0; } #endif /* CONFIG_VIRTUAL_MEM_MAP */ - -static int -count_pages (u64 start, u64 end, void *arg) -{ - unsigned long *count = arg; - - *count += (end - start) >> PAGE_SHIFT; - return 0; -} - -/* - * Set up the page tables. - */ - -#ifdef CONFIG_DISCONTIGMEM -void -paging_init (void) -{ - extern void discontig_paging_init(void); - - discontig_paging_init(); - efi_memmap_walk(count_pages, &num_physpages); - zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); -} -#else /* !CONFIG_DISCONTIGMEM */ -void -paging_init (void) -{ - unsigned long max_dma; - unsigned long zones_size[MAX_NR_ZONES]; -# ifdef CONFIG_VIRTUAL_MEM_MAP - unsigned long zholes_size[MAX_NR_ZONES]; - unsigned long max_gap; -# endif - - /* initialize mem_map[] */ - - memset(zones_size, 0, sizeof(zones_size)); - - num_physpages = 0; - efi_memmap_walk(count_pages, &num_physpages); - - max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; - -# ifdef CONFIG_VIRTUAL_MEM_MAP - memset(zholes_size, 0, sizeof(zholes_size)); - - num_dma_physpages = 0; - efi_memmap_walk(count_dma_pages, &num_dma_physpages); - - if (max_low_pfn < max_dma) { - zones_size[ZONE_DMA] = max_low_pfn; - zholes_size[ZONE_DMA] = max_low_pfn - num_dma_physpages; - } else { - zones_size[ZONE_DMA] = max_dma; - zholes_size[ZONE_DMA] = max_dma - num_dma_physpages; - if (num_physpages > num_dma_physpages) { - zones_size[ZONE_NORMAL] = max_low_pfn - max_dma; - zholes_size[ZONE_NORMAL] = ((max_low_pfn - max_dma) - - (num_physpages - num_dma_physpages)); - } - } - - max_gap = 0; - efi_memmap_walk(find_largest_hole, (u64 *)&max_gap); - if (max_gap < LARGE_GAP) { - vmem_map = (struct page *) 0; - free_area_init_node(0, &contig_page_data, NULL, zones_size, 0, zholes_size); - mem_map = contig_page_data.node_mem_map; - } - else { - unsigned long map_size; - - /* allocate virtual_mem_map */ - - map_size = PAGE_ALIGN(max_low_pfn * sizeof(struct page)); - vmalloc_end -= map_size; - vmem_map = (struct page *) vmalloc_end; - efi_memmap_walk(create_mem_map_page_table, 0); - - free_area_init_node(0, &contig_page_data, vmem_map, zones_size, 0, zholes_size); - - mem_map = contig_page_data.node_mem_map; - printk("Virtual mem_map starts at 0x%p\n", mem_map); - } -# else /* !CONFIG_VIRTUAL_MEM_MAP */ - if (max_low_pfn < max_dma) - zones_size[ZONE_DMA] = max_low_pfn; - else { - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = max_low_pfn - max_dma; - } - free_area_init(zones_size); -# endif /* !CONFIG_VIRTUAL_MEM_MAP */ - zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); -} -#endif /* !CONFIG_DISCONTIGMEM */ static int count_reserved_pages (u64 start, u64 end, void *arg) diff -Nru a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c --- a/arch/ia64/mm/numa.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/mm/numa.c Fri Oct 17 23:12:58 2003 @@ -11,12 +11,19 @@ */ #include +#include #include +#include #include +#include #include #include #include +static struct memblk *sysfs_memblks; +static struct node *sysfs_nodes; +static struct cpu *sysfs_cpus; + /* * The following 
structures are usually initialized by ACPI or * similar mechanisms and describe the NUMA characteristics of the machine. @@ -43,3 +50,49 @@ return (i < num_memblks) ? node_memblk[i].nid : (num_memblks ? -1 : 0); } + +static int __init topology_init(void) +{ + int i, err = 0; + + sysfs_nodes = kmalloc(sizeof(struct node) * numnodes, GFP_KERNEL); + if (!sysfs_nodes) { + err = -ENOMEM; + goto out; + } + + sysfs_memblks = kmalloc(sizeof(struct memblk) * num_memblks, + GFP_KERNEL); + if (!sysfs_memblks) { + kfree(sysfs_nodes); + err = -ENOMEM; + goto out; + } + + sysfs_cpus = kmalloc(sizeof(struct cpu) * NR_CPUS, GFP_KERNEL); + if (!sysfs_cpus) { + kfree(sysfs_memblks); + kfree(sysfs_nodes); + err = -ENOMEM; + goto out; + } + + for (i = 0; i < numnodes; i++) + if ((err = register_node(&sysfs_nodes[i], i, 0))) + goto out; + + for (i = 0; i < num_memblks; i++) + if ((err = register_memblk(&sysfs_memblks[i], i, + &sysfs_nodes[memblk_to_node(i)]))) + goto out; + + for (i = 0; i < NR_CPUS; i++) + if (cpu_online(i)) + if((err = register_cpu(&sysfs_cpus[i], i, + &sysfs_nodes[cpu_to_node(i)]))) + goto out; + out: + return err; +} + +__initcall(topology_init); diff -Nru a/arch/ia64/sn/io/machvec/pci_bus_cvlink.c b/arch/ia64/sn/io/machvec/pci_bus_cvlink.c --- a/arch/ia64/sn/io/machvec/pci_bus_cvlink.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/sn/io/machvec/pci_bus_cvlink.c Fri Oct 17 23:12:58 2003 @@ -867,6 +867,9 @@ int i = 0; struct pci_controller *controller; + if (!ia64_platform_is("sn2")) + return 0; + /* * set pci_raw_ops, etc. */ diff -Nru a/arch/ia64/sn/io/sn2/ml_SN_intr.c b/arch/ia64/sn/io/sn2/ml_SN_intr.c --- a/arch/ia64/sn/io/sn2/ml_SN_intr.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/sn/io/sn2/ml_SN_intr.c Fri Oct 17 23:12:58 2003 @@ -285,7 +285,6 @@ cpuid_t intr_heuristic(vertex_hdl_t dev, int req_bit, int *resp_bit) { cpuid_t cpuid; - cpuid_t candidate = CPU_NONE; vertex_hdl_t pconn_vhdl; pcibr_soft_t pcibr_soft; int bit; @@ -293,30 +292,32 @@ /* XXX: gross layering violation.. */ if (hwgraph_edge_get(dev, EDGE_LBL_PCI, &pconn_vhdl) == GRAPH_SUCCESS) { pcibr_soft = pcibr_soft_get(pconn_vhdl); - if (pcibr_soft && pcibr_soft->bsi_err_intr) - candidate = ((hub_intr_t)pcibr_soft->bsi_err_intr)->i_cpuid; - } - - if (candidate != CPU_NONE) { - /* - * The cpu was chosen already when we assigned - * the error interrupt. - */ - bit = intr_reserve_level(candidate, req_bit); - if (bit >= 0) { - *resp_bit = bit; - return candidate; + if (pcibr_soft && pcibr_soft->bsi_err_intr) { + /* + * The cpu was chosen already when we assigned + * the error interrupt. + */ + cpuid = ((hub_intr_t)pcibr_soft->bsi_err_intr)->i_cpuid; + goto done; } - - printk("Cannot target interrupt to target node (%ld).\n",candidate); - return CPU_NONE; } /* * Need to choose one. Try the controlling c-brick first. */ cpuid = intr_cpu_choose_from_node(master_node_get(dev)); - if (cpuid != CPU_NONE) - return cpuid; - return intr_cpu_choose_node(); + if (cpuid == CPU_NONE) + cpuid = intr_cpu_choose_node(); + + done: + if (cpuid != CPU_NONE) { + bit = intr_reserve_level(cpuid, req_bit); + if (bit >= 0) { + *resp_bit = bit; + return cpuid; + } + } + + printk("Cannot target interrupt to target cpu (%ld).\n", cpuid); + return CPU_NONE; } diff -Nru a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c --- a/arch/ia64/sn/kernel/setup.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/sn/kernel/setup.c Fri Oct 17 23:12:58 2003 @@ -147,7 +147,6 @@ * Sets up an initial console to aid debugging. Intended primarily * for bringup. 
See start_kernel() in init/main.c. */ -#if defined(CONFIG_IA64_EARLY_PRINTK_SGI_SN) || defined(CONFIG_IA64_SGI_SN_SIM) void __init early_sn_setup(void) @@ -189,7 +188,6 @@ printk(KERN_DEBUG "early_sn_setup: setting master_node_bedrock_address to 0x%lx\n", master_node_bedrock_address); } } -#endif /* CONFIG_IA64_EARLY_PRINTK_SGI_SN */ #ifdef CONFIG_IA64_MCA extern int platform_intr_list[]; diff -Nru a/arch/ia64/sn/kernel/sn2/io.c b/arch/ia64/sn/kernel/sn2/io.c --- a/arch/ia64/sn/kernel/sn2/io.c Fri Oct 17 23:12:58 2003 +++ b/arch/ia64/sn/kernel/sn2/io.c Fri Oct 17 23:12:58 2003 @@ -11,6 +11,8 @@ #include +#ifdef CONFIG_IA64_GENERIC + #undef __sn_inb #undef __sn_inw #undef __sn_inl @@ -81,3 +83,5 @@ { return ___sn_readq (addr); } + +#endif diff -Nru a/drivers/acpi/tables.c b/drivers/acpi/tables.c --- a/drivers/acpi/tables.c Fri Oct 17 23:12:58 2003 +++ b/drivers/acpi/tables.c Fri Oct 17 23:12:58 2003 @@ -262,10 +262,17 @@ /* Map the DSDT header via the pointer in the FADT */ if (id == ACPI_DSDT) { - struct acpi_table_fadt *fadt = (struct acpi_table_fadt *) *header; + struct fadt_descriptor_rev2 *fadt = (struct fadt_descriptor_rev2 *) *header; + + if (fadt->header.revision == 3 && fadt->Xdsdt) { + *header = (void *) __acpi_map_table(fadt->Xdsdt, + sizeof(struct acpi_table_header)); + } else if (fadt->V1_dsdt) { + *header = (void *) __acpi_map_table(fadt->V1_dsdt, + sizeof(struct acpi_table_header)); + } else + *header = 0; - *header = (void *) __acpi_map_table(fadt->dsdt_addr, - sizeof(struct acpi_table_header)); if (!*header) { printk(KERN_WARNING PREFIX "Unable to map DSDT\n"); return -ENODEV; diff -Nru a/drivers/media/radio/Makefile b/drivers/media/radio/Makefile --- a/drivers/media/radio/Makefile Fri Oct 17 23:12:58 2003 +++ b/drivers/media/radio/Makefile Fri Oct 17 23:12:58 2003 @@ -2,6 +2,8 @@ # Makefile for the kernel character device drivers. # +obj-y := dummy.o + miropcm20-objs := miropcm20-rds-core.o miropcm20-radio.o obj-$(CONFIG_RADIO_AZTECH) += radio-aztech.o diff -Nru a/drivers/media/radio/dummy.c b/drivers/media/radio/dummy.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/drivers/media/radio/dummy.c Fri Oct 17 23:12:59 2003 @@ -0,0 +1 @@ +/* just so the linker knows what kind of object files it's dealing with... */ diff -Nru a/drivers/media/video/Makefile b/drivers/media/video/Makefile --- a/drivers/media/video/Makefile Fri Oct 17 23:12:58 2003 +++ b/drivers/media/video/Makefile Fri Oct 17 23:12:58 2003 @@ -7,6 +7,7 @@ zoran-objs := zr36120.o zr36120_i2c.o zr36120_mem.o zr36067-objs := zoran_procfs.o zoran_device.o \ zoran_driver.o zoran_card.o +obj-y := dummy.o obj-$(CONFIG_VIDEO_DEV) += videodev.o v4l2-common.o v4l1-compat.o diff -Nru a/drivers/media/video/dummy.c b/drivers/media/video/dummy.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/drivers/media/video/dummy.c Fri Oct 17 23:12:59 2003 @@ -0,0 +1 @@ +/* just so the linker knows what kind of object files it's dealing with... 
*/ diff -Nru a/drivers/net/tulip/media.c b/drivers/net/tulip/media.c --- a/drivers/net/tulip/media.c Fri Oct 17 23:12:58 2003 +++ b/drivers/net/tulip/media.c Fri Oct 17 23:12:58 2003 @@ -278,6 +278,10 @@ for (i = 0; i < init_length; i++) outl(init_sequence[i], ioaddr + CSR12); } + + (void) inl(ioaddr + CSR6); /* flush CSR12 writes */ + udelay(500); /* Give MII time to recover */ + tmp_info = get_u16(&misc_info[1]); if (tmp_info) tp->advertising[phy_num] = tmp_info | 1; diff -Nru a/drivers/serial/8250.c b/drivers/serial/8250.c --- a/drivers/serial/8250.c Fri Oct 17 23:12:58 2003 +++ b/drivers/serial/8250.c Fri Oct 17 23:12:58 2003 @@ -2086,6 +2086,9 @@ int __init early_serial_setup(struct uart_port *port) { + if (port->line >= ARRAY_SIZE(serial8250_ports)) + return -ENODEV; + serial8250_isa_init_ports(); serial8250_ports[port->line].port = *port; serial8250_ports[port->line].port.ops = &serial8250_pops; diff -Nru a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c --- a/drivers/serial/serial_core.c Fri Oct 17 23:12:58 2003 +++ b/drivers/serial/serial_core.c Fri Oct 17 23:12:58 2003 @@ -1859,6 +1859,9 @@ if (flow == 'r') termios.c_cflag |= CRTSCTS; + if (!port->ops) + return 0; + port->ops->set_termios(port, &termios, NULL); co->cflag = termios.c_cflag; diff -Nru a/include/asm-ia64/asmmacro.h b/include/asm-ia64/asmmacro.h --- a/include/asm-ia64/asmmacro.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/asmmacro.h Fri Oct 17 23:12:58 2003 @@ -68,20 +68,25 @@ * we'll patch out the work-around bundles with NOPs, so their impact is minimal. */ #define DO_MCKINLEY_E9_WORKAROUND + #ifdef DO_MCKINLEY_E9_WORKAROUND .section ".data.patch.mckinley_e9", "a" .previous /* workaround for Itanium 2 Errata 9: */ -# define MCKINLEY_E9_WORKAROUND \ - .xdata4 ".data.patch.mckinley_e9", 1f-.;\ -1:{ .mib; \ - nop.m 0; \ - nop.i 0; \ - br.call.sptk.many b7=1f;; \ - }; \ -1: +# define FSYS_RETURN \ + .xdata4 ".data.patch.mckinley_e9", 1f-.; \ +1:{ .mib; \ + nop.m 0; \ + mov r16=ar.pfs; \ + br.call.sptk.many b7=2f;; \ + }; \ +2:{ .mib; \ + nop.m 0; \ + mov ar.pfs=r16; \ + br.ret.sptk.many b6;; \ + } #else -# define MCKINLEY_E9_WORKAROUND +# define FSYS_RETURN br.ret.sptk.many b6 #endif #endif /* _ASM_IA64_ASMMACRO_H */ diff -Nru a/include/asm-ia64/delay.h b/include/asm-ia64/delay.h --- a/include/asm-ia64/delay.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/delay.h Fri Oct 17 23:12:58 2003 @@ -67,14 +67,15 @@ return result; } +extern void ia64_delay_loop (unsigned long loops); + static __inline__ void __delay (unsigned long loops) { - if (loops < 1) + if (unlikely(loops < 1)) return; - while (loops--) - ia64_nop(0); + ia64_delay_loop (loops - 1); } static __inline__ void diff -Nru a/include/asm-ia64/machvec_sn2.h b/include/asm-ia64/machvec_sn2.h --- a/include/asm-ia64/machvec_sn2.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/machvec_sn2.h Fri Oct 17 23:12:58 2003 @@ -99,4 +99,6 @@ #define platform_dma_sync_sg sn_dma_sync_sg #define platform_dma_supported sn_dma_supported +#include + #endif /* _ASM_IA64_MACHVEC_SN2_H */ diff -Nru a/include/asm-ia64/mca.h b/include/asm-ia64/mca.h --- a/include/asm-ia64/mca.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/mca.h Fri Oct 17 23:12:58 2003 @@ -108,8 +108,6 @@ IA64_MCA_NEW_CONTEXT = -1 /* SAL to return to new context */ }; -#define MIN_STATE_AREA_SIZE 57 - typedef struct ia64_mca_os_to_sal_state_s { u64 imots_os_status; /* OS status to SAL as to what happened * with the MCA handling. 
diff -Nru a/include/asm-ia64/mca_asm.h b/include/asm-ia64/mca_asm.h --- a/include/asm-ia64/mca_asm.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/mca_asm.h Fri Oct 17 23:12:58 2003 @@ -110,10 +110,9 @@ ;; \ dep temp1 = -1, temp1, PSR_MC, 1; \ ;; \ - movl temp2 = start_addr; \ mov cr.ipsr = temp1; \ ;; \ - INST_VA_TO_PA(temp2); \ + LOAD_PHYSICAL(p0, temp2, start_addr); \ ;; \ mov cr.iip = temp2; \ mov cr.ifs = r0; \ diff -Nru a/include/asm-ia64/meminit.h b/include/asm-ia64/meminit.h --- a/include/asm-ia64/meminit.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/meminit.h Fri Oct 17 23:12:58 2003 @@ -7,6 +7,8 @@ * for more details. */ +#include + /* * Entries defined so far: * - boot param structure itself @@ -32,10 +34,27 @@ extern void find_initrd (void); extern int filter_rsvd_memory (unsigned long start, unsigned long end, void *arg); +/* + * For rounding an address to the next IA64_GRANULE_SIZE or order + */ +#define GRANULEROUNDDOWN(n) ((n) & ~(IA64_GRANULE_SIZE-1)) +#define GRANULEROUNDUP(n) (((n)+IA64_GRANULE_SIZE-1) & ~(IA64_GRANULE_SIZE-1)) +#define ORDERROUNDDOWN(n) ((n) & ~((PAGE_SIZE< than this */ + extern unsigned long vmalloc_end; + extern struct page *vmem_map; + extern int find_largest_hole (u64 start, u64 end, void *arg); + extern int create_mem_map_page_table (u64 start, u64 end, void *arg); +#endif #endif /* meminit_h */ diff -Nru a/include/asm-ia64/mmzone.h b/include/asm-ia64/mmzone.h --- a/include/asm-ia64/mmzone.h Fri Oct 17 23:12:59 2003 +++ b/include/asm-ia64/mmzone.h Fri Oct 17 23:12:59 2003 @@ -3,7 +3,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (c) 2000 Silicon Graphics, Inc. All rights reserved. + * Copyright (c) 2000,2003 Silicon Graphics, Inc. All rights reserved. * Copyright (c) 2002 NEC Corp. * Copyright (c) 2002 Erich Focht * Copyright (c) 2002 Kimio Suganuma @@ -12,148 +12,26 @@ #define _ASM_IA64_MMZONE_H #include -#include - -/* - * Given a kaddr, find the base mem_map address for the start of the mem_map - * entries for the bank containing the kaddr. - */ -#define BANK_MEM_MAP_BASE(kaddr) local_node_data->bank_mem_map_base[BANK_MEM_MAP_INDEX(kaddr)] - -/* - * Given a kaddr, this macro return the relative map number - * within the bank. - */ -#define BANK_MAP_NR(kaddr) (BANK_OFFSET(kaddr) >> PAGE_SHIFT) - -/* - * Given a pte, this macro returns a pointer to the page struct for the pte. - */ -#define pte_page(pte) virt_to_page(PAGE_OFFSET | (pte_val(pte)&_PFN_MASK)) - -/* - * Determine if a kaddr is a valid memory address of memory that - * actually exists. - * - * The check consists of 2 parts: - * - verify that the address is a region 7 address & does not - * contain any bits that preclude it from being a valid platform - * memory address - * - verify that the chunk actually exists. - * - * Note that IO addresses are NOT considered valid addresses. - * - * Note, many platforms can simply check if kaddr exceeds a specific size. - * (However, this won't work on SGI platforms since IO space is embedded - * within the range of valid memory addresses & nodes have holes in the - * address range between banks). - */ -#define kern_addr_valid(kaddr) ({long _kav=(long)(kaddr); \ - VALID_MEM_KADDR(_kav);}) - -/* - * Given a kaddr, return a pointer to the page struct for the page. - * If the kaddr does not represent RAM memory that potentially exists, return - * a pointer the page struct for max_mapnr. IO addresses will - * return the page for max_nr. 
Addresses in unpopulated RAM banks may - * return undefined results OR may panic the system. - * - */ -#define virt_to_page(kaddr) ({long _kvtp=(long)(kaddr); \ - (VALID_MEM_KADDR(_kvtp)) \ - ? BANK_MEM_MAP_BASE(_kvtp) + BANK_MAP_NR(_kvtp) \ - : NULL;}) - -/* - * Given a page struct entry, return the physical address that the page struct represents. - * Since IA64 has all memory in the DMA zone, the following works: - */ -#define page_to_phys(page) __pa(page_address(page)) - -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) - -#define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) - -#define pfn_to_page(pfn) (struct page *)(node_mem_map(pfn_to_nid(pfn)) + node_localnr(pfn, pfn_to_nid(pfn))) - -#define pfn_to_nid(pfn) local_node_data->node_id_map[(pfn << PAGE_SHIFT) >> BANKSHIFT] - -#define page_to_pfn(page) (long)((page - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn) +#include +#include +#ifdef CONFIG_DISCONTIGMEM -/* - * pfn_valid should be made as fast as possible, and the current definition - * is valid for machines that are NUMA, but still contiguous, which is what - * is currently supported. A more generalised, but slower definition would - * be something like this - mbligh: - * ( pfn_to_pgdat(pfn) && (pfn < node_end_pfn(pfn_to_nid(pfn))) ) - */ -#define pfn_valid(pfn) (pfn < max_low_pfn) -extern unsigned long max_low_pfn; - - -#if defined(CONFIG_IA64_DIG) - -/* - * Platform definitions for DIG platform with contiguous memory. - */ -#define MAX_PHYSNODE_ID 8 /* Maximum node number +1 */ -#define MAX_PHYS_MEMORY (1UL << 40) /* 1 TB */ - -/* - * Bank definitions. - * Configurable settings for DIG: 512MB/bank: 16GB/node, - * 2048MB/bank: 64GB/node, - * 8192MB/bank: 256GB/node. - */ -#define NR_BANKS_PER_NODE 32 -#if defined(CONFIG_IA64_NODESIZE_16GB) -# define BANKSHIFT 29 -#elif defined(CONFIG_IA64_NODESIZE_64GB) -# define BANKSHIFT 31 -#elif defined(CONFIG_IA64_NODESIZE_256GB) -# define BANKSHIFT 33 -#else -# error Unsupported bank and nodesize! +#ifdef CONFIG_IA64_DIG /* DIG systems are small */ +# define MAX_PHYSNODE_ID 8 +# define NR_NODES 8 +# define NR_MEMBLKS (NR_NODES * 32) +#else /* sn2 is the biggest case, so we use that if !DIG */ +# define MAX_PHYSNODE_ID 2048 +# define NR_NODES 256 +# define NR_MEMBLKS (NR_NODES) #endif -#define BANKSIZE (1UL << BANKSHIFT) -#elif defined(CONFIG_IA64_SGI_SN2) - -/* - * SGI SN2 discontig definitions - */ -#define MAX_PHYSNODE_ID 2048 /* 2048 node ids (also called nasid) */ -#define MAX_PHYS_MEMORY (1UL << 49) - -#define NR_BANKS_PER_NODE 4 -#define BANKSHIFT 38 -#define SN2_NODE_SIZE (64UL*1024*1024*1024) /* 64GB per node */ -#define BANKSIZE (SN2_NODE_SIZE/NR_BANKS_PER_NODE) - -#endif /* CONFIG_IA64_DIG */ - -#if defined(CONFIG_IA64_DIG) || defined (CONFIG_IA64_SGI_SN2) -/* Common defines for both platforms */ -#include -#define BANK_OFFSET(addr) ((unsigned long)(addr) & (BANKSIZE-1)) -#define NR_BANKS (NR_BANKS_PER_NODE * (1 << NODES_SHIFT)) -#define NR_MEMBLKS (NR_BANKS) - -/* - * VALID_MEM_KADDR returns a boolean to indicate if a kaddr is - * potentially a valid cacheable identity mapped RAM memory address. - * Note that the RAM may or may not actually be present!! - */ -#define VALID_MEM_KADDR(kaddr) 1 - -/* - * Given a nodeid & a bank number, find the address of the mem_map - * entry for the first page of the bank. 
- */ -#define BANK_MEM_MAP_INDEX(kaddr) \ - (((unsigned long)(kaddr) & (MAX_PHYS_MEMORY-1)) >> BANKSHIFT) +extern unsigned long max_low_pfn; -#endif /* CONFIG_IA64_DIG || CONFIG_IA64_SGI_SN2 */ +#define pfn_valid(pfn) (((pfn) < max_low_pfn) && ia64_pfn_valid(pfn)) +#define page_to_pfn(page) ((unsigned long) (page - vmem_map)) +#define pfn_to_page(pfn) (vmem_map + (pfn)) +#endif /* CONFIG_DISCONTIGMEM */ #endif /* _ASM_IA64_MMZONE_H */ diff -Nru a/include/asm-ia64/nodedata.h b/include/asm-ia64/nodedata.h --- a/include/asm-ia64/nodedata.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/nodedata.h Fri Oct 17 23:12:58 2003 @@ -11,9 +11,14 @@ #ifndef _ASM_IA64_NODEDATA_H #define _ASM_IA64_NODEDATA_H +#include #include + +#include #include +#ifdef CONFIG_DISCONTIGMEM + /* * Node Data. One of these structures is located on each node of a NUMA system. */ @@ -22,10 +27,7 @@ struct ia64_node_data { short active_cpu_count; short node; - struct pglist_data *pg_data_ptrs[MAX_NUMNODES]; - struct page *bank_mem_map_base[NR_BANKS]; - struct ia64_node_data *node_data_ptrs[MAX_NUMNODES]; - short node_id_map[NR_BANKS]; + struct pglist_data *pg_data_ptrs[NR_NODES]; }; @@ -34,41 +36,17 @@ */ #define local_node_data (local_cpu_data->node_data) - -/* - * Return a pointer to the node_data structure for the specified node. - */ -#define node_data(node) (local_node_data->node_data_ptrs[node]) - -/* - * Get a pointer to the node_id/node_data for the current cpu. - * (boot time only) - */ -extern int boot_get_local_nodeid(void); -extern struct ia64_node_data *get_node_data_ptr(void); - /* * Given a node id, return a pointer to the pg_data_t for the node. - * The following 2 macros are similar. * * NODE_DATA - should be used in all code not related to system * initialization. It uses pernode data structures to minimize * offnode memory references. However, these structure are not * present during boot. This macro can be used once cpu_init * completes. - * - * BOOT_NODE_DATA - * - should be used during system initialization - * prior to freeing __initdata. It does not depend on the percpu - * area being present. - * - * NOTE: The names of these macros are misleading but are difficult to change - * since they are used in generic linux & on other architecures. */ #define NODE_DATA(nid) (local_node_data->pg_data_ptrs[nid]) -#define BOOT_NODE_DATA(nid) boot_get_pg_data_ptr((long)(nid)) -struct pglist_data; -extern struct pglist_data * __init boot_get_pg_data_ptr(long); +#endif /* CONFIG_DISCONTIGMEM */ #endif /* _ASM_IA64_NODEDATA_H */ diff -Nru a/include/asm-ia64/numa.h b/include/asm-ia64/numa.h --- a/include/asm-ia64/numa.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/numa.h Fri Oct 17 23:12:58 2003 @@ -4,7 +4,7 @@ * for more details. * * This file contains NUMA specific prototypes and definitions. 
- * + * * 2002/08/05 Erich Focht * */ @@ -12,12 +12,17 @@ #define _ASM_IA64_NUMA_H #include -#include #ifdef CONFIG_NUMA -#include #include +#include +#include +#include +#include +#include + +#include extern volatile char cpu_to_node_map[NR_CPUS] __cacheline_aligned; extern volatile cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; @@ -60,6 +65,10 @@ extern int paddr_to_nid(unsigned long paddr); #define local_nodeid (cpu_to_node_map[smp_processor_id()]) + +#else /* !CONFIG_NUMA */ + +#define paddr_to_nid(addr) 0 #endif /* CONFIG_NUMA */ diff -Nru a/include/asm-ia64/page.h b/include/asm-ia64/page.h --- a/include/asm-ia64/page.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/page.h Fri Oct 17 23:12:58 2003 @@ -94,18 +94,20 @@ #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#ifdef CONFIG_VIRTUAL_MEM_MAP +extern int ia64_pfn_valid (unsigned long pfn); +#else +# define ia64_pfn_valid(pfn) 1 +#endif + #ifndef CONFIG_DISCONTIGMEM -# ifdef CONFIG_VIRTUAL_MEM_MAP - extern int ia64_pfn_valid (unsigned long pfn); -# define pfn_valid(pfn) (((pfn) < max_mapnr) && ia64_pfn_valid(pfn)) -# else -# define pfn_valid(pfn) ((pfn) < max_mapnr) -# endif -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) +#define pfn_valid(pfn) (((pfn) < max_mapnr) && ia64_pfn_valid(pfn)) #define page_to_pfn(page) ((unsigned long) (page - mem_map)) #define pfn_to_page(pfn) (mem_map + (pfn)) +#endif /* CONFIG_DISCONTIGMEM */ + #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) -#endif +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) typedef union ia64_va { struct { diff -Nru a/include/asm-ia64/pal.h b/include/asm-ia64/pal.h --- a/include/asm-ia64/pal.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/pal.h Fri Oct 17 23:12:58 2003 @@ -405,10 +405,11 @@ * generated. * (Trap Lost ) */ - op : 3, /* Operation that - * caused the machine - * check + mi : 1, /* More information available + * call PAL_MC_ERROR_INFO */ + pi : 1, /* Precise instruction pointer */ + pm : 1, /* Precise min-state save area */ dy : 1, /* Processor dynamic * state valid @@ -450,11 +451,12 @@ * by the processor */ - reserved2 : 12, + reserved2 : 11, cc : 1, /* Cache check */ tc : 1, /* TLB check */ bc : 1, /* Bus check */ - uc : 1; /* Unknown check */ + rc : 1, /* Register file check */ + uc : 1; /* Uarch check */ } pal_processor_state_info_t; diff -Nru a/include/asm-ia64/percpu.h b/include/asm-ia64/percpu.h --- a/include/asm-ia64/percpu.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/percpu.h Fri Oct 17 23:12:58 2003 @@ -46,11 +46,13 @@ extern void percpu_modcopy(void *pcpudst, const void *src, unsigned long size); extern void setup_per_cpu_areas (void); +extern void *per_cpu_init(void); #else /* ! 
SMP */ #define per_cpu(var, cpu) ((void)cpu, per_cpu__##var) #define __get_cpu_var(var) per_cpu__##var +#define per_cpu_init() (__phys_per_cpu_start) #endif /* SMP */ diff -Nru a/include/asm-ia64/perfmon.h b/include/asm-ia64/perfmon.h --- a/include/asm-ia64/perfmon.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/perfmon.h Fri Oct 17 23:12:58 2003 @@ -38,7 +38,6 @@ */ #define PFM_FL_NOTIFY_BLOCK 0x01 /* block task on user level notifications */ #define PFM_FL_SYSTEM_WIDE 0x02 /* create a system wide context */ -#define PFM_FL_UNSECURE 0x04 /* allow unsecure monitoring for non self-monitoring task */ #define PFM_FL_OVFL_NO_MSG 0x80 /* do not post overflow/end messages for notification */ /* @@ -162,8 +161,6 @@ */ #define PFM_VERSION_MAJ 2U #define PFM_VERSION_MIN 0U -#define PFM_SMPL_HDR_VERSION_MAJ 2U -#define PFM_SMPL_HDR_VERSION_MIN 0U #define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|(PFM_VERSION_MIN & 0xffff)) #define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff) #define PFM_VERSION_MINOR(x) ((x) & 0xffff) @@ -194,9 +191,8 @@ /* * Reset PMD register flags */ -#define PFM_PMD_NO_RESET 0 +#define PFM_PMD_SHORT_RESET 0 #define PFM_PMD_LONG_RESET 1 -#define PFM_PMD_SHORT_RESET 2 typedef union { unsigned int val; @@ -223,7 +219,7 @@ } pfm_ovfl_arg_t; -typedef struct _pfm_buffer_fmt_t { +typedef struct { char *fmt_name; pfm_uuid_t fmt_uuid; size_t fmt_arg_size; @@ -237,8 +233,7 @@ int (*fmt_restart_active)(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs); int (*fmt_exit)(struct task_struct *task, void *buf, struct pt_regs *regs); - struct _pfm_buffer_fmt_t *fmt_next; - struct _pfm_buffer_fmt_t *fmt_prev; + struct list_head fmt_list; } pfm_buffer_fmt_t; extern int pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt); diff -Nru a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h --- a/include/asm-ia64/pgtable.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/pgtable.h Fri Oct 17 23:12:58 2003 @@ -174,7 +174,6 @@ return (addr & (local_cpu_data->unimpl_pa_mask)) == 0; } -#ifndef CONFIG_DISCONTIGMEM /* * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel * memory. For the return value to be meaningful, ADDR must be >= @@ -190,7 +189,6 @@ */ #define kern_addr_valid(addr) (1) -#endif /* * Now come the defines and routines to manage and access the three-level @@ -240,10 +238,8 @@ #define pte_none(pte) (!pte_val(pte)) #define pte_present(pte) (pte_val(pte) & (_PAGE_P | _PAGE_PROTNONE)) #define pte_clear(pte) (pte_val(*(pte)) = 0UL) -#ifndef CONFIG_DISCONTIGMEM /* pte_page() returns the "struct page *" corresponding to the PTE: */ #define pte_page(pte) virt_to_page(((pte_val(pte) & _PFN_MASK) + PAGE_OFFSET)) -#endif #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_bad(pmd) (!ia64_phys_addr_valid(pmd_val(pmd))) diff -Nru a/include/asm-ia64/posix_types.h b/include/asm-ia64/posix_types.h --- a/include/asm-ia64/posix_types.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/posix_types.h Fri Oct 17 23:12:58 2003 @@ -10,7 +10,7 @@ * David Mosberger-Tang */ -typedef unsigned int __kernel_ino_t; +typedef unsigned long __kernel_ino_t; typedef unsigned int __kernel_mode_t; typedef unsigned int __kernel_nlink_t; typedef long __kernel_off_t; diff -Nru a/include/asm-ia64/serial.h b/include/asm-ia64/serial.h --- a/include/asm-ia64/serial.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/serial.h Fri Oct 17 23:12:58 2003 @@ -4,8 +4,6 @@ * Derived from the i386 version. */ -#include - /* * This assumes you have a 1.8432 MHz clock for your UART. 
* @@ -15,107 +13,7 @@ */ #define BASE_BAUD ( 1843200 / 16 ) -#define CONFIG_SERIAL_DETECT_IRQ /* on IA-64, we always want to autodetect irqs */ - -/* Standard COM flags (except for COM4, because of the 8514 problem) */ -#ifdef CONFIG_SERIAL_DETECT_IRQ -#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ) -#define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_AUTO_IRQ) -#else -#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST) -#define STD_COM4_FLAGS ASYNC_BOOT_AUTOCONF -#endif - -#ifdef CONFIG_SERIAL_MANY_PORTS -#define FOURPORT_FLAGS ASYNC_FOURPORT -#define ACCENT_FLAGS 0 -#define BOCA_FLAGS 0 -#define HUB6_FLAGS 0 -#define RS_TABLE_SIZE 64 -#else -#define RS_TABLE_SIZE -#endif - /* - * The following define the access methods for the HUB6 card. All - * access is through two ports for all 24 possible chips. The card is - * selected through the high 2 bits, the port on that card with the - * "middle" 3 bits, and the register on that port with the bottom - * 3 bits. - * - * While the access port and interrupt is configurable, the default - * port locations are 0x302 for the port control register, and 0x303 - * for the data read/write register. Normally, the interrupt is at irq3 - * but can be anything from 3 to 7 inclusive. Note that using 3 will - * require disabling com2. - */ - -#define C_P(card,port) (((card)<<6|(port)<<3) + 1) - -#define STD_SERIAL_PORT_DEFNS \ - /* UART CLK PORT IRQ FLAGS */ \ - { 0, BASE_BAUD, 0x3F8, 4, STD_COM_FLAGS }, /* ttyS0 */ \ - { 0, BASE_BAUD, 0x2F8, 3, STD_COM_FLAGS }, /* ttyS1 */ \ - { 0, BASE_BAUD, 0x3E8, 4, STD_COM_FLAGS }, /* ttyS2 */ \ - { 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS }, /* ttyS3 */ - -#ifdef CONFIG_SERIAL_MANY_PORTS -#define EXTRA_SERIAL_PORT_DEFNS \ - { 0, BASE_BAUD, 0x1A0, 9, FOURPORT_FLAGS }, /* ttyS4 */ \ - { 0, BASE_BAUD, 0x1A8, 9, FOURPORT_FLAGS }, /* ttyS5 */ \ - { 0, BASE_BAUD, 0x1B0, 9, FOURPORT_FLAGS }, /* ttyS6 */ \ - { 0, BASE_BAUD, 0x1B8, 9, FOURPORT_FLAGS }, /* ttyS7 */ \ - { 0, BASE_BAUD, 0x2A0, 5, FOURPORT_FLAGS }, /* ttyS8 */ \ - { 0, BASE_BAUD, 0x2A8, 5, FOURPORT_FLAGS }, /* ttyS9 */ \ - { 0, BASE_BAUD, 0x2B0, 5, FOURPORT_FLAGS }, /* ttyS10 */ \ - { 0, BASE_BAUD, 0x2B8, 5, FOURPORT_FLAGS }, /* ttyS11 */ \ - { 0, BASE_BAUD, 0x330, 4, ACCENT_FLAGS }, /* ttyS12 */ \ - { 0, BASE_BAUD, 0x338, 4, ACCENT_FLAGS }, /* ttyS13 */ \ - { 0, BASE_BAUD, 0x000, 0, 0 }, /* ttyS14 (spare) */ \ - { 0, BASE_BAUD, 0x000, 0, 0 }, /* ttyS15 (spare) */ \ - { 0, BASE_BAUD, 0x100, 12, BOCA_FLAGS }, /* ttyS16 */ \ - { 0, BASE_BAUD, 0x108, 12, BOCA_FLAGS }, /* ttyS17 */ \ - { 0, BASE_BAUD, 0x110, 12, BOCA_FLAGS }, /* ttyS18 */ \ - { 0, BASE_BAUD, 0x118, 12, BOCA_FLAGS }, /* ttyS19 */ \ - { 0, BASE_BAUD, 0x120, 12, BOCA_FLAGS }, /* ttyS20 */ \ - { 0, BASE_BAUD, 0x128, 12, BOCA_FLAGS }, /* ttyS21 */ \ - { 0, BASE_BAUD, 0x130, 12, BOCA_FLAGS }, /* ttyS22 */ \ - { 0, BASE_BAUD, 0x138, 12, BOCA_FLAGS }, /* ttyS23 */ \ - { 0, BASE_BAUD, 0x140, 12, BOCA_FLAGS }, /* ttyS24 */ \ - { 0, BASE_BAUD, 0x148, 12, BOCA_FLAGS }, /* ttyS25 */ \ - { 0, BASE_BAUD, 0x150, 12, BOCA_FLAGS }, /* ttyS26 */ \ - { 0, BASE_BAUD, 0x158, 12, BOCA_FLAGS }, /* ttyS27 */ \ - { 0, BASE_BAUD, 0x160, 12, BOCA_FLAGS }, /* ttyS28 */ \ - { 0, BASE_BAUD, 0x168, 12, BOCA_FLAGS }, /* ttyS29 */ \ - { 0, BASE_BAUD, 0x170, 12, BOCA_FLAGS }, /* ttyS30 */ \ - { 0, BASE_BAUD, 0x178, 12, BOCA_FLAGS }, /* ttyS31 */ -#else -#define EXTRA_SERIAL_PORT_DEFNS -#endif - -/* You can have up to four HUB6's in the system, but I've only - * included two cards here for a total 
of twelve ports. + * All legacy serial ports should be enumerated via ACPI namespace, so + * we need not list them here. */ -#if (defined(CONFIG_HUB6) && defined(CONFIG_SERIAL_MANY_PORTS)) -#define HUB6_SERIAL_PORT_DFNS \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(0,0) }, /* ttyS32 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(0,1) }, /* ttyS33 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(0,2) }, /* ttyS34 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(0,3) }, /* ttyS35 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(0,4) }, /* ttyS36 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(0,5) }, /* ttyS37 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(1,0) }, /* ttyS38 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(1,1) }, /* ttyS39 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(1,2) }, /* ttyS40 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(1,3) }, /* ttyS41 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(1,4) }, /* ttyS42 */ \ - { 0, BASE_BAUD, 0x302, 3, HUB6_FLAGS, C_P(1,5) }, /* ttyS43 */ -#else -#define HUB6_SERIAL_PORT_DFNS -#endif - -#define SERIAL_PORT_DFNS \ - STD_SERIAL_PORT_DEFNS \ - EXTRA_SERIAL_PORT_DEFNS \ - HUB6_SERIAL_PORT_DFNS - diff -Nru a/include/asm-ia64/sn/nodepda.h b/include/asm-ia64/sn/nodepda.h --- a/include/asm-ia64/sn/nodepda.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/sn/nodepda.h Fri Oct 17 23:12:58 2003 @@ -128,7 +128,7 @@ * Check if given a compact node id the corresponding node has all the * cpus disabled. */ -#define is_headless_node(cnode) (!any_online_cpu(node_to_cpumask(cnode))) +#define is_headless_node(cnode) (!node_to_cpu_mask[cnode]) /* * Check if given a node vertex handle the corresponding node has all the diff -Nru a/include/asm-ia64/uaccess.h b/include/asm-ia64/uaccess.h --- a/include/asm-ia64/uaccess.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/uaccess.h Fri Oct 17 23:12:58 2003 @@ -408,11 +408,7 @@ extern void handle_exception (struct pt_regs *regs, const struct exception_table_entry *e); extern const struct exception_table_entry *search_exception_tables (unsigned long addr); -#ifdef GAS_HAS_LOCAL_TAGS # define SEARCH_EXCEPTION_TABLE(regs) search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri) -#else -# define SEARCH_EXCEPTION_TABLE(regs) search_exception_tables(regs->cr_iip) -#endif static inline int done_with_exception (struct pt_regs *regs) diff -Nru a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h --- a/include/asm-ia64/unistd.h Fri Oct 17 23:12:58 2003 +++ b/include/asm-ia64/unistd.h Fri Oct 17 23:12:58 2003 @@ -237,17 +237,17 @@ #define __NR_epoll_wait 1245 #define __NR_restart_syscall 1246 #define __NR_semtimedop 1247 -#define __NR_sys_timer_create 1248 -#define __NR_sys_timer_settime 1249 -#define __NR_sys_timer_gettime 1250 -#define __NR_sys_timer_getoverrun 1251 -#define __NR_sys_timer_delete 1252 -#define __NR_sys_clock_settime 1253 -#define __NR_sys_clock_gettime 1254 -#define __NR_sys_clock_getres 1255 -#define __NR_sys_clock_nanosleep 1256 -#define __NR_sys_fstatfs64 1257 -#define __NR_sys_statfs64 1258 +#define __NR_timer_create 1248 +#define __NR_timer_settime 1249 +#define __NR_timer_gettime 1250 +#define __NR_timer_getoverrun 1251 +#define __NR_timer_delete 1252 +#define __NR_clock_settime 1253 +#define __NR_clock_gettime 1254 +#define __NR_clock_getres 1255 +#define __NR_clock_nanosleep 1256 +#define __NR_fstatfs64 1257 +#define __NR_statfs64 1258 #ifdef __KERNEL__ diff -Nru a/include/linux/module.h b/include/linux/module.h --- a/include/linux/module.h Fri Oct 17 23:12:58 2003 +++ 
b/include/linux/module.h Fri Oct 17 23:12:58 2003 @@ -60,10 +60,11 @@ #define __module_cat(a,b) ___module_cat(a,b) #define __MODULE_INFO(tag, name, info) \ static const char __module_cat(name,__LINE__)[] \ + __attribute_used__ \ __attribute__((section(".modinfo"),unused)) = __stringify(tag) "=" info -#define MODULE_GENERIC_TABLE(gtype,name) \ -extern const struct gtype##_id __mod_##gtype##_table \ +#define MODULE_GENERIC_TABLE(gtype,name) \ +extern const struct gtype##_id __mod_##gtype##_table \ __attribute__ ((unused, alias(__stringify(name)))) #define THIS_MODULE (&__this_module) @@ -142,6 +143,7 @@ #define __CRC_SYMBOL(sym, sec) \ extern void *__crc_##sym __attribute__((weak)); \ static const unsigned long __kcrctab_##sym \ + __attribute_used__ \ __attribute__((section("__kcrctab" sec), unused)) \ = (unsigned long) &__crc_##sym; #else @@ -155,6 +157,7 @@ __attribute__((section("__ksymtab_strings"))) \ = MODULE_SYMBOL_PREFIX #sym; \ static const struct kernel_symbol __ksymtab_##sym \ + __attribute_used__ \ __attribute__((section("__ksymtab" sec), unused)) \ = { (unsigned long)&sym, __kstrtab_##sym } diff -Nru a/include/linux/moduleparam.h b/include/linux/moduleparam.h --- a/include/linux/moduleparam.h Fri Oct 17 23:12:58 2003 +++ b/include/linux/moduleparam.h Fri Oct 17 23:12:58 2003 @@ -52,6 +52,7 @@ #define __module_param_call(prefix, name, set, get, arg, perm) \ static char __param_str_##name[] __initdata = prefix #name; \ static struct kernel_param const __param_##name \ + __attribute_used__ \ __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \ = { __param_str_##name, perm, set, get, arg } diff -Nru a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h --- a/include/linux/nfs_fs.h Fri Oct 17 23:12:58 2003 +++ b/include/linux/nfs_fs.h Fri Oct 17 23:12:58 2003 @@ -403,7 +403,7 @@ nfs_size_to_loff_t(__u64 size) { loff_t maxsz = (((loff_t) ULONG_MAX) << PAGE_CACHE_SHIFT) + PAGE_CACHE_SIZE - 1; - if (size > maxsz) + if (size > (__u64) maxsz) return maxsz; return (loff_t) size; } diff -Nru a/include/linux/sysctl.h b/include/linux/sysctl.h --- a/include/linux/sysctl.h Fri Oct 17 23:12:58 2003 +++ b/include/linux/sysctl.h Fri Oct 17 23:12:58 2003 @@ -127,6 +127,7 @@ KERN_PANIC_ON_OOPS=57, /* int: whether we will panic on an oops */ KERN_HPPA_PWRSW=58, /* int: hppa soft-power enable */ KERN_HPPA_UNALIGNED=59, /* int: hppa unaligned-trap enable */ + KERN_CACHEDECAYTICKS=60,/* ulong: value for cache_decay_ticks (EXPERIMENTAL!) */ }; diff -Nru a/kernel/printk.c b/kernel/printk.c --- a/kernel/printk.c Fri Oct 17 23:12:59 2003 +++ b/kernel/printk.c Fri Oct 17 23:12:59 2003 @@ -361,6 +361,12 @@ __call_console_drivers(start, end); } } +#ifdef CONFIG_IA64_EARLY_PRINTK + if (!console_drivers) { + void early_printk (const char *str, size_t len); + early_printk(&LOG_BUF(start), end - start); + } +#endif } /* @@ -678,7 +684,11 @@ * for us. 
*/ spin_lock_irqsave(&logbuf_lock, flags); +#ifdef CONFIG_IA64_EARLY_PRINTK + con_start = log_end; +#else con_start = log_start; +#endif spin_unlock_irqrestore(&logbuf_lock, flags); } release_console_sem(); @@ -731,3 +741,117 @@ tty->driver->write(tty, 0, msg, strlen(msg)); return; } + +#ifdef CONFIG_IA64_EARLY_PRINTK + +#include + +# ifdef CONFIG_IA64_EARLY_PRINTK_VGA + + +#define VGABASE ((char *)0xc0000000000b8000) +#define VGALINES 24 +#define VGACOLS 80 + +static int current_ypos = VGALINES, current_xpos = 0; + +static void +early_printk_vga (const char *str, size_t len) +{ + char c; + int i, k, j; + + while (len-- > 0) { + c = *str++; + if (current_ypos >= VGALINES) { + /* scroll 1 line up */ + for (k = 1, j = 0; k < VGALINES; k++, j++) { + for (i = 0; i < VGACOLS; i++) { + writew(readw(VGABASE + 2*(VGACOLS*k + i)), + VGABASE + 2*(VGACOLS*j + i)); + } + } + for (i = 0; i < VGACOLS; i++) { + writew(0x720, VGABASE + 2*(VGACOLS*j + i)); + } + current_ypos = VGALINES-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(VGACOLS*current_ypos + current_xpos++)); + if (current_xpos >= VGACOLS) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +# endif /* CONFIG_IA64_EARLY_PRINTK_VGA */ + +# ifdef CONFIG_IA64_EARLY_PRINTK_UART + +#include +#include + +static void early_printk_uart(const char *str, size_t len) +{ + static char *uart = NULL; + unsigned long uart_base; + char c; + + if (!uart) { + uart_base = 0; +# ifdef CONFIG_SERIAL_8250_HCDP + { + extern unsigned long hcdp_early_uart(void); + uart_base = hcdp_early_uart(); + } +# endif +# if CONFIG_IA64_EARLY_PRINTK_UART_BASE + if (!uart_base) + uart_base = CONFIG_IA64_EARLY_PRINTK_UART_BASE; +# endif + if (!uart_base) + return; + + uart = ioremap(uart_base, 64); + if (!uart) + return; + } + + while (len-- > 0) { + c = *str++; + while ((readb(uart + UART_LSR) & UART_LSR_TEMT) == 0) + cpu_relax(); /* spin */ + + writeb(c, uart + UART_TX); + + if (c == '\n') + writeb('\r', uart + UART_TX); + } +} + +# endif /* CONFIG_IA64_EARLY_PRINTK_UART */ + +#ifdef CONFIG_IA64_EARLY_PRINTK_SGI_SN +extern int early_printk_sn_sal(const char *str, int len); +#endif + +void early_printk(const char *str, size_t len) +{ +#ifdef CONFIG_IA64_EARLY_PRINTK_UART + early_printk_uart(str, len); +#endif +#ifdef CONFIG_IA64_EARLY_PRINTK_VGA + early_printk_vga(str, len); +#endif +#ifdef CONFIG_IA64_EARLY_PRINTK_SGI_SN + early_printk_sn_sal(str, len); +#endif +} + +#endif /* CONFIG_IA64_EARLY_PRINTK */ diff -Nru a/kernel/sysctl.c b/kernel/sysctl.c --- a/kernel/sysctl.c Fri Oct 17 23:12:58 2003 +++ b/kernel/sysctl.c Fri Oct 17 23:12:58 2003 @@ -579,6 +579,16 @@ .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_SMP + { + .ctl_name = KERN_CACHEDECAYTICKS, + .procname = "cache_decay_ticks", + .data = &cache_decay_ticks, + .maxlen = sizeof(cache_decay_ticks), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, +#endif { .ctl_name = 0 } }; diff -Nru a/mm/memory.c b/mm/memory.c --- a/mm/memory.c Fri Oct 17 23:12:58 2003 +++ b/mm/memory.c Fri Oct 17 23:12:58 2003 @@ -121,8 +121,10 @@ } pmd = pmd_offset(dir, 0); pgd_clear(dir); - for (j = 0; j < PTRS_PER_PMD ; j++) + for (j = 0; j < PTRS_PER_PMD ; j++) { + prefetchw(pmd + j + PREFETCH_STRIDE/sizeof(*pmd)); free_one_pmd(tlb, pmd+j); + } pmd_free_tlb(tlb, pmd); }