Name: Hotplug CPU Patch: Base XII
Author: Matt Fleming, Zwane Mwaikambo, Rusty Russell
Status: Experimental
Depends: Hotcpu/thread_control.patch.gz
Depends: Hotcpu/sigpower.patch.gz
D: This is the arch-indep hotplug cpu code, contributed by Matt
D: Fleming, Zwane Mwaikambo and me.  cpumask_t adoption by Dipankar Sarma.
D:
D: When a CPU goes down, normal tasks get migrated.  If their cpu mask
D: doesn't allow this, it gets reset to "any", and they get sent a
D: SIGPWR (which CPU is going down is in the siginfo struct).  Kernel
D: threads which are bound to particular CPUs are not touched: they
D: must register notifiers to deal with this themselves.
D:
D: These changes, while widespread, are fairly well contained in
D: CONFIG_HOTPLUG_CPU.  There are no doubt more places in the kernel
D: which contain per-cpu data structures which need notifying, but
D: these are the core ones.
D:
D: Patch contains:
D: - New option: CONFIG_HOTPLUG_CPU.
D: - drivers/base/cpu.c:
D:   Add "online" attribute to sysfs.
D: - fs/buffer.c:
D:   Release the buffer head lru list for cpu when it goes offline.
D: - kernel/cpu.c:
D:   cpu_down implementation.
D:   /sbin/hotplug call for cpu activity.
D:   New events: CPU_UP_PREPARE (can be NAKed), CPU_UP_CANCELED.
D:   New events: CPU_DOWN_PREPARE (can be NAKed), CPU_DOWN_CANCELED, CPU_DEAD.
D: - kernel/rcupdate.c:
D:   Code for clearing RCU queue of dead CPUs, under CONFIG_HOTPLUG_CPU.
D:   pull_from_global_queue() in rcu_process_callbacks, noop if !HOTPLUG_CPU.
D: - kernel/sched.c:
D:   Trivial exposed wake_idle_cpu(cpu) function, useful for some archs
D:   which fake hotplug CPUs.
D:   cpu_is_offline() check when pulling tasks onto CPU: NOOP if !HOTPLUG_CPU.
D:   Code to stop migration threads (similar to workqueue.c).
D: - kernel/softirq.c:
D:   Code to stop ksoftirqd when cpu goes offline, and to migrate
D:   irqs when it finally dies (under HOTPLUG_CPU).
D: - kernel/timer.c:
D:   Code to pull timers when cpu dies, under HOTPLUG_CPU.
D: - kernel/workqueue.c:
D:   Keep list of all workqueues in system.
D:   Move name pointer into wq struct, so we can name newly created thread
D:   if CPU comes up later.
D:   Grab cpucontrol lock around workqueue creation/destruction/flush.
D:   Code to kill workqueue threads when a CPU goes offline, under HOTPLUG_CPU.
D: - mm/slab.c:
D:   Move ac_entry and ac_data helper functions earlier in file.
D:   Clean up list iterators.
D:   Added stop_cpu_timer(), under HOTPLUG_CPU.
D:   Add code to free caches when cpu goes down.
D: - mm/swap.c:
D:   Spill local vm_committed_space counters into global when cpu dies,
D:   under HOTPLUG_CPU.
D: - mm/vmscan.c:
D:   Migrate kswapd when last cpu in node goes down, and back
D:   if one comes up, under HOTPLUG_CPU.
D: - net/core/dev.c:
D:   Add callback to drain softnet queue when cpu goes down.
D:
D: Changes since base VI:
D: 1) Thread stopping code all uses completion idiom, for simplicity,
D:    a-la workqueues.  This affects ksoftirqd and migration threads.
D: 2) Moved mm/slab.c code under CONFIG_HOTPLUG_CPU.
D: 3) Simplified code in net/core/dev.c and removed some whitespace.
D:
D: Changes since base VIII:
D: 1) Ignore TASK_DEAD tasks in check_for_tasks, too.
D: 2) Expose CPU_UP_CANCELED case in mm/slab.c to !HOTPLUG_CPU.
D: 3) Hold cachep->spinlock around free_block() call.
D: 4) any_online_cpu return value fixed to NR_CPUS.
D:
D: Changes since base IX:
D: 1) cpucontrol lock around workqueue creation/destruction/flushing.
D: 2) Clean up nr_uninterruptible numbers when CPU_DEAD.
D: 3) Add kernel_thread_on_cpu for convenient thread starting.
D: 4) Initialize completion in sched.c every time cpu comes up. D: D: Changes since base X: D: 1) Use kthread_create etc. diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/drivers/base/cpu.c .17007-linux-2.6.0-test5-bk11.updated/drivers/base/cpu.c --- .17007-linux-2.6.0-test5-bk11/drivers/base/cpu.c 2003-09-22 10:27:56.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/drivers/base/cpu.c 2003-09-25 14:54:34.000000000 +1000 @@ -7,6 +7,7 @@ #include #include #include +#include struct sysdev_class cpu_sysdev_class = { @@ -14,6 +15,46 @@ struct sysdev_class cpu_sysdev_class = { }; EXPORT_SYMBOL(cpu_sysdev_class); +#ifdef CONFIG_HOTPLUG_CPU +static ssize_t show_online(struct sys_device *dev, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, sysdev); + + return sprintf(buf, "%u\n", !!cpu_online(cpu->sysdev.id)); +} + +static ssize_t store_online(struct sys_device *dev, const char *buf, + size_t count) +{ + struct cpu *cpu = container_of(dev, struct cpu, sysdev); + ssize_t ret; + + switch (buf[0]) { + case '0': + ret = cpu_down(cpu->sysdev.id); + break; + case '1': + ret = cpu_up(cpu->sysdev.id); + break; + default: + ret = -EINVAL; + } + + if (ret >= 0) + ret = count; + return ret; +} +static SYSDEV_ATTR(online, 0600, show_online, store_online); + +static void __init register_cpu_control(struct cpu *cpu) +{ + sysdev_create_file(&cpu->sysdev, &attr_online); +} +#else /* ... !CONFIG_HOTPLUG_CPU */ +static void __init register_cpu_control(struct cpu *cpu) +{ +} +#endif /* CONFIG_HOTPLUG_CPU */ /* * register_cpu - Setup a driverfs device for a CPU. @@ -23,10 +64,15 @@ EXPORT_SYMBOL(cpu_sysdev_class); */ int __init register_cpu(struct cpu *cpu, int num, struct node *root) { + int ret; + cpu->node_id = cpu_to_node(num); cpu->sysdev.id = num; cpu->sysdev.cls = &cpu_sysdev_class; - return sys_device_register(&cpu->sysdev); + ret = sys_device_register(&cpu->sysdev); + if (ret == 0) + register_cpu_control(cpu); + return ret; } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/fs/buffer.c .17007-linux-2.6.0-test5-bk11.updated/fs/buffer.c --- .17007-linux-2.6.0-test5-bk11/fs/buffer.c 2003-09-22 10:27:35.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/fs/buffer.c 2003-09-25 14:54:34.000000000 +1000 @@ -2996,7 +2996,18 @@ static void buffer_init_cpu(int cpu) bha->ratelimit = 0; memset(bhl, 0, sizeof(*bhl)); } - + +static void buffer_exit_cpu(int cpu) +{ + int i; + struct bh_lru *b = &per_cpu(bh_lrus, cpu); + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } +} + static int __devinit buffer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -3005,6 +3016,9 @@ static int __devinit buffer_cpu_notify(s case CPU_UP_PREPARE: buffer_init_cpu(cpu); break; + case CPU_OFFLINE: + buffer_exit_cpu(cpu); + break; default: break; } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/include/linux/cpu.h .17007-linux-2.6.0-test5-bk11.updated/include/linux/cpu.h --- .17007-linux-2.6.0-test5-bk11/include/linux/cpu.h 2003-09-22 10:09:13.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/include/linux/cpu.h 2003-09-25 14:54:34.000000000 +1000 @@ -37,6 +37,7 @@ extern int register_cpu_notifier(struct extern void unregister_cpu_notifier(struct notifier_block *nb); int cpu_up(unsigned int cpu); +int 
cpu_down(unsigned int cpu); #else static inline int register_cpu_notifier(struct notifier_block *nb) { diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/include/linux/mmzone.h .17007-linux-2.6.0-test5-bk11.updated/include/linux/mmzone.h --- .17007-linux-2.6.0-test5-bk11/include/linux/mmzone.h 2003-09-22 10:28:12.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/include/linux/mmzone.h 2003-09-25 14:54:34.000000000 +1000 @@ -209,6 +209,7 @@ typedef struct pglist_data { int node_id; struct pglist_data *pgdat_next; wait_queue_head_t kswapd_wait; + struct task_struct *kswapd; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/include/linux/notifier.h .17007-linux-2.6.0-test5-bk11.updated/include/linux/notifier.h --- .17007-linux-2.6.0-test5-bk11/include/linux/notifier.h 2003-09-21 17:31:11.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/include/linux/notifier.h 2003-09-25 14:54:34.000000000 +1000 @@ -60,11 +60,14 @@ extern int notifier_call_chain(struct no #define NETLINK_URELEASE 0x0001 /* Unicast netlink socket released */ -#define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */ -#define CPU_UP_PREPARE 0x0003 /* CPU (unsigned)v coming up */ -#define CPU_UP_CANCELED 0x0004 /* CPU (unsigned)v NOT coming up */ -#define CPU_OFFLINE 0x0005 /* CPU (unsigned)v offline (still scheduling) */ -#define CPU_DEAD 0x0006 /* CPU (unsigned)v dead */ +#define CPU_ONLINE 0x0002 /* CPU v is up */ +#define CPU_UP_PREPARE 0x0003 /* CPU v coming up (can fail) */ +#define CPU_UP_CANCELED 0x0004 /* CPU v NOT coming up */ +#define CPU_DOWN_PREPARE 0x0005 /* CPU v going down (can fail) */ +#define CPU_DOWN_CANCELED 0x0006 /* CPU v NOT going down */ +#define CPU_OFFLINE 0x0007 /* CPU v offline + (still scheduling) */ +#define CPU_DEAD 0x0008 /* CPU v dead */ #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/include/linux/sched.h .17007-linux-2.6.0-test5-bk11.updated/include/linux/sched.h --- .17007-linux-2.6.0-test5-bk11/include/linux/sched.h 2003-09-25 09:56:38.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/include/linux/sched.h 2003-09-25 16:39:50.000000000 +1000 @@ -518,11 +518,14 @@ extern void node_nr_running_init(void); #define node_nr_running_init() {} #endif +/* Move tasks off this (offline) CPU onto another. */ +extern void migrate_all_tasks(void); extern void set_user_nice(task_t *p, long nice); extern int task_prio(task_t *p); extern int task_nice(task_t *p); extern int task_curr(task_t *p); extern int idle_cpu(int cpu); +extern void wake_idle_cpu(unsigned int cpu); void yield(void); @@ -875,7 +878,6 @@ static inline void set_task_cpu(struct t } #endif /* CONFIG_SMP */ - #endif /* __KERNEL__ */ #endif diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/kernel/cpu.c .17007-linux-2.6.0-test5-bk11.updated/kernel/cpu.c --- .17007-linux-2.6.0-test5-bk11/kernel/cpu.c 2003-09-22 10:09:14.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/kernel/cpu.c 2003-09-25 14:54:34.000000000 +1000 @@ -1,14 +1,17 @@ /* CPU control. - * (C) 2001 Rusty Russell + * (C) 2001, 2002, 2003 Rusty Russell + * * This code is licenced under the GPL. 
*/ #include #include #include -#include #include #include +#include /* for hotplug_path */ #include +#include +#include #include /* This protects CPUs going up and down... */ @@ -19,15 +22,159 @@ static struct notifier_block *cpu_chain /* Need to know about CPUs going up/down? */ int register_cpu_notifier(struct notifier_block *nb) { - return notifier_chain_register(&cpu_chain, nb); + int ret; + + if ((ret = down_interruptible(&cpucontrol)) != 0) + return ret; + ret = notifier_chain_register(&cpu_chain, nb); + up(&cpucontrol); + return ret; } void unregister_cpu_notifier(struct notifier_block *nb) { - notifier_chain_unregister(&cpu_chain,nb); + down(&cpucontrol); + notifier_chain_unregister(&cpu_chain, nb); + up(&cpucontrol); } -int __devinit cpu_up(unsigned int cpu) +#ifdef CONFIG_HOTPLUG_CPU +static inline void check_for_tasks(int cpu) +{ + struct task_struct *p; + + write_lock_irq(&tasklist_lock); + for_each_process(p) { + int dying = p->state & (TASK_ZOMBIE | TASK_DEAD); + if (p->thread_info->cpu == cpu && !dying) + printk(KERN_WARNING "Task %s is on cpu %d, " + "not dying\n", p->comm, cpu); + } + write_unlock_irq(&tasklist_lock); +} + +/* Notify userspace when a cpu event occurs, by running '/sbin/hotplug + * cpu' with certain environment variables set. */ +static int cpu_run_sbin_hotplug(unsigned int cpu, const char *action) +{ + char *argv[3], *envp[5], cpu_str[12], action_str[32]; + int i; + + sprintf(cpu_str, "CPU=%d", cpu); + sprintf(action_str, "ACTION=%s", action); + /* FIXME: Add DEVPATH. --RR */ + + i = 0; + argv[i++] = hotplug_path; + argv[i++] = "cpu"; + argv[i] = NULL; + + i = 0; + /* minimal command environment */ + envp [i++] = "HOME=/"; + envp [i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp [i++] = cpu_str; + envp [i++] = action_str; + envp [i] = NULL; + + return call_usermodehelper(argv[0], argv, envp, 0); +} + + +static inline int cpu_down_check(unsigned int cpu) +{ + if (!cpu_online(cpu)) + return -EINVAL; + + if (num_online_cpus() == 1) + return -EBUSY; + + return 0; +} + +static inline int cpu_disable(int cpu) +{ + int ret; + + ret = __cpu_disable(); + if (ret < 0) + return ret; + + /* Everyone looking at cpu_online() should be doing so with + * preemption disabled. */ + synchronize_kernel(); + BUG_ON(cpu_online(cpu)); + return 0; +} + +int cpu_down(unsigned int cpu) +{ + int err, rc; + void *vcpu = (void *)(long)cpu; + cpumask_t mask; + + if ((err = down_interruptible(&cpucontrol)) != 0) + return err; + + if ((err = cpu_down_check(cpu)) != 0) + goto out; + + rc = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, vcpu); + if (rc == NOTIFY_BAD) { + /* FIXME: It'd be nice to only call those who saw + * CPU_DOWN_PREPARE. --RR */ + err = -EBUSY; + goto notify_out; + } + + /* Schedule ourselves on the dying CPU. */ + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + + if ((err = cpu_disable(cpu)) != 0) + goto notify_out; + + cpu_run_sbin_hotplug(cpu, "offline"); + + /* Move other tasks off to other CPUs (simple since they are + not running now). */ + migrate_all_tasks(); + + /* Move off dying CPU, which will revert to idle process. */ + cpus_clear(mask); + cpus_complement(mask); + cpu_clear(cpu, mask); + set_cpus_allowed(current, mask); + + /* Tell kernel threads to go away: they can't fail here. */ + rc = notifier_call_chain(&cpu_chain, CPU_OFFLINE, vcpu); + BUG_ON(rc == NOTIFY_BAD); + + check_for_tasks(cpu); + + /* This actually kills the CPU. 
*/ + __cpu_die(cpu); + + notify_out: + if (err < 0) + rc = notifier_call_chain(&cpu_chain, CPU_DOWN_CANCELED, vcpu); + else { + /* CPU is completely dead: tell everyone. Too late to + * complain. */ + rc = notifier_call_chain(&cpu_chain, CPU_DEAD, vcpu); + } + BUG_ON(rc == NOTIFY_BAD); +out: + up(&cpucontrol); + return err; +} +#else +static inline int cpu_run_sbin_hotplug(unsigned int cpu, const char *action) +{ + return 0; +} +#endif /*CONFIG_HOTPLUG_CPU*/ + +int cpu_up(unsigned int cpu) { int ret; void *hcpu = (void *)(long)cpu; @@ -41,7 +188,7 @@ int __devinit cpu_up(unsigned int cpu) } ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); if (ret == NOTIFY_BAD) { - printk("%s: attempt to bring up CPU %u failed\n", + printk(KERN_DEBUG "%s: attempt to bring up CPU %u failed\n", __FUNCTION__, cpu); ret = -EINVAL; goto out_notify; diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/kernel/kthread.c .17007-linux-2.6.0-test5-bk11.updated/kernel/kthread.c --- .17007-linux-2.6.0-test5-bk11/kernel/kthread.c 2003-09-25 14:54:33.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/kernel/kthread.c 2003-09-25 17:59:18.000000000 +1000 @@ -10,6 +10,43 @@ #include #include +/* All thread comms is command -> ack, so we keep it simple. */ +struct kt_message +{ + struct task_struct *from, *to; + void *info; +}; + +static spinlock_t ktm_lock = SPIN_LOCK_UNLOCKED; +static struct kt_message ktm; + +static void ktm_send(struct task_struct *to, void *info) +{ + spin_lock(&ktm_lock); + ktm.to = to; + ktm.from = current; + ktm.info = info; + wake_up_process(ktm.to); + spin_unlock(&ktm_lock); +} + +static struct kt_message ktm_receive(void) +{ + struct kt_message m; + + for (;;) { + spin_lock(&ktm_lock); + if (ktm.to == current) + break; + current->state = TASK_INTERRUPTIBLE; + spin_unlock(&ktm_lock); + schedule(); + } + m = ktm; + spin_unlock(&ktm_lock); + return m; +} + struct kthread { int (*initfn)(void *data); @@ -18,57 +55,50 @@ struct kthread char *name; }; -/* We serialize kthread operations, so they all share these */ -static DECLARE_MUTEX(kthread_lock); -static struct task_struct *kthread_result; -static struct completion kthread_done; -static struct task_struct *kthread_stop; - static int kthread(void *data) { /* Copy data: it's on keventd_init's stack */ struct kthread k = *(struct kthread *)data; - int ret; + struct kt_message m; + int ret = 0; - /* Created by __kthread_create */ + /* Send to spawn_kthread, so it knows who we are. */ daemonize("%s", k.name); - kthread_result = current; - complete(&kthread_done); - schedule(); + ktm_send(current->real_parent, current); - /* Woken by kthread_destroy? */ - if (kthread_stop == current) + /* Receive from kthread_start or kthread_destroy */ + m = ktm_receive(); + if (!m.info) goto stop; - - /* Woken by kthread_init. */ - ret = k.initfn ? k.initfn(k.data) : 0; - if (ret < 0) { - kthread_result = ERR_PTR(ret); + if (k.initfn && (ret = k.initfn(k.data)) < 0) goto stop; - } - kthread_result = current; - complete(&kthread_done); + ktm_send(m.from, current); - while (kthread_stop != current) { - /* Must read kthread_stop before we schedule */ - smp_mb(); - schedule(); - /* Woken by random process or kthread_destroy */ + for (;;) { + /* If it fails, just wait until kthread_destroy. 
*/ + if (k.corefn && (ret = k.corefn(k.data)) < 0) + k.corefn = NULL; - /* If keventd would reap children, we could just exit, - * and do a sys_waitpid() in kthread_destroy, rather - * than hanging around. */ - if (kthread_stop != current && k.corefn) { - ret = k.corefn(k.data); - if (ret < 0) - k.corefn = NULL; + /* Check if we're being told to stop. */ + spin_lock(&ktm_lock); + if (ktm.to == current && ktm.info == NULL) { + m = ktm; + spin_unlock(&ktm_lock); + goto stop; } + current->state == TASK_INTERRUPTIBLE; + spin_unlock(&ktm_lock); + schedule(); } + stop: - complete(&kthread_done); + ktm_send(m.from, NULL); return 0; } +/* Serialize all kthread control stuff, for simplicity. */ +static DECLARE_MUTEX(kthread_control); + struct kthread_create { struct task_struct *result; @@ -82,16 +112,13 @@ static void spawn_kthread(void *data) struct kthread_create *kc = data; int ret; - down(&kthread_lock); - init_completion(&kthread_done); ret = kernel_thread(kthread, &kc->k, CLONE_KERNEL); if (ret < 0) kc->result = ERR_PTR(ret); else { - wait_for_completion(&kthread_done); - kc->result = kthread_result; + struct kt_message m = ktm_receive(); + kc->result = m.info; } - up(&kthread_lock); complete(&kc->done); } @@ -117,36 +144,33 @@ struct task_struct *kthread_create(int ( kc.k.data = data; kc.k.name = name; + down(&kthread_control); schedule_work(&work); wait_for_completion(&kc.done); + up(&kthread_control); return kc.result; } struct task_struct *kthread_start(struct task_struct *k) { - down(&kthread_lock); - init_completion(&kthread_done); - wake_up_process(k); - wait_for_completion(&kthread_done); + struct kt_message m; - k = kthread_result; - up(&kthread_lock); + down(&kthread_control); + ktm_send(k, k); + m = ktm_receive(); + up(&kthread_control); - return k; + return m.info; } int kthread_destroy(struct task_struct *k) { - int ret; - - down(&kthread_lock); - init_completion(&kthread_done); - kthread_stop = k; + struct kt_message m; - wake_up_process(k); - wait_for_completion(&kthread_done); - ret = PTR_ERR(kthread_result); - up(&kthread_lock); + down(&kthread_control); + ktm_send(k, NULL); + m = ktm_receive(); + up(&kthread_control); - return ret; + return PTR_ERR(m.info); } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/kernel/rcupdate.c .17007-linux-2.6.0-test5-bk11.updated/kernel/rcupdate.c --- .17007-linux-2.6.0-test5-bk11/kernel/rcupdate.c 2003-09-22 10:27:38.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/kernel/rcupdate.c 2003-09-25 14:54:34.000000000 +1000 @@ -154,6 +154,78 @@ out_unlock: } +#ifdef CONFIG_HOTPLUG_CPU +/* slack queue used for offloading callbacks e.g. in the case of a cpu + * going offline */ +static struct rcu_global_queue_s { + spinlock_t lock; + struct list_head list; +} rcu_global_queue = { + .lock = SPIN_LOCK_UNLOCKED, + .list = LIST_HEAD_INIT(rcu_global_queue.list), +}; + +/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing + * locking requirements, the list it's pulling from has to belong to a cpu + * which is dead and hence not processing interrupts. 
+ */ +static void rcu_move_batch(struct list_head *list) +{ + struct list_head *entry; + unsigned long flags; + + spin_lock_irqsave(&rcu_global_queue.lock, flags); + while (!list_empty(list)) { + entry = list->next; + list_del(entry); + list_add_tail(entry, &rcu_global_queue.list); + } + spin_unlock_irqrestore(&rcu_global_queue.lock, flags); +} + +static void rcu_offline_cpu(int cpu) +{ + /* if the cpu going offline owns the grace period + * we can block indefinitely waiting for it, so flush + * it here + */ + spin_lock_irq(&rcu_ctrlblk.mutex); + if (RCU_batch(cpu) == rcu_ctrlblk.curbatch) { + rcu_ctrlblk.curbatch++; + rcu_start_batch(rcu_ctrlblk.maxbatch); + } + spin_unlock_irq(&rcu_ctrlblk.mutex); + + rcu_move_batch(&RCU_curlist(cpu)); + rcu_move_batch(&RCU_nxtlist(cpu)); + + BUG_ON(!list_empty(&RCU_curlist(cpu))); + BUG_ON(!list_empty(&RCU_nxtlist(cpu))); + + tasklet_kill(&RCU_tasklet(cpu)); + list_del_init(&RCU_curlist(cpu)); + list_del_init(&RCU_nxtlist(cpu)); + memset(&per_cpu(rcu_data, cpu), 0, sizeof(struct rcu_data)); +} + +static inline void pull_from_global_queue(int cpu) +{ + /* Pick up any pending global callbacks. This is rarely used + * so lock contention is fine. Each cpu picks one callback and it's + * ok if we miss one since someone else can pick it up */ + if (unlikely(!list_empty(&rcu_global_queue.list))) { + spin_lock(&rcu_global_queue.lock); + if (!list_empty(&rcu_global_queue.list)) + list_move_tail(&rcu_global_queue.list, &RCU_nxtlist(cpu)); + spin_unlock(&rcu_global_queue.lock); + } +} +#else /* ... !CONFIG_HOTPLUG_CPU */ +static inline void pull_from_global_queue(int cpu) +{ +} +#endif /* CONFIG_HOTPLUG_CPU */ + /* * This does the RCU processing work from tasklet context. */ @@ -169,6 +241,9 @@ static void rcu_process_callbacks(unsign } local_irq_disable(); + + pull_from_global_queue(cpu); + if (!list_empty(&RCU_nxtlist(cpu)) && list_empty(&RCU_curlist(cpu))) { list_splice(&RCU_nxtlist(cpu), &RCU_curlist(cpu)); INIT_LIST_HEAD(&RCU_nxtlist(cpu)); @@ -214,7 +289,11 @@ static int __devinit rcu_cpu_notify(stru case CPU_UP_PREPARE: rcu_online_cpu(cpu); break; - /* Space reserved for CPU_OFFLINE :) */ +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + rcu_offline_cpu(cpu); + break; +#endif default: break; } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/kernel/sched.c .17007-linux-2.6.0-test5-bk11.updated/kernel/sched.c --- .17007-linux-2.6.0-test5-bk11/kernel/sched.c 2003-09-25 09:56:39.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/kernel/sched.c 2003-09-25 17:56:49.000000000 +1000 @@ -35,6 +35,8 @@ #include #include #include +#include +#include #ifdef CONFIG_NUMA #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) @@ -568,6 +570,22 @@ repeat: } #endif + +#ifdef CONFIG_HOTPLUG_CPU +/* Wake up a CPU from idle */ +void wake_idle_cpu(unsigned int cpu) +{ + resched_task(cpu_rq(cpu)->idle); +} + +static inline int cpu_is_offline(unsigned int cpu) +{ + return !cpu_online(cpu); +} +#else /* ... 
!CONFIG_HOTPLUG_CPU */ +#define cpu_is_offline(cpu) 0 +#endif /* CONFIG_HOTPLUG_CPU */ + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -601,7 +619,8 @@ repeat_lock_task: */ if (unlikely(sync && !task_running(rq, p) && (task_cpu(p) != smp_processor_id()) && - cpu_isset(smp_processor_id(), p->cpus_allowed))) { + cpu_isset(smp_processor_id(), p->cpus_allowed) + && !cpu_is_offline(smp_processor_id()))) { set_task_cpu(p, smp_processor_id()); task_rq_unlock(rq, &flags); @@ -815,9 +834,11 @@ unsigned long nr_running(void) { unsigned long i, sum = 0; - for (i = 0; i < NR_CPUS; i++) + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; sum += cpu_rq(i)->nr_running; - + } return sum; } @@ -838,7 +859,7 @@ unsigned long nr_context_switches(void) unsigned long i, sum = 0; for (i = 0; i < NR_CPUS; i++) { - if (!cpu_online(i)) + if (!cpu_possible(i)) continue; sum += cpu_rq(i)->nr_switches; } @@ -1163,6 +1184,10 @@ static void load_balance(runqueue_t *thi struct list_head *head, *curr; task_t *tmp; + /* CPU going down is a special case: we don't pull more tasks onboard */ + if (unlikely(cpu_is_offline(this_cpu))) + goto out; + busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask); if (!busiest) goto out; @@ -2598,62 +2623,129 @@ static void move_task_away(struct task_s local_irq_restore(flags); } -typedef struct { - int cpu; - struct completion startup_done; - task_t *task; -} migration_startup_t; - /* * migration_thread - this is a highprio system thread that performs * thread migration by bumping thread off CPU then 'pushing' onto * another runqueue. */ -static int migration_thread(void * data) +static int migration_kthread_init(void *data) { /* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */ struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 }; - migration_startup_t *startup = data; - int cpu = startup->cpu; - runqueue_t *rq; - int ret; - - startup->task = current; - complete(&startup->startup_done); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); + unsigned int cpu = (long)data; BUG_ON(smp_processor_id() != cpu); - daemonize("migration/%d", cpu); set_fs(KERNEL_DS); - ret = setscheduler(0, SCHED_FIFO, ¶m); + setscheduler(0, SCHED_FIFO, ¶m); + return 0; +} - rq = this_rq(); - rq->migration_thread = current; +static int migration_kthread(void *data) +{ + runqueue_t *rq; + struct list_head *head; + migration_req_t *req; - for (;;) { - struct list_head *head; - migration_req_t *req; + rq = this_rq(); - spin_lock_irq(&rq->lock); - head = &rq->migration_queue; - current->state = TASK_INTERRUPTIBLE; - if (list_empty(head)) { - spin_unlock_irq(&rq->lock); - schedule(); - continue; - } + spin_lock_irq(&rq->lock); + head = &rq->migration_queue; + current->state = TASK_INTERRUPTIBLE; + while (!list_empty(head)) { req = list_entry(head->next, migration_req_t, list); list_del_init(head->next); - spin_unlock_irq(&rq->lock); + spin_unlock_irq(&rq->lock); move_task_away(req->task, any_online_cpu(req->task->cpus_allowed)); complete(&req->done); + spin_lock_irq(&rq->lock); } + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +/* migrate_all_tasks - function to migrate all the tasks from the + * current cpu caller must have already scheduled this to the target + * cpu via set_cpus_allowed */ +void migrate_all_tasks(void) +{ + struct task_struct *tsk, *t; + int dest_cpu, src_cpu, signalme; + unsigned int node; + struct siginfo sig; + + /* We're nailed to this CPU. 
*/ + src_cpu = smp_processor_id(); + + sig.si_signo = SIGPWR; + sig.si_errno = 0; + sig.si_code = SI_KERNEL; + sig.si_cpu = src_cpu; + +again: + /* lock out everyone else intentionally */ + write_lock_irq(&tasklist_lock); + + /* watch out for per node tasks, let's stay on this node */ + node = cpu_to_node(src_cpu); + + do_each_thread(t, tsk) { + cpumask_t mask; + if (tsk == current) + continue; + + if (task_cpu(tsk) != src_cpu) + continue; + + /* Figure out where this task should go (attempting to + * keep it on-node), and check if it can be migrated + * as-is. NOTE that kernel threads bound to more than + * one online cpu will be migrated. */ + mask = node_to_cpumask(node); + cpus_and(mask, mask, tsk->cpus_allowed); + dest_cpu = any_online_cpu(mask); + if (dest_cpu == NR_CPUS) + dest_cpu = any_online_cpu(tsk->cpus_allowed); + if (dest_cpu == NR_CPUS) { + /* Kernel threads which are bound to specific + * processors need to look after themselves + * with their own callbacks. + */ + if (tsk->mm == NULL) + continue; + + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + tsk->pid, tsk->comm, src_cpu); + cpus_clear(tsk->cpus_allowed); + cpus_complement(tsk->cpus_allowed); + dest_cpu = any_online_cpu(tsk->cpus_allowed); + signalme = 1; + } else + signalme = 0; + + get_task_struct(tsk); + move_task_away(tsk, dest_cpu); + if (signalme) + goto do_signal; + put_task_struct(tsk); + } while_each_thread(t, tsk); + + write_unlock_irq(&tasklist_lock); + return; + +do_signal: + /* Need to drop tasklist_lock to send signal, then restart. */ + write_unlock_irq(&tasklist_lock); + send_sig_info(sig.si_signo, &sig, tsk); + put_task_struct(tsk); + goto again; + } +#endif /* CONFIG_HOTPLUG_CPU */ /* * migration_call - callback that gets triggered when a CPU is added. @@ -2663,42 +2755,66 @@ static int migration_call(struct notifie unsigned long action, void *hcpu) { - long cpu = (long) hcpu; - migration_startup_t startup; + unsigned int cpu = (long)hcpu; + struct task_struct *p; switch (action) { - case CPU_ONLINE: - - printk("Starting migration thread for cpu %li\n", cpu); - - startup.cpu = cpu; - startup.task = NULL; - init_completion(&startup.startup_done); - - kernel_thread(migration_thread, &startup, CLONE_KERNEL); - wait_for_completion(&startup.startup_done); - wait_task_inactive(startup.task); + case CPU_UP_PREPARE: + p = kthread_create(migration_kthread_init, migration_kthread, + hcpu, "migration/%d", cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + /* Manually bind to CPU: thread stopped, so this is OK. */ + p->thread_info->cpu = cpu; + p->cpus_allowed = cpumask_of_cpu(cpu); + cpu_rq(cpu)->migration_thread = p; + break; - startup.task->thread_info->cpu = cpu; - startup.task->cpus_allowed = cpumask_of_cpu(cpu); + case CPU_UP_CANCELED: + /* Bind back to this cpu so it can run. 
*/ + p = cpu_rq(cpu)->migration_thread; + cpu_rq(cpu)->migration_thread = NULL; + p->thread_info->cpu = smp_processor_id(); + p->cpus_allowed = cpumask_of_cpu(smp_processor_id()); + kthread_destroy(p); + break; - wake_up_process(startup.task); + case CPU_ONLINE: + kthread_start(cpu_rq(cpu)->migration_thread); + break; - while (!cpu_rq(cpu)->migration_thread) - yield(); +#ifdef CONFIG_HOTPLUG_CPU + case CPU_OFFLINE: + kthread_destroy(cpu_rq(cpu)->migration_thread); + break; + case CPU_DEAD: { + runqueue_t *rq = this_rq_lock(); + runqueue_t *old_rq = cpu_rq(cpu); + rq->nr_uninterruptible += old_rq->nr_uninterruptible; + old_rq->nr_uninterruptible = 0; + BUG_ON(old_rq->nr_running != 0); + BUG_ON(atomic_read(&old_rq->nr_iowait) != 0); + rq_unlock(rq); break; } +#endif + } return NOTIFY_OK; } -static struct notifier_block migration_notifier = { &migration_call, NULL, 0 }; +/* Want this before the other threads, so they can use set_cpus_allowed. */ +static struct notifier_block __devinitdata migration_notifier = { + .notifier_call = migration_call, + .priority = -1, +}; __init int migration_init(void) { + void *cpu = (void *)(long)smp_processor_id(); /* Start one for boot CPU. */ - migration_call(&migration_notifier, CPU_ONLINE, - (void *)(long)smp_processor_id()); + migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); return 0; } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/kernel/softirq.c .17007-linux-2.6.0-test5-bk11.updated/kernel/softirq.c --- .17007-linux-2.6.0-test5-bk11/kernel/softirq.c 2003-09-25 09:56:39.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/kernel/softirq.c 2003-09-25 17:58:45.000000000 +1000 @@ -14,6 +14,7 @@ #include #include #include +#include /* - No shared variables, all the data are CPU local. 
@@ -319,63 +320,87 @@ void __init softirq_init(void) register_cpu_notifier(&tasklet_nb); } -static int ksoftirqd(void * __bind_cpu) +static int ksoftirqd_init(void *__bind_cpu) { - int cpu = (int) (long) __bind_cpu; + unsigned int cpu = (long) __bind_cpu; - daemonize("ksoftirqd/%d", cpu); + BUG_ON(smp_processor_id() != cpu); set_user_nice(current, 19); current->flags |= PF_IOTHREAD; + return 0; +} - /* Migrate to the right CPU */ - set_cpus_allowed(current, cpumask_of_cpu(cpu)); - BUG_ON(smp_processor_id() != cpu); - - __set_current_state(TASK_INTERRUPTIBLE); - mb(); - - __get_cpu_var(ksoftirqd) = current; - - for (;;) { - if (!local_softirq_pending()) - schedule(); - - __set_current_state(TASK_RUNNING); - - while (local_softirq_pending()) { - do_softirq(); - cond_resched(); - } - - __set_current_state(TASK_INTERRUPTIBLE); +static int ksoftirqd(void *__bind_cpu) +{ +again: + set_current_state(TASK_INTERRUPTIBLE); + if (local_softirq_pending()) { + current->state = TASK_RUNNING; + do_softirq(); + cond_resched(); + goto again; } + return 0; } static int __devinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int hotcpu = (unsigned long)hcpu; - - if (action == CPU_ONLINE) { - if (kernel_thread(ksoftirqd, hcpu, CLONE_KERNEL) < 0) { - printk("ksoftirqd for %i failed\n", hotcpu); - return NOTIFY_BAD; + unsigned int hotcpu = (unsigned long)hcpu; + int ret = NOTIFY_OK; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + p = kthread_create(ksoftirqd_init, ksoftirqd, hcpu, + "ksoftirqd/%d", hotcpu); + if (IS_ERR(p)) { + printk("ksoftirqd for %u failed\n", hotcpu); + ret = NOTIFY_BAD; + break; } + per_cpu(ksoftirqd, hotcpu) = p; + break; + case CPU_ONLINE: + set_cpus_allowed(per_cpu(ksoftirqd, hotcpu), + cpumask_of_cpu(hotcpu)); + break; + case CPU_UP_CANCELED: + kthread_destroy(per_cpu(ksoftirqd, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_OFFLINE: + kthread_destroy(per_cpu(ksoftirqd, hotcpu)); + per_cpu(ksoftirqd, hotcpu) = NULL; + case CPU_DEAD: { + struct tasklet_struct *i, *next; - while (!per_cpu(ksoftirqd, hotcpu)) - yield(); - } - return NOTIFY_OK; + /* Pull pending softirqs from dead CPU to us. 
*/ + local_irq_disable(); + for (i = per_cpu(tasklet_vec, hotcpu).list; i; i = next) { + next = i->next; + __tasklet_schedule(i); + } + for (i = per_cpu(tasklet_hi_vec, hotcpu).list; i; i = next) { + next = i->next; + __tasklet_hi_schedule(i); + } + local_irq_enable(); + break; + } +#endif /* CONFIG_HOTPLUG_CPU */ + } + return ret; } -static struct notifier_block __devinitdata cpu_nfb = { - .notifier_call = cpu_callback -}; +static struct notifier_block __devinitdata cpu_nfb = { cpu_callback }; __init int spawn_ksoftirqd(void) { - cpu_callback(&cpu_nfb, CPU_ONLINE, (void *)(long)smp_processor_id()); + void *cpu = (void *)(long)smp_processor_id(); + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); return 0; } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/kernel/timer.c .17007-linux-2.6.0-test5-bk11.updated/kernel/timer.c --- .17007-linux-2.6.0-test5-bk11/kernel/timer.c 2003-09-22 10:28:13.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/kernel/timer.c 2003-09-25 14:54:34.000000000 +1000 @@ -1206,7 +1206,41 @@ static void __devinit init_timers_cpu(in base->timer_jiffies = jiffies; } + +#ifdef CONFIG_HOTPLUG_CPU +static void __devinit migrate_timers(int cpu) +{ + unsigned long flags; + tvec_base_t *base; + struct list_head *head; + struct timer_list *timer; + int index; + + BUG_ON(cpu_online(cpu)); + base = &per_cpu(tvec_bases, cpu); + spin_lock_irqsave(&base->lock, flags); + index = base->timer_jiffies & TVR_MASK; + + if (!index && + (!cascade(base, &base->tv2, INDEX(0))) && + (!cascade(base, &base->tv3, INDEX(1))) && + !cascade(base, &base->tv4, INDEX(2))) + cascade(base, &base->tv5, INDEX(3)); + +repeat: + head = base->tv1.vec + index; + if (!list_empty(head)) { + timer = list_entry(head->next, struct timer_list, entry); + spin_unlock_irqrestore(&base->lock, flags); + mod_timer(timer, timer->expires + 1); + spin_lock_irqsave(&base->lock, flags); + goto repeat; + } + spin_unlock_irqrestore(&base->lock, flags); +} +#endif /* CONFIG_HOTPLUG_CPU */ + static int __devinit timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1215,6 +1249,11 @@ static int __devinit timer_cpu_notify(st case CPU_UP_PREPARE: init_timers_cpu(cpu); break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + migrate_timers(cpu); + break; +#endif default: break; } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/kernel/workqueue.c .17007-linux-2.6.0-test5-bk11.updated/kernel/workqueue.c --- .17007-linux-2.6.0-test5-bk11/kernel/workqueue.c 2003-09-22 10:27:38.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/kernel/workqueue.c 2003-09-25 17:58:37.000000000 +1000 @@ -25,6 +25,10 @@ #include #include #include +#include +#include +#include +#include /* * The per-CPU workqueue. @@ -43,13 +47,10 @@ struct cpu_workqueue_struct { long insert_sequence; /* Next to add */ struct list_head worklist; - wait_queue_head_t more_work; + struct task_struct *worker; wait_queue_head_t work_done; struct workqueue_struct *wq; - task_t *thread; - struct completion exit; - } ____cacheline_aligned; /* @@ -58,8 +59,13 @@ struct cpu_workqueue_struct { */ struct workqueue_struct { struct cpu_workqueue_struct cpu_wq[NR_CPUS]; + struct list_head list; + const char *name; }; +/* All the workqueues on the system: protected by cpucontrol mutex. 
*/ +static LIST_HEAD(workqueues); + /* * Queue work on a workqueue. Return non-zero if it was successfully * added. @@ -80,7 +86,7 @@ int queue_work(struct workqueue_struct * spin_lock_irqsave(&cwq->lock, flags); list_add_tail(&work->entry, &cwq->worklist); cwq->insert_sequence++; - wake_up(&cwq->more_work); + wake_up_process(cwq->worker); spin_unlock_irqrestore(&cwq->lock, flags); ret = 1; } @@ -101,7 +107,7 @@ static void delayed_work_timer_fn(unsign spin_lock_irqsave(&cwq->lock, flags); list_add_tail(&work->entry, &cwq->worklist); cwq->insert_sequence++; - wake_up(&cwq->more_work); + wake_up_process(cwq->worker); spin_unlock_irqrestore(&cwq->lock, flags); } @@ -151,67 +157,51 @@ static inline void run_workqueue(struct spin_lock_irqsave(&cwq->lock, flags); cwq->remove_sequence++; - wake_up(&cwq->work_done); + wake_up_process(cwq->worker); } spin_unlock_irqrestore(&cwq->lock, flags); } -typedef struct startup_s { - struct cpu_workqueue_struct *cwq; - struct completion done; - const char *name; -} startup_t; - -static int worker_thread(void *__startup) +static int worker_thread_init(void *__cwq) { - startup_t *startup = __startup; - struct cpu_workqueue_struct *cwq = startup->cwq; - int cpu = cwq - cwq->wq->cpu_wq; - DECLARE_WAITQUEUE(wait, current); struct k_sigaction sa; + struct cpu_workqueue_struct *cwq = __cwq; + int cpu = cwq - cwq->wq->cpu_wq; + + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + BUG_ON(smp_processor_id() != cpu); - daemonize("%s/%d", startup->name, cpu); allow_signal(SIGCHLD); current->flags |= PF_IOTHREAD; - cwq->thread = current; - set_user_nice(current, -10); - set_cpus_allowed(current, cpumask_of_cpu(cpu)); - - complete(&startup->done); /* Install a handler so SIGCLD is delivered */ sa.sa.sa_handler = SIG_IGN; sa.sa.sa_flags = 0; siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); + return 0; +} - for (;;) { - set_task_state(current, TASK_INTERRUPTIBLE); - - add_wait_queue(&cwq->more_work, &wait); - if (!cwq->thread) - break; - if (list_empty(&cwq->worklist)) - schedule(); - else - set_task_state(current, TASK_RUNNING); - remove_wait_queue(&cwq->more_work, &wait); - - if (!list_empty(&cwq->worklist)) - run_workqueue(cwq); +static int worker_thread(void *__cwq) +{ + struct cpu_workqueue_struct *cwq = __cwq; - if (signal_pending(current)) { - while (waitpid(-1, NULL, __WALL|WNOHANG) > 0) - /* SIGCHLD - auto-reaping */ ; +again: + set_current_state(TASK_INTERRUPTIBLE); + if (signal_pending(current)) { + while (waitpid(-1, NULL, __WALL|WNOHANG) > 0) + /* SIGCHLD - auto-reaping */ ; - /* zap all other signals */ - flush_signals(current); - } + /* zap all other signals */ + flush_signals(current); } - remove_wait_queue(&cwq->more_work, &wait); - complete(&cwq->exit); + if (!list_empty(&cwq->worklist)) { + current->state = TASK_RUNNING; + run_workqueue(cwq); + goto again; + } return 0; } @@ -236,6 +226,7 @@ void flush_workqueue(struct workqueue_st might_sleep(); + down(&cpucontrol); for (cpu = 0; cpu < NR_CPUS; cpu++) { DEFINE_WAIT(wait); long sequence_needed; @@ -257,41 +248,34 @@ void flush_workqueue(struct workqueue_st finish_wait(&cwq->work_done, &wait); spin_unlock_irq(&cwq->lock); } + up(&cpucontrol); } -static int create_workqueue_thread(struct workqueue_struct *wq, - const char *name, - int cpu) +static struct task_struct * +create_workqueue_thread(struct workqueue_struct *wq, int cpu) { - startup_t startup; struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; - int ret; + struct task_struct *p; 
spin_lock_init(&cwq->lock); cwq->wq = wq; - cwq->thread = NULL; + cwq->worker = NULL; cwq->insert_sequence = 0; cwq->remove_sequence = 0; INIT_LIST_HEAD(&cwq->worklist); - init_waitqueue_head(&cwq->more_work); init_waitqueue_head(&cwq->work_done); - init_completion(&cwq->exit); - - init_completion(&startup.done); - startup.cwq = cwq; - startup.name = name; - ret = kernel_thread(worker_thread, &startup, CLONE_FS | CLONE_FILES); - if (ret >= 0) { - wait_for_completion(&startup.done); - BUG_ON(!cwq->thread); - } - return ret; + p = kthread_create(worker_thread_init, worker_thread, cwq, + "%s/%d", wq->name, cpu); + if (!IS_ERR(p)) + cwq->worker = p; + return p; } struct workqueue_struct *create_workqueue(const char *name) { int cpu, destroy = 0; struct workqueue_struct *wq; + struct task_struct *p; BUG_ON(strlen(name) > 10); @@ -299,12 +283,21 @@ struct workqueue_struct *create_workqueu if (!wq) return NULL; + /* We don't need the distraction of CPUs appearing and vanishing. */ + down(&cpucontrol); + wq->name = name; for (cpu = 0; cpu < NR_CPUS; cpu++) { if (!cpu_online(cpu)) continue; - if (create_workqueue_thread(wq, name, cpu) < 0) + p = create_workqueue_thread(wq, cpu); + if (IS_ERR(p)) destroy = 1; + else + kthread_start(p); } + + list_add(&wq->list, &workqueues); + /* * Was there any error during startup? If yes then clean up: */ @@ -312,6 +305,7 @@ struct workqueue_struct *create_workqueu destroy_workqueue(wq); wq = NULL; } + up(&cpucontrol); return wq; } @@ -320,13 +314,8 @@ static void cleanup_workqueue_thread(str struct cpu_workqueue_struct *cwq; cwq = wq->cpu_wq + cpu; - if (cwq->thread) { - /* Tell thread to exit and wait for it. */ - cwq->thread = NULL; - wake_up(&cwq->more_work); - - wait_for_completion(&cwq->exit); - } + if (cwq->worker) + kthread_destroy(cwq->worker); } void destroy_workqueue(struct workqueue_struct *wq) @@ -335,10 +324,14 @@ void destroy_workqueue(struct workqueue_ flush_workqueue(wq); + /* We don't need the distraction of CPUs appearing and vanishing. */ + down(&cpucontrol); for (cpu = 0; cpu < NR_CPUS; cpu++) { if (cpu_online(cpu)) cleanup_workqueue_thread(wq, cpu); } + up(&cpucontrol); + list_del(&wq->list); kfree(wq); } @@ -370,14 +363,58 @@ int current_is_keventd(void) if (!cpu_online(cpu)) continue; cwq = keventd_wq->cpu_wq + cpu; - if (current == cwq->thread) + if (current == cwq->worker) return 1; } return 0; } +#ifdef CONFIG_HOTPLUG_CPU +/* We're holding the cpucontrol mutex here */ +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int hotcpu = (unsigned long)hcpu; + struct workqueue_struct *wq; + + switch (action) { + case CPU_UP_PREPARE: + /* Create a new workqueue thread for it. */ + list_for_each_entry(wq, &workqueues, list) { + if (create_workqueue_thread(wq, hotcpu) < 0) { + printk("workqueue for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + } + break; + + case CPU_ONLINE: + /* Start the thread we created for it. 
*/ + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; + kthread_start(cwq->worker); + } + break; + + case CPU_UP_CANCELED: + case CPU_OFFLINE: + list_for_each_entry(wq, &workqueues, list) + cleanup_workqueue_thread(wq, hotcpu); + return NOTIFY_OK; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpu_nfb = { &cpu_callback, NULL, 0 }; +#endif + void init_workqueues(void) { +#ifdef CONFIG_HOTPLUG_CPU + register_cpu_notifier(&cpu_nfb); +#endif keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/mm/slab.c .17007-linux-2.6.0-test5-bk11.updated/mm/slab.c --- .17007-linux-2.6.0-test5-bk11/mm/slab.c 2003-09-25 09:56:39.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/mm/slab.c 2003-09-25 16:49:58.000000000 +1000 @@ -519,9 +519,19 @@ enum { static DEFINE_PER_CPU(struct timer_list, reap_timers); static void reap_timer_fnc(unsigned long data); - +static void free_block (kmem_cache_t* cachep, void** objpp, int len); static void enable_cpucache (kmem_cache_t *cachep); +static inline void ** ac_entry(struct array_cache *ac) +{ + return (void**)(ac+1); +} + +static inline struct array_cache *ac_data(kmem_cache_t *cachep) +{ + return cachep->array[smp_processor_id()]; +} + /* Cal the num objs, wastage, and bytes left over for a given slab size. */ static void cache_estimate (unsigned long gfporder, size_t size, int flags, size_t *left_over, unsigned int *num) @@ -576,27 +586,34 @@ static void start_cpu_timer(int cpu) } } -/* - * Note: if someone calls kmem_cache_alloc() on the new - * cpu before the cpuup callback had a chance to allocate - * the head arrays, it will oops. - * Is CPU_ONLINE early enough? - */ +#ifdef CONFIG_HOTPLUG_CPU +static void stop_cpu_timer(int cpu) +{ + struct timer_list *rt = &per_cpu(reap_timers, cpu); + + if (rt->function) { + del_timer(rt); + synchronize_kernel(); + WARN_ON(timer_pending(rt)); + rt->function = NULL; + } +} +#endif + static int __devinit cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; - struct list_head *p; + kmem_cache_t* cachep; switch (action) { case CPU_UP_PREPARE: down(&cache_chain_sem); - list_for_each(p, &cache_chain) { + list_for_each_entry(cachep, &cache_chain, next) { int memsize; struct array_cache *nc; - kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next); memsize = sizeof(void*)*cachep->limit+sizeof(struct array_cache); nc = kmalloc(memsize, GFP_KERNEL); if (!nc) @@ -616,18 +633,28 @@ static int __devinit cpuup_callback(stru up(&cache_chain_sem); break; case CPU_ONLINE: - if (g_cpucache_up == FULL) - start_cpu_timer(cpu); + start_cpu_timer(cpu); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_OFFLINE: + stop_cpu_timer(cpu); break; +#endif /* CONFIG_HOTPLUG_CPU */ + case CPU_UP_CANCELED: + case CPU_DEAD: down(&cache_chain_sem); - - list_for_each(p, &cache_chain) { + list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; - kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next); + spin_lock_irq(&cachep->spinlock); + /* cpu is dead; no one can alloc from it. 
*/ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; + cachep->free_limit -= cachep->batchcount; + free_block(cachep, ac_entry(nc), nc->avail); + spin_unlock_irq(&cachep->spinlock); kfree(nc); } up(&cache_chain_sem); @@ -641,16 +668,6 @@ bad: static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; -static inline void ** ac_entry(struct array_cache *ac) -{ - return (void**)(ac+1); -} - -static inline struct array_cache *ac_data(kmem_cache_t *cachep) -{ - return cachep->array[smp_processor_id()]; -} - /* Initialisation. * Called after the gfp() functions have been enabled, and before smp_init(). */ @@ -1316,7 +1333,6 @@ static void smp_call_function_all_cpus(v preempt_enable(); } -static void free_block (kmem_cache_t* cachep, void** objpp, int len); static void drain_array_locked(kmem_cache_t* cachep, struct array_cache *ac, int force); @@ -1435,6 +1451,9 @@ int kmem_cache_destroy (kmem_cache_t * c return 1; } + /* no cpu_online check required here since we clear the percpu + * array on cpu offline and set this to NULL. + */ for (i = 0; i < NR_CPUS; i++) kfree(cachep->array[i]); diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/mm/swap.c .17007-linux-2.6.0-test5-bk11.updated/mm/swap.c --- .17007-linux-2.6.0-test5-bk11/mm/swap.c 2003-09-22 10:23:17.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/mm/swap.c 2003-09-25 14:54:34.000000000 +1000 @@ -24,6 +24,9 @@ #include #include /* for try_to_release_page() */ #include +#include +#include +#include /* How many pages do we try to swap or page in/out together? */ int page_cluster; @@ -372,7 +375,33 @@ void vm_acct_memory(long pages) preempt_enable(); } EXPORT_SYMBOL(vm_acct_memory); -#endif + +#ifdef CONFIG_HOTPLUG_CPU +/* Drop the CPU's cached committed space back into the central pool. */ +static int __devinit cpu_swap_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + long *committed; + + committed = &per_cpu(committed_space, (long)hcpu); + if (action == CPU_DEAD) { + atomic_add(*committed, &vm_committed_space); + *committed = 0; + } + return NOTIFY_OK; +} + +static struct notifier_block cpu_swap_notifier __devinitdata += { &cpu_swap_callback }; + +static int __init cpu_swap_setup(void) +{ + return register_cpu_notifier(&cpu_swap_notifier); +} +__initcall(cpu_swap_setup); +#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_SMP */ /* diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/mm/vmscan.c .17007-linux-2.6.0-test5-bk11.updated/mm/vmscan.c --- .17007-linux-2.6.0-test5-bk11/mm/vmscan.c 2003-09-22 10:28:13.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/mm/vmscan.c 2003-09-25 14:54:34.000000000 +1000 @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include #include @@ -1087,6 +1089,47 @@ int shrink_all_memory(int nr_pages) } #endif +#ifdef CONFIG_HOTPLUG_CPU +/* It's optimal to keep kswapds on the same CPUs as their memory, but + not required for correctness. So if the last cpu in a node goes + away, let them run anywhere, and as the first one comes back, + restore their cpu bindings. */ +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + pg_data_t *pgdat; + unsigned int hotcpu = (unsigned long)hcpu; + cpumask_t mask; + + if (action == CPU_OFFLINE) { + /* Make sure that kswapd never becomes unschedulable. 
*/ + for_each_pgdat(pgdat) { + mask = node_to_cpumask(pgdat->node_id); + if (any_online_cpu(mask) == NR_CPUS) { + cpus_complement(mask); + set_cpus_allowed(pgdat->kswapd, mask); + } + } + } + + if (action == CPU_ONLINE) { + for_each_pgdat(pgdat) { + mask = node_to_cpumask(pgdat->node_id); + cpu_clear(hotcpu, mask); + if (any_online_cpu(mask) == NR_CPUS) { + cpu_set(hotcpu, mask); + /* One of our CPUs came back: restore mask */ + set_cpus_allowed(pgdat->kswapd, mask); + } + } + } + return NOTIFY_OK; +} + +static struct notifier_block cpu_nfb = { &cpu_callback, NULL, 0 }; +#endif /* CONFIG_HOTPLUG_CPU */ + static int __init kswapd_init(void) { pg_data_t *pgdat; @@ -1094,6 +1137,9 @@ static int __init kswapd_init(void) for_each_pgdat(pgdat) kernel_thread(kswapd, pgdat, CLONE_KERNEL); total_memory = nr_free_pagecache_pages(); +#ifdef CONFIG_HOTPLUG_CPU + register_cpu_notifier(&cpu_nfb); +#endif return 0; } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/net/core/dev.c .17007-linux-2.6.0-test5-bk11.updated/net/core/dev.c --- .17007-linux-2.6.0-test5-bk11/net/core/dev.c 2003-09-25 09:56:40.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/net/core/dev.c 2003-09-25 16:49:09.000000000 +1000 @@ -105,6 +105,7 @@ #include #include #include +#include #ifdef CONFIG_NET_RADIO #include /* Note : will define WIRELESS_EXT */ #include @@ -3039,3 +3040,57 @@ out: } subsys_initcall(net_dev_init); + +#ifdef CONFIG_HOTPLUG_CPU +static int dev_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *ocpu) +{ + struct sk_buff **list_skb; + struct net_device **list_net; + struct sk_buff *skb; + unsigned int cpu, oldcpu = (unsigned long)ocpu; + struct softnet_data *sd, *oldsd; + + if (action != CPU_OFFLINE) + return NOTIFY_OK; + + local_irq_disable(); + cpu = smp_processor_id(); + sd = &per_cpu(softnet_data, cpu); + oldsd = &per_cpu(softnet_data, oldcpu); + + /* Find end of our completion_queue. */ + list_skb = &sd->completion_queue; + while (*list_skb) + list_skb = &(*list_skb)->next; + /* Append completion queue from offline CPU. */ + *list_skb = oldsd->completion_queue; + oldsd->completion_queue = NULL; + + /* Find end of our output_queue. */ + list_net = &sd->output_queue; + while (*list_net) + list_net = &(*list_net)->next; + /* Append output queue from offline CPU. */ + *list_net = oldsd->output_queue; + oldsd->output_queue = NULL; + local_irq_enable(); + + /* Process offline CPU's input_pkt_queue */ + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) + netif_rx(skb); + + return NOTIFY_OK; +} + +static struct notifier_block cpu_callback_nfb = {&dev_cpu_callback, NULL, 0 }; + +static int __init dev_cpu_callback_init(void) +{ + register_cpu_notifier(&cpu_callback_nfb); + return 0; +} + +__initcall(dev_cpu_callback_init); +#endif /* CONFIG_HOTPLUG_CPU */
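
Below is a minimal, illustrative sketch (NOT part of the patch) of how a subsystem with
per-cpu state would use the notifier events introduced above, following the same pattern
as the fs/buffer.c and mm/swap.c callbacks; the example_count, example_total and
example_cpu_notify names are hypothetical.

/*
 * Illustrative only -- not part of this patch.  A hypothetical subsystem
 * keeps a per-cpu cached count and spills it into a global pool when the
 * CPU goes away, in the style of the mm/swap.c callback above.
 */
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/percpu.h>
#include <asm/atomic.h>

static DEFINE_PER_CPU(long, example_count);		/* per-cpu cache (hypothetical) */
static atomic_t example_total = ATOMIC_INIT(0);		/* global pool (hypothetical) */

static int __devinit example_cpu_notify(struct notifier_block *self,
					unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_UP_PREPARE:
		/* Set up per-cpu state before the CPU starts scheduling.
		 * Returning NOTIFY_BAD here would NAK the bring-up. */
		per_cpu(example_count, cpu) = 0;
		break;
	case CPU_UP_CANCELED:
	case CPU_DEAD:
		/* CPU never came up, or is completely gone: spill its
		 * cached count back into the global pool. */
		atomic_add(per_cpu(example_count, cpu), &example_total);
		per_cpu(example_count, cpu) = 0;
		break;
	case CPU_OFFLINE:
		/* CPU is no longer online but still scheduling: a subsystem
		 * with a kernel thread bound to it would stop it here. */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_cpu_nfb = { &example_cpu_notify, NULL, 0 };

static int __init example_hotplug_init(void)
{
	return register_cpu_notifier(&example_cpu_nfb);
}
__initcall(example_hotplug_init);

With CONFIG_HOTPLUG_CPU=y the whole sequence can then be driven from userspace through
the new sysfs attribute, e.g. "echo 0 > /sys/devices/system/cpu/cpu1/online" to invoke
cpu_down(1) and "echo 1" to bring it back; /sbin/hotplug is run with CPU=<n> and
ACTION=offline as the CPU goes away.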