Name: Hotplug CPU support for per-cpu structures
Author: Zwane Mwaikambo
Status: Tested on 2.5.59, 4xi386 (Zwane)
Depends: Hotcpu/hotcpu-core.patch.gz
D: Handles cpus going up and down in the mm/, fs/ and net/ directories.

Index: linux-2.5.59-lch2/mm/slab.c
Index: linux-2.5.59-lch2/mm/vmscan.c
Index: linux-2.5.59-lch2/fs/buffer.c
Index: linux-2.5.59-lch2/net/core/dev.c

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26070-linux-2.5.59-bk3/fs/buffer.c .26070-linux-2.5.59-bk3.updated/fs/buffer.c
--- .26070-linux-2.5.59-bk3/fs/buffer.c	2003-02-10 09:57:47.000000000 +1100
+++ .26070-linux-2.5.59-bk3.updated/fs/buffer.c	2003-02-10 15:33:55.000000000 +1100
@@ -2862,7 +2862,18 @@ static void buffer_init_cpu(int cpu)
 	bha->ratelimit = 0;
 	memset(bhl, 0, sizeof(*bhl));
 }
-
+
+static void buffer_exit_cpu(int cpu)
+{
+	int i;
+	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
+
+	for (i = 0; i < BH_LRU_SIZE; i++) {
+		brelse(b->bhs[i]);
+		b->bhs[i] = NULL;
+	}
+}
+
 static int __devinit buffer_cpu_notify(struct notifier_block *self,
 				unsigned long action, void *hcpu)
 {
@@ -2871,6 +2882,9 @@ static int __devinit buffer_cpu_notify(s
 	case CPU_UP_PREPARE:
 		buffer_init_cpu(cpu);
 		break;
+	case CPU_OFFLINE:
+		buffer_exit_cpu(cpu);
+		break;
 	default:
 		break;
 	}
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26070-linux-2.5.59-bk3/mm/slab.c .26070-linux-2.5.59-bk3.updated/mm/slab.c
--- .26070-linux-2.5.59-bk3/mm/slab.c	2003-02-10 09:57:52.000000000 +1100
+++ .26070-linux-2.5.59-bk3.updated/mm/slab.c	2003-02-10 15:33:55.000000000 +1100
@@ -527,6 +527,16 @@ static void start_cpu_timer(int cpu)
 	}
 }
 
+static void stop_cpu_timer(int cpu)
+{
+	struct timer_list *rt = &reap_timers[cpu];
+
+	if (rt->function) {
+		del_timer_sync(rt);
+		rt->function = NULL;
+	}
+}
+
 /*
  * Note: if someone calls kmem_cache_alloc() on the new
  * cpu before the cpuup callback had a chance to allocate
@@ -583,6 +593,24 @@ static int __devinit cpuup_callback(stru
 		}
 		up(&cache_chain_sem);
 		break;
+
+	case CPU_DEAD:
+		down(&cache_chain_sem);
+		list_for_each(p, &cache_chain) {
+			struct array_cache *nc;
+
+			kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next);
+			spin_lock_irq(&cachep->spinlock);
+			nc = cachep->array[cpu];
+			cachep->array[cpu] = NULL;
+			cachep->free_limit = (num_online_cpus())*cachep->batchcount
+						+ cachep->num;
+			kfree(nc);
+			spin_unlock_irq(&cachep->spinlock);
+		}
+		up(&cache_chain_sem);
+		stop_cpu_timer(cpu);
+		break;
 	}
 	return NOTIFY_OK;
 bad:
@@ -590,7 +618,7 @@ bad:
 	return NOTIFY_BAD;
 }
 
-static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
+static struct notifier_block __devinitdata cpucache_notifier = { &cpuup_callback, NULL, 0 };
 
 static inline void ** ac_entry(struct array_cache *ac)
 {
@@ -1235,6 +1263,9 @@ int kmem_cache_destroy (kmem_cache_t * c
 	}
 	{
 		int i;
+		/* no cpu_online check required here since we clear the percpu
+		 * array on cpu offline and set this to NULL.
+		 */
 		for (i = 0; i < NR_CPUS; i++)
 			kfree(cachep->array[i]);
 		/* NUMA: free the list3 structures */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26070-linux-2.5.59-bk3/mm/vmscan.c .26070-linux-2.5.59-bk3.updated/mm/vmscan.c
--- .26070-linux-2.5.59-bk3/mm/vmscan.c	2003-02-10 09:57:52.000000000 +1100
+++ .26070-linux-2.5.59-bk3.updated/mm/vmscan.c	2003-02-10 15:33:55.000000000 +1100
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -931,6 +932,7 @@ int kswapd(void *p)
 	daemonize();
 	set_cpus_allowed(tsk, node_to_cpumask(pgdat->node_id));
 	sprintf(tsk->comm, "kswapd%d", pgdat->node_id);
+	printk("Set %s affinity to %08lX\n", tsk->comm, tsk->cpus_allowed);
 	sigfillset(&tsk->blocked);
 
 	/*
@@ -998,6 +1000,45 @@ int shrink_all_memory(int nr_pages)
 }
 #endif
 
+/* It's optimal to keep kswapds on the same CPUs as their memory, but
+   not required for correctness. So if the last cpu in a node goes
+   away, let them run anywhere, and as the first one comes back,
+   restore their cpu bindings. */
+static int __devinit cpu_callback(struct notifier_block *nfb,
+				  unsigned long action,
+				  void *hcpu)
+{
+	pg_data_t *pgdat;
+	unsigned int hotcpu = (unsigned long)hcpu;
+	unsigned long mask;
+
+	if (action == CPU_OFFLINE) {
+		/* Make sure that kswapd never becomes unschedulable. */
+		for_each_pgdat(pgdat) {
+			mask = __node_to_cpu_mask(pgdat->node_id);
+			if (any_online_cpu(mask) < 0) {
+				mask = ~0UL;
+				set_cpus_allowed(pgdat->kswapd, mask);
+			}
+		}
+	}
+
+	if (action == CPU_ONLINE) {
+		for_each_pgdat(pgdat) {
+			mask = __node_to_cpu_mask(pgdat->node_id);
+			mask &= ~(1UL << hotcpu);
+			if (any_online_cpu(mask) < 0) {
+				mask |= (1UL << hotcpu);
+				/* One of our CPUs came back: restore mask */
+				set_cpus_allowed(pgdat->kswapd, mask);
+			}
+		}
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpu_nfb = { &cpu_callback, NULL, 0 };
+
 static int __init kswapd_init(void)
 {
 	pg_data_t *pgdat;
@@ -1005,6 +1046,7 @@ static int __init kswapd_init(void)
 	for_each_pgdat(pgdat)
 		kernel_thread(kswapd, pgdat, CLONE_KERNEL);
 	total_memory = nr_free_pagecache_pages();
+	register_cpu_notifier(&cpu_nfb);
 	return 0;
 }
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26070-linux-2.5.59-bk3/net/core/dev.c .26070-linux-2.5.59-bk3.updated/net/core/dev.c
--- .26070-linux-2.5.59-bk3/net/core/dev.c	2003-02-07 19:22:29.000000000 +1100
+++ .26070-linux-2.5.59-bk3.updated/net/core/dev.c	2003-02-10 15:33:55.000000000 +1100
@@ -2945,3 +2945,67 @@ static int net_run_sbin_hotplug(struct n
 	return call_usermodehelper(argv [0], argv, envp);
 }
 #endif
+
+static int dev_cpu_callback(struct notifier_block *nfb, unsigned long action, void *ocpu)
+{
+	struct sk_buff *list_sk, *sk_head;
+	struct net_device *list_net, *net_head;
+	struct softnet_data *queue;
+	struct sk_buff *skb;
+	unsigned int cpu = smp_processor_id();
+	unsigned long oldcpu = (unsigned long) ocpu;
+	unsigned long flags;
+
+	if (action != CPU_OFFLINE)
+		return 0;
+
+	local_irq_save(flags);
+
+	/* Move completion queue */
+
+	list_sk = softnet_data[oldcpu].completion_queue;
+	if (list_sk != NULL) {
+		sk_head = list_sk;
+		while (list_sk->next != NULL)
+			list_sk = list_sk->next;
+		list_sk->next = softnet_data[cpu].completion_queue;
+		softnet_data[cpu].completion_queue = sk_head;
+		softnet_data[oldcpu].completion_queue = NULL;
+	}
+
+	/* Move output_queue */
+
+	list_net = softnet_data[oldcpu].output_queue;
+	if (list_net != NULL) {
+		net_head = list_net;
+		while (list_net->next_sched != NULL)
+			list_net = list_net->next_sched;
+		list_net->next_sched = softnet_data[cpu].output_queue;
+		softnet_data[cpu].output_queue = net_head;
+		softnet_data[oldcpu].output_queue = NULL;
+	}
+
+	local_irq_restore(flags);
+
+	/* Move input_pkt_queue */
+
+	queue = &softnet_data[oldcpu];
+	for (;;) {
+		skb = __skb_dequeue(&queue->input_pkt_queue);
+		if (skb == NULL)
+			break;
+		netif_rx(skb);
+	}
+
+	return 0;
+}
+
+static struct notifier_block cpu_callback_nfb = { &dev_cpu_callback, NULL, 0 };
+
+static int __init dev_cpu_callback_init(void)
+{
+	register_cpu_notifier(&cpu_callback_nfb);
+	return 0;
+}
+
+__initcall(dev_cpu_callback_init);
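
Every hunk above follows the same shape: set up a cpu's private state on CPU_UP_PREPARE, and drain or free it from a surviving cpu once that cpu is gone (CPU_OFFLINE / CPU_DEAD), with a notifier_block registered once at init time. The sketch below is illustration only, not part of the patch: it assumes the CPU_OFFLINE event provided by hotcpu-core.patch.gz, and the foo_* names are made up.

/*
 * Minimal sketch of the per-cpu setup/teardown pattern used by this
 * patch.  foo_lists is a hypothetical per-cpu structure; CPU_OFFLINE
 * is assumed to come from hotcpu-core.patch.gz.
 */
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/percpu.h>

struct foo_list {
	int nr;
};

static DEFINE_PER_CPU(struct foo_list, foo_lists);

static int __devinit foo_cpu_notify(struct notifier_block *self,
				    unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	switch (action) {
	case CPU_UP_PREPARE:
		/* Runs before the new cpu starts: initialise its state. */
		per_cpu(foo_lists, cpu).nr = 0;
		break;
	case CPU_OFFLINE:
		/* Runs on a surviving cpu: drain what the dead cpu left. */
		per_cpu(foo_lists, cpu).nr = 0;
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block foo_cpu_nfb = { &foo_cpu_notify, NULL, 0 };

static int __init foo_cpu_init(void)
{
	register_cpu_notifier(&foo_cpu_nfb);
	return 0;
}
__initcall(foo_cpu_init);

Note that the teardown side always reaches the dead cpu's data with per_cpu(..., cpu) (or an explicit array index, as in slab.c and dev.c), never get_cpu_var(), because the callback runs on whichever online cpu delivers the notification.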