Index: linux-2.6.10-rc2-bk13-Percpu/mm/percpu.c
===================================================================
--- linux-2.6.10-rc2-bk13-Percpu.orig/mm/percpu.c	2004-12-02 15:34:02.000000000 +1100
+++ linux-2.6.10-rc2-bk13-Percpu/mm/percpu.c	2004-12-03 17:48:50.000000000 +1100
@@ -108,6 +108,10 @@
 	return b;
 }
 
+/* Enough for slab.c to bootstrap */
+#define INITIAL_NUM_ALLOCATED 40
+static __initdata int initial_sizes[INITIAL_NUM_ALLOCATED];
+
 /* Done early, so areas can be used. */
 void __init setup_per_cpu_areas(void)
 {
@@ -115,11 +119,15 @@
 	char *ptr;
 
 	/* Copy section for each CPU (we discard the original) */
-	reserved_size = ALIGN(__per_cpu_end - __per_cpu_start,SMP_CACHE_BYTES);
+	reserved_size = __per_cpu_end - __per_cpu_start;
+	/* Extra for initial slab allocations. */
+	reserved_size = ALIGN(reserved_size + 64,SMP_CACHE_BYTES);
+
 #ifdef CONFIG_MODULES
 	/* Enough to cover all DEFINE_PER_CPUs in modules, too. */
-	reserved_size = min(reserved_size, 8192UL * sizeof(unsigned long));
+	reserved_size = max(reserved_size, 8192UL * sizeof(unsigned long));
 #endif
+
 	/* Arch may choose to allocate much more for each CPU
 	 * (eg. large pages). */
 	percpu_size = reserved_size;
@@ -129,40 +137,49 @@
 		__per_cpu_offset[i] = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 	}
-}
-
-static int __init percpu_alloc_init(void)
-{
+
+	/* kmalloc is not available yet: install enough to get slab.c
+	 * to bootstrap. */
 	percpu_core.num_used = 2;
-	percpu_core.num_allocated = 4;
-	percpu_core.size = kmalloc(sizeof(percpu_core.size[0])
-				   * percpu_core.num_allocated,
-				   GFP_KERNEL);
+	percpu_core.num_allocated = INITIAL_NUM_ALLOCATED;
+	percpu_core.size = initial_sizes;
 	/* Static in-kernel percpu data (used, so negative). */
 	percpu_core.size[0] = -(__per_cpu_end - __per_cpu_start);
 	/* Free room. */
 	percpu_core.size[1] = percpu_size + percpu_core.size[0];
 	INIT_LIST_HEAD(&percpu_core.list);
+}
+
+/* Change over to a real sizes array now kmalloc exists. */
+void __init percpu_alloc_init(void)
+{
+	down(&percpu_lock);
+	percpu_core.size = kmalloc(sizeof(initial_sizes), GFP_KERNEL);
+	memcpy(percpu_core.size, initial_sizes, sizeof(initial_sizes));
+
 	/* Arch allocated more than we need for modules? */
 	if (percpu_size > reserved_size) {
+		unsigned long extra = percpu_size - reserved_size;
 		struct percpu_block *b;
 
-		/* Mark out extra space as allocated. */
-		percpu_core.size[1] = reserved_size + percpu_core.size[0];
-		percpu_core.size[2] = -(percpu_size - reserved_size);
+		/* Clip off extra space, mark as allocated. */
+		BUG_ON(percpu_core.size[percpu_core.num_used-1] < 0);
+		BUG_ON(percpu_core.num_used >= percpu_core.num_allocated);
+
+		percpu_core.size[percpu_core.num_used-1] -= extra;
+		percpu_core.size[percpu_core.num_used] = -extra;
 		percpu_core.num_used++;
 
 		/* Duplicate of core block, but with core space allocated. */
 		b = new_block();
-		b->size[0] = -reserved_size;
-		b->size[1] = percpu_size - reserved_size;
+		b->size[0] = -(percpu_size - extra);
+		b->size[1] = extra;
 		b->num_used = 2;
 		b->start = percpu_core.start;
 		list_add(&b->list, &percpu_core.list);
 	}
-	return 0;
+	up(&percpu_lock);
 }
-core_initcall(percpu_alloc_init);
 
 static int split_block(unsigned int i, unsigned short size,
 		       struct percpu_block *pb)
@@ -171,6 +188,7 @@
 {
 	if (pb->num_used + 1 > pb->num_allocated) {
 		int *new = kmalloc(sizeof(new[0]) * pb->num_allocated*2,
 				   GFP_KERNEL);
+
 		if (!new)
 			return 0;
@@ -288,6 +306,13 @@
 	unsigned int cpu;
 
 	down(&percpu_lock);
+	/* Bootstrap mode: allocations for slab.c.
+	 */
+	if (percpu_core.size == initial_sizes) {
+		BUG_ON(percpu_core.num_used == percpu_core.num_allocated);
+		ret = alloc_from_block(size, align, &percpu_core);
+		goto success;
+	}
+
 	/* Cleverly skips over kernel reserved space. */
 	list_for_each_entry(b, &percpu_core.list, list) {
 		ret = alloc_from_block(size, align, b);
@@ -334,7 +359,9 @@
 			goto unlock;
 		}
 	}
-	BUG();
+	if (system_state == SYSTEM_RUNNING)
+		printk("percpu: freeing bootstrap allocation? %p\n", freeme);
+	free_from_block(freeme, &percpu_core);
 unlock:
 	up(&percpu_lock);
 }
@@ -538,11 +565,14 @@
 		       - atomic_read(&percpu_local_ptr_count)
 		       - atomic_read(&percpu_local_count));
 
+	local_irq_disable();
 	atomic_set(&percpu_local_count, 0);
 	atomic_set(&percpu_count, 0);
 	atomic_set(&percpu_local_ptr_count, 0);
 	atomic_set(&percpu_ptr_count, 0);
 	atomic_set(&smp_id_count, 0);
+	local_irq_enable();
+
 	return len;
 }
Index: linux-2.6.10-rc2-bk13-Percpu/include/linux/percpu.h
===================================================================
--- linux-2.6.10-rc2-bk13-Percpu.orig/include/linux/percpu.h	2004-12-02 15:02:31.000000000 +1100
+++ linux-2.6.10-rc2-bk13-Percpu/include/linux/percpu.h	2004-12-03 17:29:25.000000000 +1100
@@ -35,7 +35,7 @@
 extern void free_percpu(const void *);
 extern void *percpu_modalloc(unsigned long size, unsigned long align);
 extern void percpu_modfree(void *freeme);
-
+extern void percpu_alloc_init(void);
 #else /* CONFIG_SMP */
 
 #define per_cpu_ptr(ptr, cpu) (ptr)
@@ -61,6 +61,10 @@
 static inline void percpu_modfree(void *freeme)
 {
 }
+
+static inline void percpu_alloc_init(void)
+{
+}
 #endif /* CONFIG_SMP */
 
 /* Simple wrapper for the common case: zeros memory. */
Index: linux-2.6.10-rc2-bk13-Percpu/mm/slab.c
===================================================================
--- linux-2.6.10-rc2-bk13-Percpu.orig/mm/slab.c	2004-12-03 17:26:46.000000000 +1100
+++ linux-2.6.10-rc2-bk13-Percpu/mm/slab.c	2004-12-03 17:36:54.000000000 +1100
@@ -92,6 +92,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -283,7 +284,7 @@
 struct kmem_cache_s {
 /* 1) per-cpu data, touched during every alloc/free */
-	struct array_cache	*array[NR_CPUS];
+	struct array_cache	*pc_array;
 	unsigned int		batchcount;
 	unsigned int		limit;
 /* 2) touched by every alloc & free from the backend */
@@ -508,9 +509,10 @@
 #undef CACHE
 };
 
-static struct arraycache_init initarray_cache __initdata =
+/* Used during bootstrap.
+ */
+static DEFINE_PER_CPU(struct arraycache_init, initarray_cache) =
 	{ { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
-static struct arraycache_init initarray_generic =
+static DEFINE_PER_CPU(struct arraycache_init, initarray_generic) =
 	{ { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 
 /* internal cache of cache description objs */
@@ -558,7 +560,7 @@
 
 static inline struct array_cache *ac_data(kmem_cache_t *cachep)
 {
-	return cachep->array[smp_processor_id()];
+	return __get_cpu_ptr(cachep->pc_array);
 }
 
 static kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags)
@@ -636,24 +638,18 @@
 	}
 }
 
-static struct array_cache *alloc_arraycache(int cpu, int entries, int batchcount)
+static void adjust_free_limits(unsigned int num_cpus)
 {
-	int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
-	struct array_cache *nc = NULL;
+	kmem_cache_t* cachep;
 
-	if (cpu != -1) {
-		nc = kmem_cache_alloc_node(kmem_find_general_cachep(memsize,
-			GFP_KERNEL), cpu_to_node(cpu));
-	}
-	if (!nc)
-		nc = kmalloc(memsize, GFP_KERNEL);
-	if (nc) {
-		nc->avail = 0;
-		nc->limit = entries;
-		nc->batchcount = batchcount;
-		nc->touched = 0;
+	down(&cache_chain_sem);
+	list_for_each_entry(cachep, &cache_chain, next) {
+		spin_lock_irq(&cachep->spinlock);
+		cachep->free_limit = num_cpus*cachep->batchcount
+			+ cachep->num;
+		spin_unlock_irq(&cachep->spinlock);
 	}
-	return nc;
+	up(&cache_chain_sem);
 }
 
 static int __devinit cpuup_callback(struct notifier_block *nfb,
@@ -661,26 +657,10 @@
 				  void *hcpu)
 {
 	long cpu = (long)hcpu;
-	kmem_cache_t* cachep;
 
 	switch (action) {
 	case CPU_UP_PREPARE:
-		down(&cache_chain_sem);
-		list_for_each_entry(cachep, &cache_chain, next) {
-			struct array_cache *nc;
-
-			nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount);
-			if (!nc)
-				goto bad;
-
-			spin_lock_irq(&cachep->spinlock);
-			cachep->array[cpu] = nc;
-			cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
-						+ cachep->num;
-			spin_unlock_irq(&cachep->spinlock);
-
-		}
-		up(&cache_chain_sem);
+		adjust_free_limits(1+num_online_cpus());
 		break;
 	case CPU_ONLINE:
 		start_cpu_timer(cpu);
@@ -689,28 +669,11 @@
 	case CPU_DEAD:
 		/* fall thru */
 	case CPU_UP_CANCELED:
-		down(&cache_chain_sem);
-
-		list_for_each_entry(cachep, &cache_chain, next) {
-			struct array_cache *nc;
-
-			spin_lock_irq(&cachep->spinlock);
-			/* cpu is dead; no one can alloc from it.
-			 */
-			nc = cachep->array[cpu];
-			cachep->array[cpu] = NULL;
-			cachep->free_limit -= cachep->batchcount;
-			free_block(cachep, nc, nc->avail);
-			spin_unlock_irq(&cachep->spinlock);
-			kfree(nc);
-		}
-		up(&cache_chain_sem);
+		adjust_free_limits(num_online_cpus());
 		break;
 #endif
 	}
 	return NOTIFY_OK;
-bad:
-	up(&cache_chain_sem);
-	return NOTIFY_BAD;
 }
 
 static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
@@ -753,8 +716,8 @@
 	INIT_LIST_HEAD(&cache_chain);
 	list_add(&cache_cache.next, &cache_chain);
 	cache_cache.colour_off = cache_line_size();
-	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
-
+	/* FIXME: Tricky, but make a macro to do this --RR */
+	cache_cache.pc_array = &per_cpu__initarray_cache.cache;
 	cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
 
 	cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
@@ -797,25 +760,26 @@
 	}
 	/* 4) Replace the bootstrap head arrays */
 	{
-		void * ptr;
-
-		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+		struct arraycache_init *ptr;
+
+		ptr = alloc_percpu(struct arraycache_init);
+		memcpy(__get_cpu_ptr(ptr), ac_data(&cache_cache),sizeof(*ptr));
 		local_irq_disable();
-		BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
-		memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
-		cache_cache.array[smp_processor_id()] = ptr;
+		cache_cache.pc_array = &ptr->cache;
 		local_irq_enable();
-
-		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+
+		ptr = alloc_percpu(struct arraycache_init);
+		memcpy(__get_cpu_ptr(ptr), ac_data(malloc_sizes[0].cs_cachep),
+		       sizeof(*ptr));
 		local_irq_disable();
-		BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
-		memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
-				sizeof(struct arraycache_init));
-		malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
+		malloc_sizes[0].cs_cachep->pc_array = &ptr->cache;
 		local_irq_enable();
 	}
-	/* 5) resize the head arrays to their final sizes */
+	/* 5) Take per-cpu allocation out of bootstrap (now kmalloc works). */
+	percpu_alloc_init();
+
+	/* 6) resize the head arrays to their final sizes */
 	{
 		kmem_cache_t *cachep;
 		down(&cache_chain_sem);
@@ -1396,10 +1360,12 @@
 			 * the cache that's used by kmalloc(24), otherwise
 			 * the creation of further caches will BUG().
 			 */
-			cachep->array[smp_processor_id()] = &initarray_generic.cache;
+			/* FIXME: Tricky, but make a macro to do this --RR */
+			cachep->pc_array = &per_cpu__initarray_generic.cache;
 			g_cpucache_up = PARTIAL;
 		} else {
-			cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL);
+			cachep->pc_array
+				= &alloc_percpu(struct arraycache_init)->cache;
 		}
 		BUG_ON(!ac_data(cachep));
 		ac_data(cachep)->avail = 0;
@@ -1597,8 +1563,6 @@
  */
 int kmem_cache_destroy (kmem_cache_t * cachep)
 {
-	int i;
-
 	if (!cachep || in_interrupt())
 		BUG();
@@ -1628,8 +1592,7 @@
 
 	/* no cpu_online check required here since we clear the percpu
 	 * array on cpu offline and set this to NULL.
 	 */
-	for (i = 0; i < NR_CPUS; i++)
-		kfree(cachep->array[i]);
+	free_percpu(cachep->pc_array);
 
 	/* NUMA: free the list3 structures */
 	kfree(cachep->lists.shared);
@@ -2515,67 +2478,87 @@
 
 struct ccupdate_struct {
 	kmem_cache_t *cachep;
-	struct array_cache *new[NR_CPUS];
+	struct array_cache *pc_array;
};
 
-static void do_ccupdate_local(void *info)
+static int set_pc_array(void *info)
 {
 	struct ccupdate_struct *new = (struct ccupdate_struct *)info;
 	struct array_cache *old;
 
 	check_irq_off();
-	old = ac_data(new->cachep);
-
-	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
-	new->new[smp_processor_id()] = old;
-}
+	old = new->cachep->pc_array;
+	new->cachep->pc_array = new->pc_array;
+	new->pc_array = old;
+	return 0;
+}
 
-static int do_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount, int shared)
+static int do_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount, int shared, int init)
 {
 	struct ccupdate_struct new;
-	struct array_cache *new_shared;
-	int i;
+	int err, i, memsize = sizeof(void*)*limit+sizeof(struct array_cache);
+	struct array_cache *share_array;
 
-	memset(&new.new,0,sizeof(new.new));
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_online(i)) {
-			new.new[i] = alloc_arraycache(i, limit, batchcount);
-			if (!new.new[i]) {
-				for (i--; i >= 0; i--) kfree(new.new[i]);
-				return -ENOMEM;
-			}
-		} else {
-			new.new[i] = NULL;
+	new.cachep = cachep;
+	new.pc_array = __alloc_percpu(memsize, __alignof__(*new.pc_array));
+	if (!new.pc_array)
+		return -ENOMEM;
+
+	for_each_cpu(i) {
+		per_cpu_ptr(new.pc_array, i)->avail = 0;
+		per_cpu_ptr(new.pc_array, i)->limit = limit;
+		per_cpu_ptr(new.pc_array, i)->batchcount = batchcount;
+		per_cpu_ptr(new.pc_array, i)->touched = 0;
+	}
+
+	/* Either before other CPUs up, or before cache returned. */
+	if (init) {
+		spin_lock_irq(&cachep->spinlock);
+		set_pc_array(&new);
+		spin_unlock_irq(&cachep->spinlock);
+	} else {
+		/* Do it atomically. */
+		err = stop_machine_run(set_pc_array, &new, NR_CPUS);
+		if (err) {
+			free_percpu(new.pc_array);
+			return err;
 		}
 	}
-	new.cachep = cachep;
-	smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
-
-	check_irq_on();
 	spin_lock_irq(&cachep->spinlock);
 	cachep->batchcount = batchcount;
 	cachep->limit = limit;
 	cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num;
 	spin_unlock_irq(&cachep->spinlock);
-	for (i = 0; i < NR_CPUS; i++) {
-		struct array_cache *ccold = new.new[i];
-		if (!ccold)
-			continue;
+	/* Old array is returned in new.pc_array.
+	 */
+	if (new.pc_array) {
 		spin_lock_irq(&cachep->spinlock);
-		free_block(cachep, ccold, ccold->avail);
+		for_each_cpu(i) {
+			struct array_cache *ac;
+
+			ac = per_cpu_ptr(new.pc_array, i);
+			free_block(cachep, ac, ac->avail);
+		}
 		spin_unlock_irq(&cachep->spinlock);
-		kfree(ccold);
+		free_percpu(new.pc_array);
 	}
-	new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d);
-	if (new_shared) {
+
+	memsize = sizeof(void*)*(batchcount*shared)+sizeof(struct array_cache);
+	share_array = kmalloc(memsize, GFP_KERNEL);
+
+	if (share_array) {
 		struct array_cache *old;
+		share_array->avail = 0;
+		share_array->limit = batchcount*shared;
+		share_array->batchcount = 0xbaadf00d;
+		share_array->touched = 0;
+
 		spin_lock_irq(&cachep->spinlock);
 		old = cachep->lists.shared;
-		cachep->lists.shared = new_shared;
+		cachep->lists.shared = share_array;
 		if (old)
 			free_block(cachep, old, old->avail);
 		spin_unlock_irq(&cachep->spinlock);
@@ -2632,7 +2615,7 @@
 	if (limit > 32)
 		limit = 32;
 #endif
-	err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared);
+	err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared, 1);
 	if (err)
 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
 					cachep->name, -err);
@@ -2653,7 +2636,7 @@
 		}
 		free_block(cachep, ac, tofree);
 		ac->avail -= tofree;
-		memmove(ac->entries, ac->entires + tofree,
+		memmove(ac->entries, ac->entries + tofree,
 					sizeof(void*)*ac->avail);
 	}
 }
@@ -2940,7 +2923,7 @@
 		    shared < 0) {
 			res = -EINVAL;
 		} else {
-			res = do_tune_cpucache(cachep, limit, batchcount, shared);
+			res = do_tune_cpucache(cachep, limit, batchcount, shared, 0);
 		}
 		break;
 	}
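
For anyone converting other NR_CPUS-indexed arrays the same way, here is a minimal
usage sketch of the dynamic per-cpu interface this patch moves slab.c onto.  It only
uses calls that appear in the diff above (alloc_percpu, __get_cpu_ptr, per_cpu_ptr,
for_each_cpu, free_percpu); the foo_stats structure and foo_* functions are made up
for illustration, not part of the patch.

	#include <linux/percpu.h>
	#include <linux/init.h>
	#include <linux/errno.h>

	/* Hypothetical per-cpu counter, analogous to kmem_cache_s.pc_array above. */
	struct foo_stats {
		unsigned long events;
	};

	static struct foo_stats *foo_stats;	/* handle returned by alloc_percpu() */

	static int __init foo_init(void)
	{
		/* One zeroed instance per CPU (alloc_percpu zeros memory). */
		foo_stats = alloc_percpu(struct foo_stats);
		if (!foo_stats)
			return -ENOMEM;
		return 0;
	}

	/* Caller keeps this CPU stable (irqs or preemption off), as ac_data() does. */
	static void foo_count_event(void)
	{
		__get_cpu_ptr(foo_stats)->events++;
	}

	static unsigned long foo_total(void)
	{
		unsigned long sum = 0;
		int i;

		/* Walk every CPU's instance, as do_tune_cpucache() does. */
		for_each_cpu(i)
			sum += per_cpu_ptr(foo_stats, i)->events;
		return sum;
	}

	static void foo_exit(void)
	{
		free_percpu(foo_stats);
	}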