Name: Hotplug CPU support for per-cpu structures
Author: Zwane Mwaikambo
Status: Tested on 2.5.59, 4xi386 (Zwane)
Depends: Hotcpu/hotcpu-core.patch.gz
D: Handles cpus going up and down in the mm/, fs/ and net/ directories.

Index: linux-2.5.59-lch2/mm/slab.c
Index: linux-2.5.59-lch2/mm/vmscan.c
Index: linux-2.5.59-lch2/fs/buffer.c
Index: linux-2.5.59-lch2/net/core/dev.c

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26070-linux-2.5.59-bk3/fs/buffer.c .26070-linux-2.5.59-bk3.updated/fs/buffer.c
--- .26070-linux-2.5.59-bk3/fs/buffer.c	2003-02-10 09:57:47.000000000 +1100
+++ .26070-linux-2.5.59-bk3.updated/fs/buffer.c	2003-02-10 15:33:55.000000000 +1100
@@ -2862,7 +2862,18 @@ static void buffer_init_cpu(int cpu)
 	bha->ratelimit = 0;
 	memset(bhl, 0, sizeof(*bhl));
 }
-
+
+static void buffer_exit_cpu(int cpu)
+{
+	int i;
+	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
+
+	for (i = 0; i < BH_LRU_SIZE; i++) {
+		brelse(b->bhs[i]);
+		b->bhs[i] = NULL;
+	}
+}
+
 static int __devinit buffer_cpu_notify(struct notifier_block *self,
 				unsigned long action, void *hcpu)
 {
@@ -2871,6 +2882,9 @@ static int __devinit buffer_cpu_notify(s
 	case CPU_UP_PREPARE:
 		buffer_init_cpu(cpu);
 		break;
+	case CPU_OFFLINE:
+		buffer_exit_cpu(cpu);
+		break;
 	default:
 		break;
 	}
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26070-linux-2.5.59-bk3/mm/slab.c .26070-linux-2.5.59-bk3.updated/mm/slab.c
--- .26070-linux-2.5.59-bk3/mm/slab.c	2003-02-10 09:57:52.000000000 +1100
+++ .26070-linux-2.5.59-bk3.updated/mm/slab.c	2003-02-10 15:33:55.000000000 +1100
@@ -527,6 +527,16 @@ static void start_cpu_timer(int cpu)
 	}
 }
 
+static void stop_cpu_timer(int cpu)
+{
+	struct timer_list *rt = &reap_timers[cpu];
+
+	if (rt->function) {
+		del_timer_sync(rt);
+		rt->function = NULL;
+	}
+}
+
 /*
  * Note: if someone calls kmem_cache_alloc() on the new
  * cpu before the cpuup callback had a chance to allocate
@@ -583,6 +593,24 @@ static int __devinit cpuup_callback(stru
 		}
 		up(&cache_chain_sem);
 		break;
+
+	case CPU_DEAD:
+		down(&cache_chain_sem);
+		list_for_each(p, &cache_chain) {
+			struct array_cache *nc;
+
+			kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next);
+			spin_lock_irq(&cachep->spinlock);
+			nc = cachep->array[cpu];
+			cachep->array[cpu] = NULL;
+			cachep->free_limit = (num_online_cpus())*cachep->batchcount
+						+ cachep->num;
+			kfree(nc);
+			spin_unlock_irq(&cachep->spinlock);
+		}
+		up(&cache_chain_sem);
+		stop_cpu_timer(cpu);
+		break;
 	}
 	return NOTIFY_OK;
 bad:
@@ -590,7 +618,7 @@ bad:
 	return NOTIFY_BAD;
 }
 
-static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
+static struct notifier_block __devinitdata cpucache_notifier = { &cpuup_callback, NULL, 0 };
 
 static inline void ** ac_entry(struct array_cache *ac)
 {
@@ -1235,6 +1263,9 @@ int kmem_cache_destroy (kmem_cache_t * c
 	}
 	{
 		int i;
+		/* no cpu_online check required here since we clear the percpu
+		 * array on cpu offline and set this to NULL.
+		 */
 		for (i = 0; i < NR_CPUS; i++)
 			kfree(cachep->array[i]);
 		/* NUMA: free the list3 structures */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26070-linux-2.5.59-bk3/mm/vmscan.c .26070-linux-2.5.59-bk3.updated/mm/vmscan.c
--- .26070-linux-2.5.59-bk3/mm/vmscan.c	2003-02-10 09:57:52.000000000 +1100
+++ .26070-linux-2.5.59-bk3.updated/mm/vmscan.c	2003-02-10 15:33:55.000000000 +1100
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -931,6 +932,7 @@ int kswapd(void *p)
 	daemonize();
 	set_cpus_allowed(tsk, node_to_cpumask(pgdat->node_id));
 	sprintf(tsk->comm, "kswapd%d", pgdat->node_id);
+	printk("Set %s affinity to %08lX\n", tsk->comm, tsk->cpus_allowed);
 	sigfillset(&tsk->blocked);
 
 	/*
@@ -998,6 +1000,45 @@ int shrink_all_memory(int nr_pages)
 }
 #endif
 
+/* It's optimal to keep kswapds on the same CPUs as their memory, but
+   not required for correctness. So if the last cpu in a node goes
+   away, let them run anywhere, and as the first one comes back,
+   restore their cpu bindings. */
+static int __devinit cpu_callback(struct notifier_block *nfb,
+				  unsigned long action,
+				  void *hcpu)
+{
+	pg_data_t *pgdat;
+	unsigned int hotcpu = (unsigned long)hcpu;
+	unsigned long mask;
+
+	if (action == CPU_OFFLINE) {
+		/* Make sure that kswapd never becomes unschedulable. */
+		for_each_pgdat(pgdat) {
+			mask = __node_to_cpu_mask(pgdat->node_id);
+			if (any_online_cpu(mask) < 0) {
+				mask = ~0UL;
+				set_cpus_allowed(pgdat->kswapd, mask);
+			}
+		}
+	}
+
+	if (action == CPU_ONLINE) {
+		for_each_pgdat(pgdat) {
+			mask = __node_to_cpu_mask(pgdat->node_id);
+			mask &= ~(1UL << hotcpu);
+			if (any_online_cpu(mask) < 0) {
+				mask |= (1UL << hotcpu);
+				/* One of our CPUs came back: restore mask */
+				set_cpus_allowed(pgdat->kswapd, mask);
+			}
+		}
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpu_nfb = { &cpu_callback, NULL, 0 };
+
 static int __init kswapd_init(void)
 {
 	pg_data_t *pgdat;
@@ -1005,6 +1046,7 @@ static int __init kswapd_init(void)
 	for_each_pgdat(pgdat)
 		kernel_thread(kswapd, pgdat, CLONE_KERNEL);
 	total_memory = nr_free_pagecache_pages();
+	register_cpu_notifier(&cpu_nfb);
 	return 0;
 }
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26070-linux-2.5.59-bk3/net/core/dev.c .26070-linux-2.5.59-bk3.updated/net/core/dev.c
--- .26070-linux-2.5.59-bk3/net/core/dev.c	2003-02-07 19:22:29.000000000 +1100
+++ .26070-linux-2.5.59-bk3.updated/net/core/dev.c	2003-02-10 15:33:55.000000000 +1100
@@ -2945,3 +2945,67 @@ static int net_run_sbin_hotplug(struct n
 	return call_usermodehelper(argv [0], argv, envp);
 }
 #endif
+
+static int dev_cpu_callback(struct notifier_block *nfb, unsigned long action, void *ocpu)
+{
+	struct sk_buff *list_sk, *sk_head;
+	struct net_device *list_net, *net_head;
+	struct softnet_data *queue;
+	struct sk_buff *skb;
+	unsigned int cpu = smp_processor_id();
+	unsigned long oldcpu = (unsigned long) ocpu;
+	unsigned long flags;
+
+	if (action != CPU_OFFLINE)
+		return 0;
+
+	local_irq_save(flags);
+
+	/* Move completion queue */
+
+	list_sk = softnet_data[oldcpu].completion_queue;
+	if (list_sk != NULL) {
+		sk_head = list_sk;
+		while (list_sk->next != NULL)
+			list_sk = list_sk->next;
+		list_sk->next = softnet_data[cpu].completion_queue;
+		softnet_data[cpu].completion_queue = sk_head;
+		softnet_data[oldcpu].completion_queue = NULL;
+	}
+
+	/* Move output_queue */
+
+	list_net = softnet_data[oldcpu].output_queue;
+	if (list_net != NULL) {
+		net_head = list_net;
+		while (list_net->next_sched != NULL)
+			list_net = list_net->next_sched;
+		list_net->next_sched = softnet_data[cpu].output_queue;
+		softnet_data[cpu].output_queue = net_head;
+		softnet_data[oldcpu].output_queue = NULL;
+	}
+
+	local_irq_restore(flags);
+
+	/* Move input_pkt_queue */
+
+	queue = &softnet_data[oldcpu];
+	for (;;) {
+		skb = __skb_dequeue(&queue->input_pkt_queue);
+		if (skb == NULL)
+			break;
+		netif_rx(skb);
+	}
+
+	return 0;
+}
+
+static struct notifier_block cpu_callback_nfb = { &dev_cpu_callback, NULL, 0 };
+
+static int __init dev_cpu_callback_init(void)
+{
+	register_cpu_notifier(&cpu_callback_nfb);
+	return 0;
+}
+
+__initcall(dev_cpu_callback_init);
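
Every hunk above follows the same shape: set up a cpu's private state on CPU_UP_PREPARE, and drain or free it from a surviving cpu once that cpu is gone (CPU_OFFLINE / CPU_DEAD), with a notifier_block registered once at init time. The sketch below is illustration only, not part of the patch: it assumes the CPU_OFFLINE event provided by hotcpu-core.patch.gz, and the foo_* names are made up.

/*
 * Minimal sketch of the per-cpu setup/teardown pattern used by this
 * patch.  foo_lists is a hypothetical per-cpu structure; CPU_OFFLINE
 * is assumed to come from hotcpu-core.patch.gz.
 */
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/percpu.h>

struct foo_list {
	int nr;
};

static DEFINE_PER_CPU(struct foo_list, foo_lists);

static int __devinit foo_cpu_notify(struct notifier_block *self,
				    unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	switch (action) {
	case CPU_UP_PREPARE:
		/* Runs before the new cpu starts: initialise its state. */
		per_cpu(foo_lists, cpu).nr = 0;
		break;
	case CPU_OFFLINE:
		/* Runs on a surviving cpu: drain what the dead cpu left. */
		per_cpu(foo_lists, cpu).nr = 0;
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block foo_cpu_nfb = { &foo_cpu_notify, NULL, 0 };

static int __init foo_cpu_init(void)
{
	register_cpu_notifier(&foo_cpu_nfb);
	return 0;
}
__initcall(foo_cpu_init);

Note that the teardown side always reaches the dead cpu's data with per_cpu(..., cpu) (or an explicit array index, as in slab.c and dev.c), never get_cpu_var(), because the callback runs on whichever online cpu delivers the notification.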