Name: Hotplug CPU Patch: Base XII
Author: Matt Fleming, Zwane Mwaikambo, Rusty Russell
Status: Experimental
Depends: Hotcpu/thread_control.patch.gz
Depends: Hotcpu/sigpower.patch.gz
D: This is the arch-indep hotplug cpu code, contributed by Matt
D: Fleming, Zwane Mwaikambo and me.  cpumask_t adoption by Dipankar Sarma.
D:
D: When a CPU goes down, normal tasks get migrated.  If their cpu mask
D: doesn't allow this, it gets reset to "any", and they get sent a
D: SIGPWR (which CPU is going down is in the siginfo struct).  Kernel
D: threads which are bound to particular CPUs are not touched: they
D: must register notifiers to deal with this themselves.
D:
D: These changes, while widespread, are fairly well contained in
D: CONFIG_HOTPLUG_CPU.  There are no doubt more places in the kernel
D: which contain per-cpu data structures which need notifying, but
D: these are the core ones.
D:
D: Patch contains:
D: - New option: CONFIG_HOTPLUG_CPU.
D: - drivers/base/cpu.c:
D:   Add "online" attribute to sysfs.
D: - fs/buffer.c:
D:   Release the buffer head lru list for cpu when it goes offline.
D: - kernel/cpu.c:
D:   cpu_down implementation.
D:   /sbin/hotplug call for cpu activity.
D:   New events: CPU_UP_PREPARE (can be NAKed), CPU_UP_CANCELED.
D:   New events: CPU_DOWN_PREPARE (can be NAKed), CPU_DOWN_CANCELED, CPU_DEAD.
D: - kernel/rcupdate.c:
D:   Code for clearing RCU queue of dead CPUs, under CONFIG_HOTPLUG_CPU.
D:   pull_from_global_queue() in rcu_process_callbacks, noop if !HOTPLUG_CPU.
D: - kernel/sched.c:
D:   Trivial exposed wake_idle_cpu(cpu) function, useful for some archs
D:   which fake hotplug CPUs.
D:   cpu_is_offline() check when pulling tasks onto CPU: NOOP if !HOTPLUG_CPU.
D:   Code to stop migration threads (similar to workqueue.c).
D: - kernel/softirq.c:
D:   Code to stop ksoftirqd when cpu goes offline, and to migrate
D:   irqs when it finally dies (under HOTPLUG_CPU).
D: - kernel/timer.c:
D:   Code to pull timers when cpu dies, under HOTPLUG_CPU.
D: - kernel/workqueue.c:
D:   Keep list of all workqueues in system.
D:   Move name pointer into wq struct, so we can name newly created thread
D:   if CPU comes up later.
D:   Grab cpucontrol lock around workqueue creation/destruction/flush.
D:   Code to kill workqueue threads when a CPU goes offline, under HOTPLUG_CPU.
D: - mm/slab.c:
D:   Move ac_entry and ac_data helper functions earlier in file.
D:   Clean up list iterators.
D:   Added stop_cpu_timer(), under HOTPLUG_CPU.
D:   Add code to free caches when cpu goes down.
D: - mm/swap.c:
D:   Spill local vm_committed_space counters into global when cpu dies,
D:   under HOTPLUG_CPU.
D: - mm/vmscan.c:
D:   Migrate kswapd when last cpu in node goes down, and back
D:   if one comes up, under HOTPLUG_CPU.
D: - net/core/dev.c:
D:   Add callback to drain softnet queue when cpu goes down.
D:
D: Changes since base VI:
D: 1) Thread stopping code all uses completion idiom, for simplicity,
D:    a-la workqueues.  This affects ksoftirqd and migration threads.
D: 2) Moved mm/slab.c code under CONFIG_HOTPLUG_CPU.
D: 3) Simplified code in net/core/dev.c and removed some whitespace.
D:
D: Changes since base VIII:
D: 1) Ignore TASK_DEAD tasks in check_for_tasks, too.
D: 2) Expose CPU_UP_CANCELED case in mm/slab.c to !HOTPLUG_CPU.
D: 3) Hold cachep->spinlock around free_block() call.
D: 4) any_online_cpu return value fixed to NR_CPUS.
D:
D: Changes since base IX:
D: 1) cpucontrol lock around workqueue creation/destruction/flushing.
D: 2) Clean up nr_uninterruptible numbers when CPU_DEAD.
D: 3) Add kernel_thread_on_cpu for convenient thread starting.
D: 4) Initialize completion in sched.c every time cpu comes up. D: D: Changes since base X: D: 1) Use kthread_create etc. diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/drivers/base/cpu.c .17007-linux-2.6.0-test5-bk11.updated/drivers/base/cpu.c --- .17007-linux-2.6.0-test5-bk11/drivers/base/cpu.c 2003-09-22 10:27:56.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/drivers/base/cpu.c 2003-09-25 14:54:34.000000000 +1000 @@ -7,6 +7,7 @@ #include #include #include +#include struct sysdev_class cpu_sysdev_class = { @@ -14,6 +15,46 @@ struct sysdev_class cpu_sysdev_class = { }; EXPORT_SYMBOL(cpu_sysdev_class); +#ifdef CONFIG_HOTPLUG_CPU +static ssize_t show_online(struct sys_device *dev, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, sysdev); + + return sprintf(buf, "%u\n", !!cpu_online(cpu->sysdev.id)); +} + +static ssize_t store_online(struct sys_device *dev, const char *buf, + size_t count) +{ + struct cpu *cpu = container_of(dev, struct cpu, sysdev); + ssize_t ret; + + switch (buf[0]) { + case '0': + ret = cpu_down(cpu->sysdev.id); + break; + case '1': + ret = cpu_up(cpu->sysdev.id); + break; + default: + ret = -EINVAL; + } + + if (ret >= 0) + ret = count; + return ret; +} +static SYSDEV_ATTR(online, 0600, show_online, store_online); + +static void __init register_cpu_control(struct cpu *cpu) +{ + sysdev_create_file(&cpu->sysdev, &attr_online); +} +#else /* ... !CONFIG_HOTPLUG_CPU */ +static void __init register_cpu_control(struct cpu *cpu) +{ +} +#endif /* CONFIG_HOTPLUG_CPU */ /* * register_cpu - Setup a driverfs device for a CPU. @@ -23,10 +64,15 @@ EXPORT_SYMBOL(cpu_sysdev_class); */ int __init register_cpu(struct cpu *cpu, int num, struct node *root) { + int ret; + cpu->node_id = cpu_to_node(num); cpu->sysdev.id = num; cpu->sysdev.cls = &cpu_sysdev_class; - return sys_device_register(&cpu->sysdev); + ret = sys_device_register(&cpu->sysdev); + if (ret == 0) + register_cpu_control(cpu); + return ret; } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/fs/buffer.c .17007-linux-2.6.0-test5-bk11.updated/fs/buffer.c --- .17007-linux-2.6.0-test5-bk11/fs/buffer.c 2003-09-22 10:27:35.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/fs/buffer.c 2003-09-25 14:54:34.000000000 +1000 @@ -2996,7 +2996,18 @@ static void buffer_init_cpu(int cpu) bha->ratelimit = 0; memset(bhl, 0, sizeof(*bhl)); } - + +static void buffer_exit_cpu(int cpu) +{ + int i; + struct bh_lru *b = &per_cpu(bh_lrus, cpu); + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } +} + static int __devinit buffer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -3005,6 +3016,9 @@ static int __devinit buffer_cpu_notify(s case CPU_UP_PREPARE: buffer_init_cpu(cpu); break; + case CPU_OFFLINE: + buffer_exit_cpu(cpu); + break; default: break; } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/include/linux/cpu.h .17007-linux-2.6.0-test5-bk11.updated/include/linux/cpu.h --- .17007-linux-2.6.0-test5-bk11/include/linux/cpu.h 2003-09-22 10:09:13.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/include/linux/cpu.h 2003-09-25 14:54:34.000000000 +1000 @@ -37,6 +37,7 @@ extern int register_cpu_notifier(struct extern void unregister_cpu_notifier(struct notifier_block *nb); int cpu_up(unsigned int cpu); +int 
cpu_down(unsigned int cpu); #else static inline int register_cpu_notifier(struct notifier_block *nb) { diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/include/linux/mmzone.h .17007-linux-2.6.0-test5-bk11.updated/include/linux/mmzone.h --- .17007-linux-2.6.0-test5-bk11/include/linux/mmzone.h 2003-09-22 10:28:12.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/include/linux/mmzone.h 2003-09-25 14:54:34.000000000 +1000 @@ -209,6 +209,7 @@ typedef struct pglist_data { int node_id; struct pglist_data *pgdat_next; wait_queue_head_t kswapd_wait; + struct task_struct *kswapd; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/include/linux/notifier.h .17007-linux-2.6.0-test5-bk11.updated/include/linux/notifier.h --- .17007-linux-2.6.0-test5-bk11/include/linux/notifier.h 2003-09-21 17:31:11.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/include/linux/notifier.h 2003-09-25 14:54:34.000000000 +1000 @@ -60,11 +60,14 @@ extern int notifier_call_chain(struct no #define NETLINK_URELEASE 0x0001 /* Unicast netlink socket released */ -#define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */ -#define CPU_UP_PREPARE 0x0003 /* CPU (unsigned)v coming up */ -#define CPU_UP_CANCELED 0x0004 /* CPU (unsigned)v NOT coming up */ -#define CPU_OFFLINE 0x0005 /* CPU (unsigned)v offline (still scheduling) */ -#define CPU_DEAD 0x0006 /* CPU (unsigned)v dead */ +#define CPU_ONLINE 0x0002 /* CPU v is up */ +#define CPU_UP_PREPARE 0x0003 /* CPU v coming up (can fail) */ +#define CPU_UP_CANCELED 0x0004 /* CPU v NOT coming up */ +#define CPU_DOWN_PREPARE 0x0005 /* CPU v going down (can fail) */ +#define CPU_DOWN_CANCELED 0x0006 /* CPU v NOT going down */ +#define CPU_OFFLINE 0x0007 /* CPU v offline + (still scheduling) */ +#define CPU_DEAD 0x0008 /* CPU v dead */ #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/include/linux/sched.h .17007-linux-2.6.0-test5-bk11.updated/include/linux/sched.h --- .17007-linux-2.6.0-test5-bk11/include/linux/sched.h 2003-09-25 09:56:38.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/include/linux/sched.h 2003-09-25 16:39:50.000000000 +1000 @@ -518,11 +518,14 @@ extern void node_nr_running_init(void); #define node_nr_running_init() {} #endif +/* Move tasks off this (offline) CPU onto another. */ +extern void migrate_all_tasks(void); extern void set_user_nice(task_t *p, long nice); extern int task_prio(task_t *p); extern int task_nice(task_t *p); extern int task_curr(task_t *p); extern int idle_cpu(int cpu); +extern void wake_idle_cpu(unsigned int cpu); void yield(void); @@ -875,7 +878,6 @@ static inline void set_task_cpu(struct t } #endif /* CONFIG_SMP */ - #endif /* __KERNEL__ */ #endif diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/kernel/cpu.c .17007-linux-2.6.0-test5-bk11.updated/kernel/cpu.c --- .17007-linux-2.6.0-test5-bk11/kernel/cpu.c 2003-09-22 10:09:14.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/kernel/cpu.c 2003-09-25 14:54:34.000000000 +1000 @@ -1,14 +1,17 @@ /* CPU control. - * (C) 2001 Rusty Russell + * (C) 2001, 2002, 2003 Rusty Russell + * * This code is licenced under the GPL. 
*/ #include #include #include -#include #include #include +#include /* for hotplug_path */ #include +#include +#include #include /* This protects CPUs going up and down... */ @@ -19,15 +22,159 @@ static struct notifier_block *cpu_chain /* Need to know about CPUs going up/down? */ int register_cpu_notifier(struct notifier_block *nb) { - return notifier_chain_register(&cpu_chain, nb); + int ret; + + if ((ret = down_interruptible(&cpucontrol)) != 0) + return ret; + ret = notifier_chain_register(&cpu_chain, nb); + up(&cpucontrol); + return ret; } void unregister_cpu_notifier(struct notifier_block *nb) { - notifier_chain_unregister(&cpu_chain,nb); + down(&cpucontrol); + notifier_chain_unregister(&cpu_chain, nb); + up(&cpucontrol); } -int __devinit cpu_up(unsigned int cpu) +#ifdef CONFIG_HOTPLUG_CPU +static inline void check_for_tasks(int cpu) +{ + struct task_struct *p; + + write_lock_irq(&tasklist_lock); + for_each_process(p) { + int dying = p->state & (TASK_ZOMBIE | TASK_DEAD); + if (p->thread_info->cpu == cpu && !dying) + printk(KERN_WARNING "Task %s is on cpu %d, " + "not dying\n", p->comm, cpu); + } + write_unlock_irq(&tasklist_lock); +} + +/* Notify userspace when a cpu event occurs, by running '/sbin/hotplug + * cpu' with certain environment variables set. */ +static int cpu_run_sbin_hotplug(unsigned int cpu, const char *action) +{ + char *argv[3], *envp[5], cpu_str[12], action_str[32]; + int i; + + sprintf(cpu_str, "CPU=%d", cpu); + sprintf(action_str, "ACTION=%s", action); + /* FIXME: Add DEVPATH. --RR */ + + i = 0; + argv[i++] = hotplug_path; + argv[i++] = "cpu"; + argv[i] = NULL; + + i = 0; + /* minimal command environment */ + envp [i++] = "HOME=/"; + envp [i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp [i++] = cpu_str; + envp [i++] = action_str; + envp [i] = NULL; + + return call_usermodehelper(argv[0], argv, envp, 0); +} + + +static inline int cpu_down_check(unsigned int cpu) +{ + if (!cpu_online(cpu)) + return -EINVAL; + + if (num_online_cpus() == 1) + return -EBUSY; + + return 0; +} + +static inline int cpu_disable(int cpu) +{ + int ret; + + ret = __cpu_disable(); + if (ret < 0) + return ret; + + /* Everyone looking at cpu_online() should be doing so with + * preemption disabled. */ + synchronize_kernel(); + BUG_ON(cpu_online(cpu)); + return 0; +} + +int cpu_down(unsigned int cpu) +{ + int err, rc; + void *vcpu = (void *)(long)cpu; + cpumask_t mask; + + if ((err = down_interruptible(&cpucontrol)) != 0) + return err; + + if ((err = cpu_down_check(cpu)) != 0) + goto out; + + rc = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, vcpu); + if (rc == NOTIFY_BAD) { + /* FIXME: It'd be nice to only call those who saw + * CPU_DOWN_PREPARE. --RR */ + err = -EBUSY; + goto notify_out; + } + + /* Schedule ourselves on the dying CPU. */ + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + + if ((err = cpu_disable(cpu)) != 0) + goto notify_out; + + cpu_run_sbin_hotplug(cpu, "offline"); + + /* Move other tasks off to other CPUs (simple since they are + not running now). */ + migrate_all_tasks(); + + /* Move off dying CPU, which will revert to idle process. */ + cpus_clear(mask); + cpus_complement(mask); + cpu_clear(cpu, mask); + set_cpus_allowed(current, mask); + + /* Tell kernel threads to go away: they can't fail here. */ + rc = notifier_call_chain(&cpu_chain, CPU_OFFLINE, vcpu); + BUG_ON(rc == NOTIFY_BAD); + + check_for_tasks(cpu); + + /* This actually kills the CPU. 
*/ + __cpu_die(cpu); + + notify_out: + if (err < 0) + rc = notifier_call_chain(&cpu_chain, CPU_DOWN_CANCELED, vcpu); + else { + /* CPU is completely dead: tell everyone. Too late to + * complain. */ + rc = notifier_call_chain(&cpu_chain, CPU_DEAD, vcpu); + } + BUG_ON(rc == NOTIFY_BAD); +out: + up(&cpucontrol); + return err; +} +#else +static inline int cpu_run_sbin_hotplug(unsigned int cpu, const char *action) +{ + return 0; +} +#endif /*CONFIG_HOTPLUG_CPU*/ + +int cpu_up(unsigned int cpu) { int ret; void *hcpu = (void *)(long)cpu; @@ -41,7 +188,7 @@ int __devinit cpu_up(unsigned int cpu) } ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); if (ret == NOTIFY_BAD) { - printk("%s: attempt to bring up CPU %u failed\n", + printk(KERN_DEBUG "%s: attempt to bring up CPU %u failed\n", __FUNCTION__, cpu); ret = -EINVAL; goto out_notify; diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/kernel/kthread.c .17007-linux-2.6.0-test5-bk11.updated/kernel/kthread.c --- .17007-linux-2.6.0-test5-bk11/kernel/kthread.c 2003-09-25 14:54:33.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/kernel/kthread.c 2003-09-25 17:59:18.000000000 +1000 @@ -10,6 +10,43 @@ #include #include +/* All thread comms is command -> ack, so we keep it simple. */ +struct kt_message +{ + struct task_struct *from, *to; + void *info; +}; + +static spinlock_t ktm_lock = SPIN_LOCK_UNLOCKED; +static struct kt_message ktm; + +static void ktm_send(struct task_struct *to, void *info) +{ + spin_lock(&ktm_lock); + ktm.to = to; + ktm.from = current; + ktm.info = info; + wake_up_process(ktm.to); + spin_unlock(&ktm_lock); +} + +static struct kt_message ktm_receive(void) +{ + struct kt_message m; + + for (;;) { + spin_lock(&ktm_lock); + if (ktm.to == current) + break; + current->state = TASK_INTERRUPTIBLE; + spin_unlock(&ktm_lock); + schedule(); + } + m = ktm; + spin_unlock(&ktm_lock); + return m; +} + struct kthread { int (*initfn)(void *data); @@ -18,57 +55,50 @@ struct kthread char *name; }; -/* We serialize kthread operations, so they all share these */ -static DECLARE_MUTEX(kthread_lock); -static struct task_struct *kthread_result; -static struct completion kthread_done; -static struct task_struct *kthread_stop; - static int kthread(void *data) { /* Copy data: it's on keventd_init's stack */ struct kthread k = *(struct kthread *)data; - int ret; + struct kt_message m; + int ret = 0; - /* Created by __kthread_create */ + /* Send to spawn_kthread, so it knows who we are. */ daemonize("%s", k.name); - kthread_result = current; - complete(&kthread_done); - schedule(); + ktm_send(current->real_parent, current); - /* Woken by kthread_destroy? */ - if (kthread_stop == current) + /* Receive from kthread_start or kthread_destroy */ + m = ktm_receive(); + if (!m.info) goto stop; - - /* Woken by kthread_init. */ - ret = k.initfn ? k.initfn(k.data) : 0; - if (ret < 0) { - kthread_result = ERR_PTR(ret); + if (k.initfn && (ret = k.initfn(k.data)) < 0) goto stop; - } - kthread_result = current; - complete(&kthread_done); + ktm_send(m.from, current); - while (kthread_stop != current) { - /* Must read kthread_stop before we schedule */ - smp_mb(); - schedule(); - /* Woken by random process or kthread_destroy */ + for (;;) { + /* If it fails, just wait until kthread_destroy. 
*/ + if (k.corefn && (ret = k.corefn(k.data)) < 0) + k.corefn = NULL; - /* If keventd would reap children, we could just exit, - * and do a sys_waitpid() in kthread_destroy, rather - * than hanging around. */ - if (kthread_stop != current && k.corefn) { - ret = k.corefn(k.data); - if (ret < 0) - k.corefn = NULL; + /* Check if we're being told to stop. */ + spin_lock(&ktm_lock); + if (ktm.to == current && ktm.info == NULL) { + m = ktm; + spin_unlock(&ktm_lock); + goto stop; } + current->state == TASK_INTERRUPTIBLE; + spin_unlock(&ktm_lock); + schedule(); } + stop: - complete(&kthread_done); + ktm_send(m.from, NULL); return 0; } +/* Serialize all kthread control stuff, for simplicity. */ +static DECLARE_MUTEX(kthread_control); + struct kthread_create { struct task_struct *result; @@ -82,16 +112,13 @@ static void spawn_kthread(void *data) struct kthread_create *kc = data; int ret; - down(&kthread_lock); - init_completion(&kthread_done); ret = kernel_thread(kthread, &kc->k, CLONE_KERNEL); if (ret < 0) kc->result = ERR_PTR(ret); else { - wait_for_completion(&kthread_done); - kc->result = kthread_result; + struct kt_message m = ktm_receive(); + kc->result = m.info; } - up(&kthread_lock); complete(&kc->done); } @@ -117,36 +144,33 @@ struct task_struct *kthread_create(int ( kc.k.data = data; kc.k.name = name; + down(&kthread_control); schedule_work(&work); wait_for_completion(&kc.done); + up(&kthread_control); return kc.result; } struct task_struct *kthread_start(struct task_struct *k) { - down(&kthread_lock); - init_completion(&kthread_done); - wake_up_process(k); - wait_for_completion(&kthread_done); + struct kt_message m; - k = kthread_result; - up(&kthread_lock); + down(&kthread_control); + ktm_send(k, k); + m = ktm_receive(); + up(&kthread_control); - return k; + return m.info; } int kthread_destroy(struct task_struct *k) { - int ret; - - down(&kthread_lock); - init_completion(&kthread_done); - kthread_stop = k; + struct kt_message m; - wake_up_process(k); - wait_for_completion(&kthread_done); - ret = PTR_ERR(kthread_result); - up(&kthread_lock); + down(&kthread_control); + ktm_send(k, NULL); + m = ktm_receive(); + up(&kthread_control); - return ret; + return PTR_ERR(m.info); } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/kernel/rcupdate.c .17007-linux-2.6.0-test5-bk11.updated/kernel/rcupdate.c --- .17007-linux-2.6.0-test5-bk11/kernel/rcupdate.c 2003-09-22 10:27:38.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/kernel/rcupdate.c 2003-09-25 14:54:34.000000000 +1000 @@ -154,6 +154,78 @@ out_unlock: } +#ifdef CONFIG_HOTPLUG_CPU +/* slack queue used for offloading callbacks e.g. in the case of a cpu + * going offline */ +static struct rcu_global_queue_s { + spinlock_t lock; + struct list_head list; +} rcu_global_queue = { + .lock = SPIN_LOCK_UNLOCKED, + .list = LIST_HEAD_INIT(rcu_global_queue.list), +}; + +/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing + * locking requirements, the list it's pulling from has to belong to a cpu + * which is dead and hence not processing interrupts. 
+ */ +static void rcu_move_batch(struct list_head *list) +{ + struct list_head *entry; + unsigned long flags; + + spin_lock_irqsave(&rcu_global_queue.lock, flags); + while (!list_empty(list)) { + entry = list->next; + list_del(entry); + list_add_tail(entry, &rcu_global_queue.list); + } + spin_unlock_irqrestore(&rcu_global_queue.lock, flags); +} + +static void rcu_offline_cpu(int cpu) +{ + /* if the cpu going offline owns the grace period + * we can block indefinitely waiting for it, so flush + * it here + */ + spin_lock_irq(&rcu_ctrlblk.mutex); + if (RCU_batch(cpu) == rcu_ctrlblk.curbatch) { + rcu_ctrlblk.curbatch++; + rcu_start_batch(rcu_ctrlblk.maxbatch); + } + spin_unlock_irq(&rcu_ctrlblk.mutex); + + rcu_move_batch(&RCU_curlist(cpu)); + rcu_move_batch(&RCU_nxtlist(cpu)); + + BUG_ON(!list_empty(&RCU_curlist(cpu))); + BUG_ON(!list_empty(&RCU_nxtlist(cpu))); + + tasklet_kill(&RCU_tasklet(cpu)); + list_del_init(&RCU_curlist(cpu)); + list_del_init(&RCU_nxtlist(cpu)); + memset(&per_cpu(rcu_data, cpu), 0, sizeof(struct rcu_data)); +} + +static inline void pull_from_global_queue(int cpu) +{ + /* Pick up any pending global callbacks. This is rarely used + * so lock contention is fine. Each cpu picks one callback and it's + * ok if we miss one since someone else can pick it up */ + if (unlikely(!list_empty(&rcu_global_queue.list))) { + spin_lock(&rcu_global_queue.lock); + if (!list_empty(&rcu_global_queue.list)) + list_move_tail(&rcu_global_queue.list, &RCU_nxtlist(cpu)); + spin_unlock(&rcu_global_queue.lock); + } +} +#else /* ... !CONFIG_HOTPLUG_CPU */ +static inline void pull_from_global_queue(int cpu) +{ +} +#endif /* CONFIG_HOTPLUG_CPU */ + /* * This does the RCU processing work from tasklet context. */ @@ -169,6 +241,9 @@ static void rcu_process_callbacks(unsign } local_irq_disable(); + + pull_from_global_queue(cpu); + if (!list_empty(&RCU_nxtlist(cpu)) && list_empty(&RCU_curlist(cpu))) { list_splice(&RCU_nxtlist(cpu), &RCU_curlist(cpu)); INIT_LIST_HEAD(&RCU_nxtlist(cpu)); @@ -214,7 +289,11 @@ static int __devinit rcu_cpu_notify(stru case CPU_UP_PREPARE: rcu_online_cpu(cpu); break; - /* Space reserved for CPU_OFFLINE :) */ +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + rcu_offline_cpu(cpu); + break; +#endif default: break; } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/kernel/sched.c .17007-linux-2.6.0-test5-bk11.updated/kernel/sched.c --- .17007-linux-2.6.0-test5-bk11/kernel/sched.c 2003-09-25 09:56:39.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/kernel/sched.c 2003-09-25 17:56:49.000000000 +1000 @@ -35,6 +35,8 @@ #include #include #include +#include +#include #ifdef CONFIG_NUMA #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) @@ -568,6 +570,22 @@ repeat: } #endif + +#ifdef CONFIG_HOTPLUG_CPU +/* Wake up a CPU from idle */ +void wake_idle_cpu(unsigned int cpu) +{ + resched_task(cpu_rq(cpu)->idle); +} + +static inline int cpu_is_offline(unsigned int cpu) +{ + return !cpu_online(cpu); +} +#else /* ... 
!CONFIG_HOTPLUG_CPU */ +#define cpu_is_offline(cpu) 0 +#endif /* CONFIG_HOTPLUG_CPU */ + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -601,7 +619,8 @@ repeat_lock_task: */ if (unlikely(sync && !task_running(rq, p) && (task_cpu(p) != smp_processor_id()) && - cpu_isset(smp_processor_id(), p->cpus_allowed))) { + cpu_isset(smp_processor_id(), p->cpus_allowed) + && !cpu_is_offline(smp_processor_id()))) { set_task_cpu(p, smp_processor_id()); task_rq_unlock(rq, &flags); @@ -815,9 +834,11 @@ unsigned long nr_running(void) { unsigned long i, sum = 0; - for (i = 0; i < NR_CPUS; i++) + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; sum += cpu_rq(i)->nr_running; - + } return sum; } @@ -838,7 +859,7 @@ unsigned long nr_context_switches(void) unsigned long i, sum = 0; for (i = 0; i < NR_CPUS; i++) { - if (!cpu_online(i)) + if (!cpu_possible(i)) continue; sum += cpu_rq(i)->nr_switches; } @@ -1163,6 +1184,10 @@ static void load_balance(runqueue_t *thi struct list_head *head, *curr; task_t *tmp; + /* CPU going down is a special case: we don't pull more tasks onboard */ + if (unlikely(cpu_is_offline(this_cpu))) + goto out; + busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask); if (!busiest) goto out; @@ -2598,62 +2623,129 @@ static void move_task_away(struct task_s local_irq_restore(flags); } -typedef struct { - int cpu; - struct completion startup_done; - task_t *task; -} migration_startup_t; - /* * migration_thread - this is a highprio system thread that performs * thread migration by bumping thread off CPU then 'pushing' onto * another runqueue. */ -static int migration_thread(void * data) +static int migration_kthread_init(void *data) { /* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */ struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 }; - migration_startup_t *startup = data; - int cpu = startup->cpu; - runqueue_t *rq; - int ret; - - startup->task = current; - complete(&startup->startup_done); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); + unsigned int cpu = (long)data; BUG_ON(smp_processor_id() != cpu); - daemonize("migration/%d", cpu); set_fs(KERNEL_DS); - ret = setscheduler(0, SCHED_FIFO, ¶m); + setscheduler(0, SCHED_FIFO, ¶m); + return 0; +} - rq = this_rq(); - rq->migration_thread = current; +static int migration_kthread(void *data) +{ + runqueue_t *rq; + struct list_head *head; + migration_req_t *req; - for (;;) { - struct list_head *head; - migration_req_t *req; + rq = this_rq(); - spin_lock_irq(&rq->lock); - head = &rq->migration_queue; - current->state = TASK_INTERRUPTIBLE; - if (list_empty(head)) { - spin_unlock_irq(&rq->lock); - schedule(); - continue; - } + spin_lock_irq(&rq->lock); + head = &rq->migration_queue; + current->state = TASK_INTERRUPTIBLE; + while (!list_empty(head)) { req = list_entry(head->next, migration_req_t, list); list_del_init(head->next); - spin_unlock_irq(&rq->lock); + spin_unlock_irq(&rq->lock); move_task_away(req->task, any_online_cpu(req->task->cpus_allowed)); complete(&req->done); + spin_lock_irq(&rq->lock); } + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +/* migrate_all_tasks - function to migrate all the tasks from the + * current cpu caller must have already scheduled this to the target + * cpu via set_cpus_allowed */ +void migrate_all_tasks(void) +{ + struct task_struct *tsk, *t; + int dest_cpu, src_cpu, signalme; + unsigned int node; + struct siginfo sig; + + /* We're nailed to this CPU. 
*/ + src_cpu = smp_processor_id(); + + sig.si_signo = SIGPWR; + sig.si_errno = 0; + sig.si_code = SI_KERNEL; + sig.si_cpu = src_cpu; + +again: + /* lock out everyone else intentionally */ + write_lock_irq(&tasklist_lock); + + /* watch out for per node tasks, let's stay on this node */ + node = cpu_to_node(src_cpu); + + do_each_thread(t, tsk) { + cpumask_t mask; + if (tsk == current) + continue; + + if (task_cpu(tsk) != src_cpu) + continue; + + /* Figure out where this task should go (attempting to + * keep it on-node), and check if it can be migrated + * as-is. NOTE that kernel threads bound to more than + * one online cpu will be migrated. */ + mask = node_to_cpumask(node); + cpus_and(mask, mask, tsk->cpus_allowed); + dest_cpu = any_online_cpu(mask); + if (dest_cpu == NR_CPUS) + dest_cpu = any_online_cpu(tsk->cpus_allowed); + if (dest_cpu == NR_CPUS) { + /* Kernel threads which are bound to specific + * processors need to look after themselves + * with their own callbacks. + */ + if (tsk->mm == NULL) + continue; + + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + tsk->pid, tsk->comm, src_cpu); + cpus_clear(tsk->cpus_allowed); + cpus_complement(tsk->cpus_allowed); + dest_cpu = any_online_cpu(tsk->cpus_allowed); + signalme = 1; + } else + signalme = 0; + + get_task_struct(tsk); + move_task_away(tsk, dest_cpu); + if (signalme) + goto do_signal; + put_task_struct(tsk); + } while_each_thread(t, tsk); + + write_unlock_irq(&tasklist_lock); + return; + +do_signal: + /* Need to drop tasklist_lock to send signal, then restart. */ + write_unlock_irq(&tasklist_lock); + send_sig_info(sig.si_signo, &sig, tsk); + put_task_struct(tsk); + goto again; + } +#endif /* CONFIG_HOTPLUG_CPU */ /* * migration_call - callback that gets triggered when a CPU is added. @@ -2663,42 +2755,66 @@ static int migration_call(struct notifie unsigned long action, void *hcpu) { - long cpu = (long) hcpu; - migration_startup_t startup; + unsigned int cpu = (long)hcpu; + struct task_struct *p; switch (action) { - case CPU_ONLINE: - - printk("Starting migration thread for cpu %li\n", cpu); - - startup.cpu = cpu; - startup.task = NULL; - init_completion(&startup.startup_done); - - kernel_thread(migration_thread, &startup, CLONE_KERNEL); - wait_for_completion(&startup.startup_done); - wait_task_inactive(startup.task); + case CPU_UP_PREPARE: + p = kthread_create(migration_kthread_init, migration_kthread, + hcpu, "migration/%d", cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + /* Manually bind to CPU: thread stopped, so this is OK. */ + p->thread_info->cpu = cpu; + p->cpus_allowed = cpumask_of_cpu(cpu); + cpu_rq(cpu)->migration_thread = p; + break; - startup.task->thread_info->cpu = cpu; - startup.task->cpus_allowed = cpumask_of_cpu(cpu); + case CPU_UP_CANCELED: + /* Bind back to this cpu so it can run. 
*/ + p = cpu_rq(cpu)->migration_thread; + cpu_rq(cpu)->migration_thread = NULL; + p->thread_info->cpu = smp_processor_id(); + p->cpus_allowed = cpumask_of_cpu(smp_processor_id()); + kthread_destroy(p); + break; - wake_up_process(startup.task); + case CPU_ONLINE: + kthread_start(cpu_rq(cpu)->migration_thread); + break; - while (!cpu_rq(cpu)->migration_thread) - yield(); +#ifdef CONFIG_HOTPLUG_CPU + case CPU_OFFLINE: + kthread_destroy(cpu_rq(cpu)->migration_thread); + break; + case CPU_DEAD: { + runqueue_t *rq = this_rq_lock(); + runqueue_t *old_rq = cpu_rq(cpu); + rq->nr_uninterruptible += old_rq->nr_uninterruptible; + old_rq->nr_uninterruptible = 0; + BUG_ON(old_rq->nr_running != 0); + BUG_ON(atomic_read(&old_rq->nr_iowait) != 0); + rq_unlock(rq); break; } +#endif + } return NOTIFY_OK; } -static struct notifier_block migration_notifier = { &migration_call, NULL, 0 }; +/* Want this before the other threads, so they can use set_cpus_allowed. */ +static struct notifier_block __devinitdata migration_notifier = { + .notifier_call = migration_call, + .priority = -1, +}; __init int migration_init(void) { + void *cpu = (void *)(long)smp_processor_id(); /* Start one for boot CPU. */ - migration_call(&migration_notifier, CPU_ONLINE, - (void *)(long)smp_processor_id()); + migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); return 0; } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/kernel/softirq.c .17007-linux-2.6.0-test5-bk11.updated/kernel/softirq.c --- .17007-linux-2.6.0-test5-bk11/kernel/softirq.c 2003-09-25 09:56:39.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/kernel/softirq.c 2003-09-25 17:58:45.000000000 +1000 @@ -14,6 +14,7 @@ #include #include #include +#include /* - No shared variables, all the data are CPU local. 
@@ -319,63 +320,87 @@ void __init softirq_init(void) register_cpu_notifier(&tasklet_nb); } -static int ksoftirqd(void * __bind_cpu) +static int ksoftirqd_init(void *__bind_cpu) { - int cpu = (int) (long) __bind_cpu; + unsigned int cpu = (long) __bind_cpu; - daemonize("ksoftirqd/%d", cpu); + BUG_ON(smp_processor_id() != cpu); set_user_nice(current, 19); current->flags |= PF_IOTHREAD; + return 0; +} - /* Migrate to the right CPU */ - set_cpus_allowed(current, cpumask_of_cpu(cpu)); - BUG_ON(smp_processor_id() != cpu); - - __set_current_state(TASK_INTERRUPTIBLE); - mb(); - - __get_cpu_var(ksoftirqd) = current; - - for (;;) { - if (!local_softirq_pending()) - schedule(); - - __set_current_state(TASK_RUNNING); - - while (local_softirq_pending()) { - do_softirq(); - cond_resched(); - } - - __set_current_state(TASK_INTERRUPTIBLE); +static int ksoftirqd(void *__bind_cpu) +{ +again: + set_current_state(TASK_INTERRUPTIBLE); + if (local_softirq_pending()) { + current->state = TASK_RUNNING; + do_softirq(); + cond_resched(); + goto again; } + return 0; } static int __devinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int hotcpu = (unsigned long)hcpu; - - if (action == CPU_ONLINE) { - if (kernel_thread(ksoftirqd, hcpu, CLONE_KERNEL) < 0) { - printk("ksoftirqd for %i failed\n", hotcpu); - return NOTIFY_BAD; + unsigned int hotcpu = (unsigned long)hcpu; + int ret = NOTIFY_OK; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + p = kthread_create(ksoftirqd_init, ksoftirqd, hcpu, + "ksoftirqd/%d", hotcpu); + if (IS_ERR(p)) { + printk("ksoftirqd for %u failed\n", hotcpu); + ret = NOTIFY_BAD; + break; } + per_cpu(ksoftirqd, hotcpu) = p; + break; + case CPU_ONLINE: + set_cpus_allowed(per_cpu(ksoftirqd, hotcpu), + cpumask_of_cpu(hotcpu)); + break; + case CPU_UP_CANCELED: + kthread_destroy(per_cpu(ksoftirqd, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_OFFLINE: + kthread_destroy(per_cpu(ksoftirqd, hotcpu)); + per_cpu(ksoftirqd, hotcpu) = NULL; + case CPU_DEAD: { + struct tasklet_struct *i, *next; - while (!per_cpu(ksoftirqd, hotcpu)) - yield(); - } - return NOTIFY_OK; + /* Pull pending softirqs from dead CPU to us. 
*/ + local_irq_disable(); + for (i = per_cpu(tasklet_vec, hotcpu).list; i; i = next) { + next = i->next; + __tasklet_schedule(i); + } + for (i = per_cpu(tasklet_hi_vec, hotcpu).list; i; i = next) { + next = i->next; + __tasklet_hi_schedule(i); + } + local_irq_enable(); + break; + } +#endif /* CONFIG_HOTPLUG_CPU */ + } + return ret; } -static struct notifier_block __devinitdata cpu_nfb = { - .notifier_call = cpu_callback -}; +static struct notifier_block __devinitdata cpu_nfb = { cpu_callback }; __init int spawn_ksoftirqd(void) { - cpu_callback(&cpu_nfb, CPU_ONLINE, (void *)(long)smp_processor_id()); + void *cpu = (void *)(long)smp_processor_id(); + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); return 0; } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/kernel/timer.c .17007-linux-2.6.0-test5-bk11.updated/kernel/timer.c --- .17007-linux-2.6.0-test5-bk11/kernel/timer.c 2003-09-22 10:28:13.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/kernel/timer.c 2003-09-25 14:54:34.000000000 +1000 @@ -1206,7 +1206,41 @@ static void __devinit init_timers_cpu(in base->timer_jiffies = jiffies; } + +#ifdef CONFIG_HOTPLUG_CPU +static void __devinit migrate_timers(int cpu) +{ + unsigned long flags; + tvec_base_t *base; + struct list_head *head; + struct timer_list *timer; + int index; + + BUG_ON(cpu_online(cpu)); + base = &per_cpu(tvec_bases, cpu); + spin_lock_irqsave(&base->lock, flags); + index = base->timer_jiffies & TVR_MASK; + + if (!index && + (!cascade(base, &base->tv2, INDEX(0))) && + (!cascade(base, &base->tv3, INDEX(1))) && + !cascade(base, &base->tv4, INDEX(2))) + cascade(base, &base->tv5, INDEX(3)); + +repeat: + head = base->tv1.vec + index; + if (!list_empty(head)) { + timer = list_entry(head->next, struct timer_list, entry); + spin_unlock_irqrestore(&base->lock, flags); + mod_timer(timer, timer->expires + 1); + spin_lock_irqsave(&base->lock, flags); + goto repeat; + } + spin_unlock_irqrestore(&base->lock, flags); +} +#endif /* CONFIG_HOTPLUG_CPU */ + static int __devinit timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1215,6 +1249,11 @@ static int __devinit timer_cpu_notify(st case CPU_UP_PREPARE: init_timers_cpu(cpu); break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + migrate_timers(cpu); + break; +#endif default: break; } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/kernel/workqueue.c .17007-linux-2.6.0-test5-bk11.updated/kernel/workqueue.c --- .17007-linux-2.6.0-test5-bk11/kernel/workqueue.c 2003-09-22 10:27:38.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/kernel/workqueue.c 2003-09-25 17:58:37.000000000 +1000 @@ -25,6 +25,10 @@ #include #include #include +#include +#include +#include +#include /* * The per-CPU workqueue. @@ -43,13 +47,10 @@ struct cpu_workqueue_struct { long insert_sequence; /* Next to add */ struct list_head worklist; - wait_queue_head_t more_work; + struct task_struct *worker; wait_queue_head_t work_done; struct workqueue_struct *wq; - task_t *thread; - struct completion exit; - } ____cacheline_aligned; /* @@ -58,8 +59,13 @@ struct cpu_workqueue_struct { */ struct workqueue_struct { struct cpu_workqueue_struct cpu_wq[NR_CPUS]; + struct list_head list; + const char *name; }; +/* All the workqueues on the system: protected by cpucontrol mutex. 
*/ +static LIST_HEAD(workqueues); + /* * Queue work on a workqueue. Return non-zero if it was successfully * added. @@ -80,7 +86,7 @@ int queue_work(struct workqueue_struct * spin_lock_irqsave(&cwq->lock, flags); list_add_tail(&work->entry, &cwq->worklist); cwq->insert_sequence++; - wake_up(&cwq->more_work); + wake_up_process(cwq->worker); spin_unlock_irqrestore(&cwq->lock, flags); ret = 1; } @@ -101,7 +107,7 @@ static void delayed_work_timer_fn(unsign spin_lock_irqsave(&cwq->lock, flags); list_add_tail(&work->entry, &cwq->worklist); cwq->insert_sequence++; - wake_up(&cwq->more_work); + wake_up_process(cwq->worker); spin_unlock_irqrestore(&cwq->lock, flags); } @@ -151,67 +157,51 @@ static inline void run_workqueue(struct spin_lock_irqsave(&cwq->lock, flags); cwq->remove_sequence++; - wake_up(&cwq->work_done); + wake_up_process(cwq->worker); } spin_unlock_irqrestore(&cwq->lock, flags); } -typedef struct startup_s { - struct cpu_workqueue_struct *cwq; - struct completion done; - const char *name; -} startup_t; - -static int worker_thread(void *__startup) +static int worker_thread_init(void *__cwq) { - startup_t *startup = __startup; - struct cpu_workqueue_struct *cwq = startup->cwq; - int cpu = cwq - cwq->wq->cpu_wq; - DECLARE_WAITQUEUE(wait, current); struct k_sigaction sa; + struct cpu_workqueue_struct *cwq = __cwq; + int cpu = cwq - cwq->wq->cpu_wq; + + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + BUG_ON(smp_processor_id() != cpu); - daemonize("%s/%d", startup->name, cpu); allow_signal(SIGCHLD); current->flags |= PF_IOTHREAD; - cwq->thread = current; - set_user_nice(current, -10); - set_cpus_allowed(current, cpumask_of_cpu(cpu)); - - complete(&startup->done); /* Install a handler so SIGCLD is delivered */ sa.sa.sa_handler = SIG_IGN; sa.sa.sa_flags = 0; siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); + return 0; +} - for (;;) { - set_task_state(current, TASK_INTERRUPTIBLE); - - add_wait_queue(&cwq->more_work, &wait); - if (!cwq->thread) - break; - if (list_empty(&cwq->worklist)) - schedule(); - else - set_task_state(current, TASK_RUNNING); - remove_wait_queue(&cwq->more_work, &wait); - - if (!list_empty(&cwq->worklist)) - run_workqueue(cwq); +static int worker_thread(void *__cwq) +{ + struct cpu_workqueue_struct *cwq = __cwq; - if (signal_pending(current)) { - while (waitpid(-1, NULL, __WALL|WNOHANG) > 0) - /* SIGCHLD - auto-reaping */ ; +again: + set_current_state(TASK_INTERRUPTIBLE); + if (signal_pending(current)) { + while (waitpid(-1, NULL, __WALL|WNOHANG) > 0) + /* SIGCHLD - auto-reaping */ ; - /* zap all other signals */ - flush_signals(current); - } + /* zap all other signals */ + flush_signals(current); } - remove_wait_queue(&cwq->more_work, &wait); - complete(&cwq->exit); + if (!list_empty(&cwq->worklist)) { + current->state = TASK_RUNNING; + run_workqueue(cwq); + goto again; + } return 0; } @@ -236,6 +226,7 @@ void flush_workqueue(struct workqueue_st might_sleep(); + down(&cpucontrol); for (cpu = 0; cpu < NR_CPUS; cpu++) { DEFINE_WAIT(wait); long sequence_needed; @@ -257,41 +248,34 @@ void flush_workqueue(struct workqueue_st finish_wait(&cwq->work_done, &wait); spin_unlock_irq(&cwq->lock); } + up(&cpucontrol); } -static int create_workqueue_thread(struct workqueue_struct *wq, - const char *name, - int cpu) +static struct task_struct * +create_workqueue_thread(struct workqueue_struct *wq, int cpu) { - startup_t startup; struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; - int ret; + struct task_struct *p; 
spin_lock_init(&cwq->lock); cwq->wq = wq; - cwq->thread = NULL; + cwq->worker = NULL; cwq->insert_sequence = 0; cwq->remove_sequence = 0; INIT_LIST_HEAD(&cwq->worklist); - init_waitqueue_head(&cwq->more_work); init_waitqueue_head(&cwq->work_done); - init_completion(&cwq->exit); - - init_completion(&startup.done); - startup.cwq = cwq; - startup.name = name; - ret = kernel_thread(worker_thread, &startup, CLONE_FS | CLONE_FILES); - if (ret >= 0) { - wait_for_completion(&startup.done); - BUG_ON(!cwq->thread); - } - return ret; + p = kthread_create(worker_thread_init, worker_thread, cwq, + "%s/%d", wq->name, cpu); + if (!IS_ERR(p)) + cwq->worker = p; + return p; } struct workqueue_struct *create_workqueue(const char *name) { int cpu, destroy = 0; struct workqueue_struct *wq; + struct task_struct *p; BUG_ON(strlen(name) > 10); @@ -299,12 +283,21 @@ struct workqueue_struct *create_workqueu if (!wq) return NULL; + /* We don't need the distraction of CPUs appearing and vanishing. */ + down(&cpucontrol); + wq->name = name; for (cpu = 0; cpu < NR_CPUS; cpu++) { if (!cpu_online(cpu)) continue; - if (create_workqueue_thread(wq, name, cpu) < 0) + p = create_workqueue_thread(wq, cpu); + if (IS_ERR(p)) destroy = 1; + else + kthread_start(p); } + + list_add(&wq->list, &workqueues); + /* * Was there any error during startup? If yes then clean up: */ @@ -312,6 +305,7 @@ struct workqueue_struct *create_workqueu destroy_workqueue(wq); wq = NULL; } + up(&cpucontrol); return wq; } @@ -320,13 +314,8 @@ static void cleanup_workqueue_thread(str struct cpu_workqueue_struct *cwq; cwq = wq->cpu_wq + cpu; - if (cwq->thread) { - /* Tell thread to exit and wait for it. */ - cwq->thread = NULL; - wake_up(&cwq->more_work); - - wait_for_completion(&cwq->exit); - } + if (cwq->worker) + kthread_destroy(cwq->worker); } void destroy_workqueue(struct workqueue_struct *wq) @@ -335,10 +324,14 @@ void destroy_workqueue(struct workqueue_ flush_workqueue(wq); + /* We don't need the distraction of CPUs appearing and vanishing. */ + down(&cpucontrol); for (cpu = 0; cpu < NR_CPUS; cpu++) { if (cpu_online(cpu)) cleanup_workqueue_thread(wq, cpu); } + up(&cpucontrol); + list_del(&wq->list); kfree(wq); } @@ -370,14 +363,58 @@ int current_is_keventd(void) if (!cpu_online(cpu)) continue; cwq = keventd_wq->cpu_wq + cpu; - if (current == cwq->thread) + if (current == cwq->worker) return 1; } return 0; } +#ifdef CONFIG_HOTPLUG_CPU +/* We're holding the cpucontrol mutex here */ +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int hotcpu = (unsigned long)hcpu; + struct workqueue_struct *wq; + + switch (action) { + case CPU_UP_PREPARE: + /* Create a new workqueue thread for it. */ + list_for_each_entry(wq, &workqueues, list) { + if (create_workqueue_thread(wq, hotcpu) < 0) { + printk("workqueue for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + } + break; + + case CPU_ONLINE: + /* Start the thread we created for it. 
*/ + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; + kthread_start(cwq->worker); + } + break; + + case CPU_UP_CANCELED: + case CPU_OFFLINE: + list_for_each_entry(wq, &workqueues, list) + cleanup_workqueue_thread(wq, hotcpu); + return NOTIFY_OK; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpu_nfb = { &cpu_callback, NULL, 0 }; +#endif + void init_workqueues(void) { +#ifdef CONFIG_HOTPLUG_CPU + register_cpu_notifier(&cpu_nfb); +#endif keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/mm/slab.c .17007-linux-2.6.0-test5-bk11.updated/mm/slab.c --- .17007-linux-2.6.0-test5-bk11/mm/slab.c 2003-09-25 09:56:39.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/mm/slab.c 2003-09-25 16:49:58.000000000 +1000 @@ -519,9 +519,19 @@ enum { static DEFINE_PER_CPU(struct timer_list, reap_timers); static void reap_timer_fnc(unsigned long data); - +static void free_block (kmem_cache_t* cachep, void** objpp, int len); static void enable_cpucache (kmem_cache_t *cachep); +static inline void ** ac_entry(struct array_cache *ac) +{ + return (void**)(ac+1); +} + +static inline struct array_cache *ac_data(kmem_cache_t *cachep) +{ + return cachep->array[smp_processor_id()]; +} + /* Cal the num objs, wastage, and bytes left over for a given slab size. */ static void cache_estimate (unsigned long gfporder, size_t size, int flags, size_t *left_over, unsigned int *num) @@ -576,27 +586,34 @@ static void start_cpu_timer(int cpu) } } -/* - * Note: if someone calls kmem_cache_alloc() on the new - * cpu before the cpuup callback had a chance to allocate - * the head arrays, it will oops. - * Is CPU_ONLINE early enough? - */ +#ifdef CONFIG_HOTPLUG_CPU +static void stop_cpu_timer(int cpu) +{ + struct timer_list *rt = &per_cpu(reap_timers, cpu); + + if (rt->function) { + del_timer(rt); + synchronize_kernel(); + WARN_ON(timer_pending(rt)); + rt->function = NULL; + } +} +#endif + static int __devinit cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; - struct list_head *p; + kmem_cache_t* cachep; switch (action) { case CPU_UP_PREPARE: down(&cache_chain_sem); - list_for_each(p, &cache_chain) { + list_for_each_entry(cachep, &cache_chain, next) { int memsize; struct array_cache *nc; - kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next); memsize = sizeof(void*)*cachep->limit+sizeof(struct array_cache); nc = kmalloc(memsize, GFP_KERNEL); if (!nc) @@ -616,18 +633,28 @@ static int __devinit cpuup_callback(stru up(&cache_chain_sem); break; case CPU_ONLINE: - if (g_cpucache_up == FULL) - start_cpu_timer(cpu); + start_cpu_timer(cpu); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_OFFLINE: + stop_cpu_timer(cpu); break; +#endif /* CONFIG_HOTPLUG_CPU */ + case CPU_UP_CANCELED: + case CPU_DEAD: down(&cache_chain_sem); - - list_for_each(p, &cache_chain) { + list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; - kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next); + spin_lock_irq(&cachep->spinlock); + /* cpu is dead; no one can alloc from it. 
*/ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; + cachep->free_limit -= cachep->batchcount; + free_block(cachep, ac_entry(nc), nc->avail); + spin_unlock_irq(&cachep->spinlock); kfree(nc); } up(&cache_chain_sem); @@ -641,16 +668,6 @@ bad: static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; -static inline void ** ac_entry(struct array_cache *ac) -{ - return (void**)(ac+1); -} - -static inline struct array_cache *ac_data(kmem_cache_t *cachep) -{ - return cachep->array[smp_processor_id()]; -} - /* Initialisation. * Called after the gfp() functions have been enabled, and before smp_init(). */ @@ -1316,7 +1333,6 @@ static void smp_call_function_all_cpus(v preempt_enable(); } -static void free_block (kmem_cache_t* cachep, void** objpp, int len); static void drain_array_locked(kmem_cache_t* cachep, struct array_cache *ac, int force); @@ -1435,6 +1451,9 @@ int kmem_cache_destroy (kmem_cache_t * c return 1; } + /* no cpu_online check required here since we clear the percpu + * array on cpu offline and set this to NULL. + */ for (i = 0; i < NR_CPUS; i++) kfree(cachep->array[i]); diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/mm/swap.c .17007-linux-2.6.0-test5-bk11.updated/mm/swap.c --- .17007-linux-2.6.0-test5-bk11/mm/swap.c 2003-09-22 10:23:17.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/mm/swap.c 2003-09-25 14:54:34.000000000 +1000 @@ -24,6 +24,9 @@ #include #include /* for try_to_release_page() */ #include +#include +#include +#include /* How many pages do we try to swap or page in/out together? */ int page_cluster; @@ -372,7 +375,33 @@ void vm_acct_memory(long pages) preempt_enable(); } EXPORT_SYMBOL(vm_acct_memory); -#endif + +#ifdef CONFIG_HOTPLUG_CPU +/* Drop the CPU's cached committed space back into the central pool. */ +static int __devinit cpu_swap_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + long *committed; + + committed = &per_cpu(committed_space, (long)hcpu); + if (action == CPU_DEAD) { + atomic_add(*committed, &vm_committed_space); + *committed = 0; + } + return NOTIFY_OK; +} + +static struct notifier_block cpu_swap_notifier __devinitdata += { &cpu_swap_callback }; + +static int __init cpu_swap_setup(void) +{ + return register_cpu_notifier(&cpu_swap_notifier); +} +__initcall(cpu_swap_setup); +#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_SMP */ /* diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/mm/vmscan.c .17007-linux-2.6.0-test5-bk11.updated/mm/vmscan.c --- .17007-linux-2.6.0-test5-bk11/mm/vmscan.c 2003-09-22 10:28:13.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/mm/vmscan.c 2003-09-25 14:54:34.000000000 +1000 @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include #include @@ -1087,6 +1089,47 @@ int shrink_all_memory(int nr_pages) } #endif +#ifdef CONFIG_HOTPLUG_CPU +/* It's optimal to keep kswapds on the same CPUs as their memory, but + not required for correctness. So if the last cpu in a node goes + away, let them run anywhere, and as the first one comes back, + restore their cpu bindings. */ +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + pg_data_t *pgdat; + unsigned int hotcpu = (unsigned long)hcpu; + cpumask_t mask; + + if (action == CPU_OFFLINE) { + /* Make sure that kswapd never becomes unschedulable. 
*/ + for_each_pgdat(pgdat) { + mask = node_to_cpumask(pgdat->node_id); + if (any_online_cpu(mask) == NR_CPUS) { + cpus_complement(mask); + set_cpus_allowed(pgdat->kswapd, mask); + } + } + } + + if (action == CPU_ONLINE) { + for_each_pgdat(pgdat) { + mask = node_to_cpumask(pgdat->node_id); + cpu_clear(hotcpu, mask); + if (any_online_cpu(mask) == NR_CPUS) { + cpu_set(hotcpu, mask); + /* One of our CPUs came back: restore mask */ + set_cpus_allowed(pgdat->kswapd, mask); + } + } + } + return NOTIFY_OK; +} + +static struct notifier_block cpu_nfb = { &cpu_callback, NULL, 0 }; +#endif /* CONFIG_HOTPLUG_CPU */ + static int __init kswapd_init(void) { pg_data_t *pgdat; @@ -1094,6 +1137,9 @@ static int __init kswapd_init(void) for_each_pgdat(pgdat) kernel_thread(kswapd, pgdat, CLONE_KERNEL); total_memory = nr_free_pagecache_pages(); +#ifdef CONFIG_HOTPLUG_CPU + register_cpu_notifier(&cpu_nfb); +#endif return 0; } diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .17007-linux-2.6.0-test5-bk11/net/core/dev.c .17007-linux-2.6.0-test5-bk11.updated/net/core/dev.c --- .17007-linux-2.6.0-test5-bk11/net/core/dev.c 2003-09-25 09:56:40.000000000 +1000 +++ .17007-linux-2.6.0-test5-bk11.updated/net/core/dev.c 2003-09-25 16:49:09.000000000 +1000 @@ -105,6 +105,7 @@ #include #include #include +#include #ifdef CONFIG_NET_RADIO #include /* Note : will define WIRELESS_EXT */ #include @@ -3039,3 +3040,57 @@ out: } subsys_initcall(net_dev_init); + +#ifdef CONFIG_HOTPLUG_CPU +static int dev_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *ocpu) +{ + struct sk_buff **list_skb; + struct net_device **list_net; + struct sk_buff *skb; + unsigned int cpu, oldcpu = (unsigned long)ocpu; + struct softnet_data *sd, *oldsd; + + if (action != CPU_OFFLINE) + return NOTIFY_OK; + + local_irq_disable(); + cpu = smp_processor_id(); + sd = &per_cpu(softnet_data, cpu); + oldsd = &per_cpu(softnet_data, oldcpu); + + /* Find end of our completion_queue. */ + list_skb = &sd->completion_queue; + while (*list_skb) + list_skb = &(*list_skb)->next; + /* Append completion queue from offline CPU. */ + *list_skb = oldsd->completion_queue; + oldsd->completion_queue = NULL; + + /* Find end of our output_queue. */ + list_net = &sd->output_queue; + while (*list_net) + list_net = &(*list_net)->next; + /* Append output queue from offline CPU. */ + *list_net = oldsd->output_queue; + oldsd->output_queue = NULL; + local_irq_enable(); + + /* Process offline CPU's input_pkt_queue */ + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) + netif_rx(skb); + + return NOTIFY_OK; +} + +static struct notifier_block cpu_callback_nfb = {&dev_cpu_callback, NULL, 0 }; + +static int __init dev_cpu_callback_init(void) +{ + register_cpu_notifier(&cpu_callback_nfb); + return 0; +} + +__initcall(dev_cpu_callback_init); +#endif /* CONFIG_HOTPLUG_CPU */
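
Below is a minimal, illustrative sketch (NOT part of the patch) of how a subsystem with
per-cpu state would use the notifier events introduced above, following the same pattern
as the fs/buffer.c and mm/swap.c callbacks; the example_count, example_total and
example_cpu_notify names are hypothetical.

/*
 * Illustrative only -- not part of this patch.  A hypothetical subsystem
 * keeps a per-cpu cached count and spills it into a global pool when the
 * CPU goes away, in the style of the mm/swap.c callback above.
 */
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/percpu.h>
#include <asm/atomic.h>

static DEFINE_PER_CPU(long, example_count);		/* per-cpu cache (hypothetical) */
static atomic_t example_total = ATOMIC_INIT(0);		/* global pool (hypothetical) */

static int __devinit example_cpu_notify(struct notifier_block *self,
					unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_UP_PREPARE:
		/* Set up per-cpu state before the CPU starts scheduling.
		 * Returning NOTIFY_BAD here would NAK the bring-up. */
		per_cpu(example_count, cpu) = 0;
		break;
	case CPU_UP_CANCELED:
	case CPU_DEAD:
		/* CPU never came up, or is completely gone: spill its
		 * cached count back into the global pool. */
		atomic_add(per_cpu(example_count, cpu), &example_total);
		per_cpu(example_count, cpu) = 0;
		break;
	case CPU_OFFLINE:
		/* CPU is no longer online but still scheduling: a subsystem
		 * with a kernel thread bound to it would stop it here. */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_cpu_nfb = { &example_cpu_notify, NULL, 0 };

static int __init example_hotplug_init(void)
{
	return register_cpu_notifier(&example_cpu_nfb);
}
__initcall(example_hotplug_init);

With CONFIG_HOTPLUG_CPU=y the whole sequence can then be driven from userspace through
the new sysfs attribute, e.g. "echo 0 > /sys/devices/system/cpu/cpu1/online" to invoke
cpu_down(1) and "echo 1" to bring it back; /sbin/hotplug is run with CPU=<n> and
ACTION=offline as the CPU goes away.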