Name: Rewritten Hot-plug CPU Core Infrastructure
Author: Rusty Russell
Status: Tested on 2.5.15
Depends: Hotcpu/init-removal.patch.gz Hotcpu/hotcpu-boot.patch.gz Hotcpu/hotcpu-boot-i386.patch.gz Hotcpu/hotcpu-primitive-ia64.patch.gz Hotcpu/hotcpu-primitive-ppc.patch.gz
D: This is the most recent patch adding the infrastructure for adding
D: and removing CPUs in a running kernel.

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.20.3071/include/linux/notifier.h linux-2.5.20.3071.updated/include/linux/notifier.h
--- linux-2.5.20.3071/include/linux/notifier.h Tue Jun 4 21:29:06 2002
+++ linux-2.5.20.3071.updated/include/linux/notifier.h Tue Jun 4 21:29:39 2002
@@ -58,6 +58,7 @@
 #define SYS_HALT        0x0002  /* Notify of system halt */
 #define SYS_POWER_OFF   0x0003  /* Notify of system power off */
 
+#define CPU_OFFLINE     0x0001  /* CPU (unsigned)v going down */
 #define CPU_ONLINE      0x0002  /* CPU (unsigned)v coming up */
 
 #endif /* __KERNEL__ */
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.20.3071/include/linux/sched.h linux-2.5.20.3071.updated/include/linux/sched.h
--- linux-2.5.20.3071/include/linux/sched.h Tue Jun 4 21:29:12 2002
+++ linux-2.5.20.3071.updated/include/linux/sched.h Tue Jun 4 21:29:39 2002
@@ -411,6 +411,9 @@
 #if CONFIG_SMP
 extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
+#ifdef CONFIG_HOTPLUG
+extern int migrate_all_tasks(int cpu);
+#endif
 #else
 # define set_cpus_allowed(p, new_mask) do { } while (0)
 #endif
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.20.3071/include/linux/smp.h linux-2.5.20.3071.updated/include/linux/smp.h
--- linux-2.5.20.3071/include/linux/smp.h Tue Jun 4 21:29:06 2002
+++ linux-2.5.20.3071.updated/include/linux/smp.h Tue Jun 4 21:29:39 2002
@@ -72,7 +72,8 @@
 extern int register_cpu_notifier(struct notifier_block *nb);
 extern void unregister_cpu_notifier(struct notifier_block *nb);
 
-/* Bring a CPU up */
+/* Bring a CPU down/up */
+int cpu_down(unsigned int cpu);
 int cpu_up(unsigned int cpu);
 
 #else /* !SMP */
@@ -98,6 +99,10 @@
 /* Need to know about CPUs going up/down? */
 #define register_cpu_notifier(nb)      0
 #define unregister_cpu_notifier(nb)    do { } while(0)
+
+/* Bring a CPU down/up */
+static inline int cpu_down(unsigned int cpu) { return -EBUSY; }
+static inline int cpu_up(unsigned int cpu) { return -ENOSYS; }
 
 #endif /* !SMP */
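For reference, a consumer of this notifier interface looks like the
net/core/dev.c hook at the end of this patch; in outline (an
illustrative sketch only, 'foo' is a made-up subsystem):

#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/init.h>

static int foo_cpu_callback(struct notifier_block *nfb,
                            unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;

        switch (action) {
        case CPU_ONLINE:
                /* set up per-cpu state for 'cpu' (it is running now) */
                break;
        case CPU_OFFLINE:
                /* drain/tear down per-cpu state: 'cpu' runs nothing */
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block foo_cpu_nfb = { &foo_cpu_callback, NULL, 0 };

static int __init foo_hotcpu_init(void)
{
        return register_cpu_notifier(&foo_cpu_nfb);
}

__initcall(foo_hotcpu_init);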
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.20.3071/kernel/cpu.c linux-2.5.20.3071.updated/kernel/cpu.c
--- linux-2.5.20.3071/kernel/cpu.c Tue Jun 4 21:29:06 2002
+++ linux-2.5.20.3071.updated/kernel/cpu.c Tue Jun 4 21:29:39 2002
@@ -2,22 +2,56 @@
  * (C) 2001 Rusty Russell
  * This code is licenced under the GPL.
  */
+#define __KERNEL_SYSCALLS__    /* for waitpid() */
 #include
 #include
 #include
 #include
 #include
+#include <linux/kmod.h>        /* for hotplug_path */
 #include
 #include
+#include
 
 /* This protects CPUs going up and down... */
 DECLARE_MUTEX(cpucontrol);
 
 static struct notifier_block *cpu_chain = NULL;
 
+#ifdef CONFIG_HOTPLUG
+/* Notify userspace when a cpu event occurs, by running '/sbin/hotplug
+ * cpu' with certain environment variables set. */
+static int cpu_run_sbin_hotplug(unsigned int cpu, const char *action)
+{
+        char *argv[3], *envp[5], cpu_str[12], action_str[32];
+        int i;
+
+        sprintf(cpu_str, "CPU=%d", cpu);
+        sprintf(action_str, "ACTION=%s", action);
+
+        i = 0;
+        argv[i++] = hotplug_path;
+        argv[i++] = "cpu";
+        argv[i] = NULL;
+
+        i = 0;
+        /* minimal command environment */
+        envp [i++] = "HOME=/";
+        envp [i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+        envp [i++] = cpu_str;
+        envp [i++] = action_str;
+        envp [i] = NULL;
+
+        return call_usermodehelper(argv [0], argv, envp);
+}
+#else
+#define cpu_run_sbin_hotplug(cpu, action) 0
+#endif
+
 /* Need to know about CPUs going up/down? */
 int register_cpu_notifier(struct notifier_block *nb)
 {
+        printk("REGISTER %p\n", nb->notifier_call);
         return notifier_chain_register(&cpu_chain, nb);
 }
@@ -26,7 +60,7 @@
         notifier_chain_unregister(&cpu_chain,nb);
 }
 
-int __init cpu_up(unsigned int cpu)
+int __devinit cpu_up(unsigned int cpu)
 {
         int ret;
 
@@ -57,8 +91,170 @@
         schedule_timeout(HZ);
 #endif
 
+#if 0 /* FIXME: Don't do this during boot. --RR */
+        cpu_run_sbin_hotplug(cpu, "add");
+#endif
 out:
         printk("UP %u done: %i.\n", cpu, ret);
         up(&cpucontrol);
         return ret;
 }
+
+#ifdef CONFIG_HOTPLUG
+int cpu_down(unsigned int cpu)
+{
+        int ret;
+
+        if ((ret = down_interruptible(&cpucontrol)) != 0)
+                return ret;
+
+        if (!cpu_online(cpu)) {
+                ret = -EINVAL;
+                goto out;
+        }
+
+        if (num_online_cpus() == 1) {
+                ret = -EBUSY;
+                goto out;
+        }
+
+        /* Schedule ourselves on the dying CPU. */
+        printk("%s:%u\n", __FILE__, __LINE__);
+        set_cpus_allowed(current, (1 << cpu));
+
+        printk("%s:%u\n", __FILE__, __LINE__);
+        preempt_disable();
+
+        /* Disable CPU. */
+        printk("%s:%u\n", __FILE__, __LINE__);
+        ret = __cpu_disable();
+        printk("%s:%u\n", __FILE__, __LINE__);
+        if (ret != 0) {
+                printk("CPU disable failed: %i\n", ret);
+                goto preempt_out;
+        }
+        printk("%s:%u\n", __FILE__, __LINE__);
+        BUG_ON(cpu_online(cpu));
+
+        /* Move other tasks off to other CPUs (simple since they are
+           not running now). */
+        printk("%s:%u\n", __FILE__, __LINE__);
+        ret = migrate_all_tasks(cpu);
+        printk("%s:%u\n", __FILE__, __LINE__);
+        if (ret != 0) {
+                printk("Moving tasks failed: %i\n", ret);
+                goto preempt_out;
+        }
+
+        /* Move off dying CPU, which will revert to idle process. */
+        printk("%s:%u\n", __FILE__, __LINE__);
+        set_cpus_allowed(current, ~(1 << cpu));
+        printk("%s:%u\n", __FILE__, __LINE__);
+        preempt_enable();
+
+        /* CPU has been disabled: tell everyone */
+        printk("CPU chain = %p\n", cpu_chain);
+        notifier_call_chain(&cpu_chain, CPU_OFFLINE, (void *)cpu);
+
+        /* This makes sure that no one has the cpu bitmask
+           etc. cached. */
+#if 0
+        sync_kernel();
+#else
+        schedule_timeout(HZ);
+#endif
+
+        /* Die, CPU, die! */
+        __cpu_die(cpu);
+
+        cpu_run_sbin_hotplug(cpu, "remove");
+        up(&cpucontrol);
+        return ret;
+
+ preempt_out:
+        preempt_enable();
+ out:
+        up(&cpucontrol);
+        return ret;
+}
+#else
+int cpu_down(unsigned int cpu)
+{
+        return -ENOSYS;
+}
+#endif /*CONFIG_HOTPLUG*/
+
+/* There may be worse design braindamage than sysctl inside the
+   kernel, but if there is I haven't found it yet.  I'm speechless.
+
+   I give up, and am using proc directly, which is a little
+   better. --RR */
+int read_online(char *page, char **start, off_t off,
+                int count, int *eof, void *data)
+{
+        unsigned int cpu = (unsigned int)data;
+        char *p = page;
+        int len;
+
+        p += sprintf(p, "%u\n", cpu_online(cpu) ? 1 : 0);
+        len = (p - page) - off;
+        if (len < 0)
+                len = 0;
+
+        *eof = (len <= count) ? 1 : 0;
+        *start = page + off;
+
+        return len;
+}
+
+int write_online(struct file *file, const char *buffer,
+                 unsigned long count, void *data)
+{
+        unsigned int cpu = (unsigned int)data;
+        char input;
+        int ret = -EINVAL;
+
+        if (strncpy_from_user(&input, buffer, 1) < 0)
+                return -EFAULT;
+
+        if (input == '0' && (ret = cpu_down(cpu)) == 0)
+                ret = count;
+        if (input == '1' && (ret = cpu_up(cpu)) == 0)
+                ret = count;
+
+        return ret;
+}
+
+static void __init create_entries(struct proc_dir_entry *parent,
+                                  unsigned int cpu)
+{
+        struct proc_dir_entry *e;
+
+        e = create_proc_entry("online", 0644, parent);
+        if (e == NULL)
+                return;
+        e->data = (void *)cpu;
+        e->read_proc = &read_online;
+        e->write_proc = &write_online;
+}
+
+static int __init create_per_cpu_entries(void)
+{
+        unsigned int i;
+        struct proc_dir_entry *cpudir, *dir;
+
+        cpudir = proc_mkdir("sys/cpu", NULL);
+        for (i = 0; i < NR_CPUS; i++) {
+                char cpuname[20];
+
+                if (cpu_possible(i)) {
+                        sprintf(cpuname, "%i", i);
+                        dir = proc_mkdir(cpuname, cpudir);
+
+                        create_entries(dir, i);
+                }
+        }
+        return 0;
+}
+
+__initcall(create_per_cpu_entries);
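Once the proc entries above exist, a CPU is taken down or brought back
by writing '0' or '1' to /proc/sys/cpu/<N>/online. A minimal
(hypothetical, illustrative) userspace helper:

/* cpu_online.c: write "0" or "1" to /proc/sys/cpu/<N>/online. */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
        char path[64];
        FILE *f;

        if (argc != 3) {
                fprintf(stderr, "Usage: %s <cpu> <0|1>\n", argv[0]);
                exit(1);
        }
        snprintf(path, sizeof(path), "/proc/sys/cpu/%s/online", argv[1]);
        f = fopen(path, "w");
        if (f == NULL) {
                perror(path);
                exit(1);
        }
        if (fputs(argv[2], f) == EOF || fclose(f) == EOF) {
                perror(path);
                exit(1);
        }
        return 0;
}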
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.20.3071/kernel/sched.c linux-2.5.20.3071.updated/kernel/sched.c
--- linux-2.5.20.3071/kernel/sched.c Tue Jun 4 21:29:12 2002
+++ linux-2.5.20.3071.updated/kernel/sched.c Wed Jun 5 09:27:49 2002
@@ -1718,14 +1718,70 @@
         preempt_enable();
 }
 
-static int migration_thread(void * bind_cpu)
+#ifdef CONFIG_HOTPLUG
+/* Move non-kernel-thread tasks off this (offline) CPU, except us. */
+int migrate_all_tasks(int cpu)
+{
+        task_t *p;
+        int ret = 0;
+        unsigned long new_mask, old_mask;
+        runqueue_t *rq;
+
+mg_retry:
+        read_lock(&tasklist_lock);
+        for_each_task(p) {
+                if (p->thread_info->cpu != cpu)
+                        continue;
+
+                if (p == current)
+                        continue;
+
+                /* Kernel threads which are bound to specific
+                   processors need to look after themselves. */
+                if (p->mm == NULL && p->cpus_allowed != -1)
+                        continue;
+
+                rq = task_rq(p);
+                read_unlock(&tasklist_lock);
+
+                old_mask = p->cpus_allowed;
+                new_mask = p->cpus_allowed & ~(1 << cpu);
+
+                /* FIXME: We need to signal these guys --RR */
+                if (!(new_mask & cpu_online_map)) {
+                        new_mask |= 1 << smp_processor_id();
+                        old_mask |= 1 << smp_processor_id();
+                }
+
+                set_cpus_allowed(p, new_mask);
+
+                /* FIXME: Do we need this? --RR */
+                /* wait until the migration is done */
+                while (p->thread_info->cpu == cpu)
+                        schedule_timeout(2);
+
+                /* don't want to change cpus_allowed for CPU hotplug */
+                p->cpus_allowed = old_mask;
+
+                goto mg_retry;
+        }
+        read_unlock(&tasklist_lock);
+        return ret;
+}
+#endif /* CONFIG_HOTPLUG */
+
+/* This is the CPU to stop, and who to wake about it */
+static int migration_stop = -1;
+static struct completion migration_stopped;
+
+static int migration_thread(void *home_cpu)
 {
-        int cpu = (int) (long) bind_cpu;
         struct sched_param param = { sched_priority: MAX_RT_PRIO-1 };
         runqueue_t *rq;
         int ret;
 
         daemonize();
+        reparent_to_init();
         sigfillset(&current->blocked);
         set_fs(KERNEL_DS);
@@ -1737,7 +1793,7 @@
 
         sprintf(current->comm, "migration_CPU%d", smp_processor_id());
 
-        for (;;) {
+        while (migration_stop != (long)home_cpu) {
                 runqueue_t *rq_src, *rq_dest;
                 struct list_head *head;
                 int cpu_src, cpu_dest;
@@ -1750,6 +1806,7 @@
                 current->state = TASK_INTERRUPTIBLE;
                 if (list_empty(head)) {
                         spin_unlock_irqrestore(&rq->lock, flags);
+                        printk("%s: no one on list\n", current->comm);
                         schedule();
                         continue;
                 }
@@ -1760,6 +1817,8 @@
                 p = req->task;
                 cpu_dest = __ffs(p->cpus_allowed);
                 rq_dest = cpu_rq(cpu_dest);
+                printk("%s: Moving %p (%s) to %u\n",
+                       current->comm, p, p->comm, cpu_dest);
         repeat:
                 cpu_src = p->thread_info->cpu;
                 rq_src = cpu_rq(cpu_src);
@@ -1783,6 +1842,23 @@
 
                 up(&req->sem);
         }
+        current->state = TASK_RUNNING;
+
+        printk("Migration thread for %li exiting\n", (long)home_cpu);
+        rq->migration_thread = NULL;
+        complete(&migration_stopped);
+
+        return 0;
+}
+
+/* No locking required: CPU notifiers are serialized */
+static void stop_migration_thread(unsigned int cpu)
+{
+        init_completion(&migration_stopped);
+        wmb();
+        migration_stop = cpu;
+        wait_for_completion(&migration_stopped);
+        migration_stop = -1;
 }
 
 static __devinit int migration_call(struct notifier_block *nfb,
@@ -1793,6 +1869,9 @@
         case CPU_ONLINE:
                 kernel_thread(migration_thread, hcpu,
                               CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+                break;
+        case CPU_OFFLINE:
+                stop_migration_thread((long)hcpu);
                 break;
         }
         return NOTIFY_OK;
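The migration thread above is stopped with a plain flag-plus-completion
handshake rather than a signal: the stopper publishes the flag and
sleeps on a completion, and the thread notices the flag at the top of
its loop and completes on its way out. The same idiom in isolation
(illustrative only; names are invented):

#include <linux/completion.h>
#include <linux/sched.h>
#include <asm/system.h>

static int thread_stop = -1;            /* which instance should exit */
static struct completion thread_stopped;

static int example_thread(void *home_cpu) /* one instance per cpu */
{
        while (thread_stop != (long)home_cpu) {
                /* ... normal work loop: sleep, handle requests ... */
                schedule();
        }
        /* Publish our death, then wake whoever asked us to stop. */
        complete(&thread_stopped);
        return 0;
}

static void stop_example_thread(unsigned int cpu)
{
        init_completion(&thread_stopped);
        wmb();          /* completion must be ready before flag is seen */
        thread_stop = cpu;
        wait_for_completion(&thread_stopped);
        thread_stop = -1;
}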
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.20.3071/kernel/softirq.c linux-2.5.20.3071.updated/kernel/softirq.c
--- linux-2.5.20.3071/kernel/softirq.c Tue Jun 4 21:29:07 2002
+++ linux-2.5.20.3071.updated/kernel/softirq.c Tue Jun 4 21:29:39 2002
@@ -362,11 +362,20 @@
         }
 }
 
+/* This is the CPU to stop, and who to wake about it */
+static int ksoftirq_stop = -1;
+static struct task_struct *ksoftirq_killer = NULL;
+
 static int ksoftirqd(void * __bind_cpu)
 {
         int cpu = (int) (long) __bind_cpu;
 
+        if (ksoftirqd_task(cpu))
+                BUG();
+
+        sprintf(current->comm, "ksoftirqd_CPU%d", cpu);
         daemonize();
+        reparent_to_init();
         set_user_nice(current, 19);
         current->flags |= PF_IOTHREAD;
         sigfillset(&current->blocked);
@@ -383,7 +392,8 @@
 
         ksoftirqd_task(cpu) = current;
 
-        for (;;) {
+        while (ksoftirq_stop != cpu) {
+                rmb();
                 if (!softirq_pending(cpu))
                         schedule();
@@ -396,11 +406,17 @@
                 __set_current_state(TASK_INTERRUPTIBLE);
         }
+
+        set_current_state(TASK_RUNNING);
+
+        printk("ksoftirqd for %i dying\n", cpu);
+        ksoftirqd_task(cpu) = NULL;
+        wmb();
+        wake_up_process(ksoftirq_killer);
+
+        return 0;
 }
 
-static int __devinit softirq_cpu_callback(struct notifier_block *nfb,
-                                          unsigned long action,
-                                          void *hcpu)
+static int __devinit softirq_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
         int hotcpu = (unsigned long) hcpu;
 
@@ -412,7 +428,67 @@
                 while (!ksoftirqd_task(hotcpu))
                         yield();
                 return NOTIFY_OK;
-        }
+        }
+
+        /* FIXME: Audit this for races --RR */
+        if (action == CPU_OFFLINE) {
+                __u32 pending;
+                unsigned long flags;
+                unsigned int cpu = smp_processor_id();
+                struct tasklet_struct *list, *t;
+                struct task_struct * kd_task;
+                int i;
+
+                local_irq_save(flags);
+
+                pending = softirq_pending(hotcpu);
+                softirq_pending(hotcpu) = 0;
+
+                /* move pending softirqs */
+                for (i = 0; i < 32; i++)
+                        if (pending & (1 << i))
+                                __cpu_raise_softirq(cpu, i);
+
+                /* move pending tasklets */
+                list = tasklet_vec[hotcpu].list;
+                if (list != NULL) {
+                        t = list;
+                        while (list->next != NULL)
+                                list = list->next;
+                        list->next = tasklet_vec[cpu].list;
+                        tasklet_vec[cpu].list = t;
+                        tasklet_vec[hotcpu].list = NULL;
+                }
+
+                list = tasklet_hi_vec[hotcpu].list;
+                if (list != NULL) {
+                        t = list;
+                        while (list->next != NULL)
+                                list = list->next;
+                        list->next = tasklet_hi_vec[cpu].list;
+                        tasklet_hi_vec[cpu].list = t;
+                        tasklet_hi_vec[hotcpu].list = NULL;
+                }
+                local_irq_restore(flags);
+
+                printk("Killing ksoftirqd for %i\n", hotcpu);
+                /* Kill ksoftirqd: get ref in case it exits before we
+                   wake it */
+                ksoftirq_killer = current;
+                kd_task = ksoftirqd_task(hotcpu);
+                get_task_struct(kd_task);
+                set_current_state(TASK_INTERRUPTIBLE);
+                ksoftirq_stop = hotcpu;
+                wake_up_process(kd_task);
+                while (ksoftirqd_task(hotcpu)) {
+                        schedule();
+                        set_current_state(TASK_INTERRUPTIBLE);
+                }
+                set_current_state(TASK_RUNNING);
+                put_task_struct(kd_task);
+                ksoftirq_stop = -1;
+                return NOTIFY_OK;
+        }
+
+        /* We fear change! */
         return NOTIFY_BAD;
 }
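The tasklet lists above, and the softnet queues in the next file, are
drained with the same splice: walk the dead CPU's singly-linked list to
its tail, then chain the live CPU's list behind it so no entries are
dropped. In isolation (illustrative; 'node' is a made-up type standing
in for tasklet_struct, sk_buff or net_device):

struct node {
        struct node *next;
        /* ... payload ... */
};

/* Move all of *src onto the front of *dst, preserving src's order. */
static void splice_lists(struct node **src, struct node **dst)
{
        struct node *head = *src, *tail;

        if (head == NULL)
                return;
        for (tail = head; tail->next != NULL; tail = tail->next)
                ;                       /* find src's tail */
        tail->next = *dst;              /* chain old dst behind it */
        *dst = head;
        *src = NULL;                    /* dead CPU's list is now empty */
}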
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.20.3071/net/core/dev.c linux-2.5.20.3071.updated/net/core/dev.c
--- linux-2.5.20.3071/net/core/dev.c Tue Jun 4 21:29:06 2002
+++ linux-2.5.20.3071.updated/net/core/dev.c Tue Jun 4 21:29:39 2002
@@ -100,6 +100,7 @@
 #include
 #include
 #include
+#include
 #if defined(CONFIG_NET_RADIO) || defined(CONFIG_NET_PCMCIA_RADIO)
 #include <linux/wireless.h>            /* Note : will define WIRELESS_EXT */
 #include
@@ -2853,3 +2854,67 @@
         return call_usermodehelper(argv [0], argv, envp);
 }
 #endif
+
+static int dev_cpu_callback(struct notifier_block *nfb, unsigned long action, void * ocpu)
+{
+        struct sk_buff *list_sk, *sk_head;
+        struct net_device *list_net, *net_head;
+        struct softnet_data *queue;
+        struct sk_buff *skb;
+        unsigned int cpu = smp_processor_id();
+        unsigned long oldcpu = (unsigned long) ocpu;
+        unsigned long flags;
+
+        if (action != CPU_OFFLINE)
+                return 0;
+
+        local_irq_save(flags);
+
+        /* Move completion queue */
+
+        list_sk = softnet_data[oldcpu].completion_queue;
+        if (list_sk != NULL) {
+                sk_head = list_sk;
+                while (list_sk->next != NULL)
+                        list_sk = list_sk->next;
+                list_sk->next = softnet_data[cpu].completion_queue;
+                softnet_data[cpu].completion_queue = sk_head;
+                softnet_data[oldcpu].completion_queue = NULL;
+        }
+
+        /* Move output_queue */
+
+        list_net = softnet_data[oldcpu].output_queue;
+        if (list_net != NULL) {
+                net_head = list_net;
+                while (list_net->next_sched != NULL)
+                        list_net = list_net->next_sched;
+                list_net->next_sched = softnet_data[cpu].output_queue;
+                softnet_data[cpu].output_queue = net_head;
+                softnet_data[oldcpu].output_queue = NULL;
+        }
+
+        local_irq_restore(flags);
+
+        /* Move input_pkt_queue */
+
+        queue = &softnet_data[oldcpu];
+        for (;;) {
+                skb = __skb_dequeue(&queue->input_pkt_queue);
+                if (skb == NULL)
+                        break;
+                netif_rx(skb);
+        }
+
+        return 0;
+}
+
+static struct notifier_block cpu_callback_nfb = { &dev_cpu_callback, NULL, 0 };
+
+static int __init dev_cpu_callback_init(void)
+{
+        register_cpu_notifier(&cpu_callback_nfb);
+        return 0;
+}
+
+__initcall(dev_cpu_callback_init);
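Finally, cpu_down()/cpu_up() can also be driven from kernel code rather
than through proc. A hypothetical built-in self-test (not part of this
patch, and untested) might look like:

#include <linux/smp.h>
#include <linux/init.h>
#include <linux/kernel.h>

/* Take CPU 1 down and bring it back, exercising both paths. */
static int __init hotcpu_selftest(void)
{
        int ret = cpu_down(1);

        if (ret != 0) {
                printk("hotcpu selftest: cpu_down(1) failed: %i\n", ret);
                return 0;
        }
        printk("hotcpu selftest: CPU 1 offline, bringing it back\n");
        ret = cpu_up(1);
        if (ret != 0)
                printk("hotcpu selftest: cpu_up(1) failed: %i\n", ret);
        return 0;
}

__initcall(hotcpu_selftest);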