Name: Hotplug CPU Remove Generic Code
Author: Rusty Russell
Status: Experimental
Depends: Hotcpu/cpucontrols.patch.gz
Depends:
D: This adds the generic infrastructure to allow removal of CPUs
D: in a running kernel.

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11268-linux-2.5.44/arch/i386/kernel/smpboot.c .11268-linux-2.5.44.updated/arch/i386/kernel/smpboot.c
--- .11268-linux-2.5.44/arch/i386/kernel/smpboot.c 2002-10-16 15:01:12.000000000 +1000
+++ .11268-linux-2.5.44.updated/arch/i386/kernel/smpboot.c 2002-10-28 17:54:13.000000000 +1100
@@ -1192,6 +1192,17 @@ int __devinit __cpu_up(unsigned int cpu)
 return 0;
 }
 
+int __cpu_disable(void)
+{
+ return -ENOSYS;
+}
+
+/* Since we fail __cpu_disable, this is never called. */
+void __cpu_die(unsigned int cpu)
+{
+ BUG();
+}
+
 void __init smp_cpus_done(unsigned int max_cpus)
 {
 zap_low_mappings();
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11268-linux-2.5.44/arch/ppc/kernel/smp.c .11268-linux-2.5.44.updated/arch/ppc/kernel/smp.c
--- .11268-linux-2.5.44/arch/ppc/kernel/smp.c 2002-09-21 13:55:09.000000000 +1000
+++ .11268-linux-2.5.44.updated/arch/ppc/kernel/smp.c 2002-10-28 17:54:13.000000000 +1100
@@ -433,6 +433,17 @@ int __cpu_up(unsigned int cpu)
 return 0;
 }
 
+int __cpu_disable(void)
+{
+ return -ENOSYS;
+}
+
+/* Since we fail __cpu_disable, this is never called. */
+void __cpu_die(unsigned int cpu)
+{
+ BUG();
+}
+
 void smp_cpus_done(unsigned int max_cpus)
 {
 smp_ops->setup_cpu(0);
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11268-linux-2.5.44/include/asm-generic/topology.h .11268-linux-2.5.44.updated/include/asm-generic/topology.h
--- .11268-linux-2.5.44/include/asm-generic/topology.h 2002-10-15 15:26:26.000000000 +1000
+++ .11268-linux-2.5.44.updated/include/asm-generic/topology.h 2002-10-28 17:54:13.000000000 +1100
@@ -42,7 +42,7 @@
 #define __node_to_first_cpu(node) (0)
 #endif
 #ifndef __node_to_cpu_mask
-#define __node_to_cpu_mask(node) (cpu_online_map)
+#define __node_to_cpu_mask(node) (~0UL)
 #endif
 #ifndef __node_to_memblk
 #define __node_to_memblk(node) (0)
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11268-linux-2.5.44/include/asm-i386/smp.h .11268-linux-2.5.44.updated/include/asm-i386/smp.h
--- .11268-linux-2.5.44/include/asm-i386/smp.h 2002-10-19 17:48:09.000000000 +1000
+++ .11268-linux-2.5.44.updated/include/asm-i386/smp.h 2002-10-28 17:54:13.000000000 +1100
@@ -114,6 +114,9 @@ static inline int num_booting_cpus(void)
 return hweight32(cpu_callout_map);
 }
 
+extern int __cpu_disable(void);
+extern void __cpu_die(unsigned int cpu);
+
 #endif /* !__ASSEMBLY__ */
 
 #define NO_PROC_ID 0xFF /* No processor magic marker */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11268-linux-2.5.44/include/asm-ppc/smp.h .11268-linux-2.5.44.updated/include/asm-ppc/smp.h
--- .11268-linux-2.5.44/include/asm-ppc/smp.h 2002-09-21 13:55:18.000000000 +1000
+++ .11268-linux-2.5.44.updated/include/asm-ppc/smp.h 2002-10-28 17:54:13.000000000 +1100
@@ -62,6 +62,8 @@ extern inline int any_online_cpu(unsigne
 }
 
 extern int __cpu_up(unsigned int cpu);
+extern int __cpu_disable(void);
+extern void __cpu_die(unsigned int cpu);
 
 extern int smp_hw_index[];
 #define hard_smp_processor_id() (smp_hw_index[smp_processor_id()])
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11268-linux-2.5.44/include/linux/brlock.h .11268-linux-2.5.44.updated/include/linux/brlock.h
--- .11268-linux-2.5.44/include/linux/brlock.h 2002-10-15 15:19:44.000000000 +1000
+++ .11268-linux-2.5.44.updated/include/linux/brlock.h 2002-10-28 17:54:14.000000000 +1100
@@ -34,6 +34,7 @@
 /* Register bigreader lock indices here. */
 enum brlock_indices {
 BR_NETPROTO_LOCK,
+ BR_CPU_LOCK,
 
 __BR_END
 };
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11268-linux-2.5.44/include/linux/cpu.h .11268-linux-2.5.44.updated/include/linux/cpu.h
--- .11268-linux-2.5.44/include/linux/cpu.h 2002-10-28 17:53:38.000000000 +1100
+++ .11268-linux-2.5.44.updated/include/linux/cpu.h 2002-10-28 17:54:13.000000000 +1100
@@ -1,3 +1,5 @@
+#ifndef _LINUX_CPU_H
+#define _LINUX_CPU_H
 /*
 * cpu.h - generic cpu defition
 *
@@ -29,5 +31,8 @@ struct cpu {
 
 DECLARE_PER_CPU(struct cpu, cpu_devices);
 
-/* Bring a CPU up */
+/* Bring a CPU up and take it down. */
+extern struct semaphore cpucontrol;
 int cpu_up(unsigned int cpu);
+int cpu_down(unsigned int cpu);
+#endif /* _LINUX_CPU_H */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11268-linux-2.5.44/include/linux/mmzone.h .11268-linux-2.5.44.updated/include/linux/mmzone.h
--- .11268-linux-2.5.44/include/linux/mmzone.h 2002-10-15 15:31:05.000000000 +1000
+++ .11268-linux-2.5.44.updated/include/linux/mmzone.h 2002-10-28 17:54:13.000000000 +1100
@@ -169,6 +169,7 @@ typedef struct pglist_data {
 int node_id;
 struct pglist_data *pgdat_next;
 wait_queue_head_t kswapd_wait;
+ struct task_struct *kswapd;
 } pg_data_t;
 
 extern int numnodes;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11268-linux-2.5.44/include/linux/notifier.h .11268-linux-2.5.44.updated/include/linux/notifier.h
--- .11268-linux-2.5.44/include/linux/notifier.h 2002-07-27 15:24:39.000000000 +1000
+++ .11268-linux-2.5.44.updated/include/linux/notifier.h 2002-10-28 17:54:13.000000000 +1100
@@ -61,6 +61,8 @@ extern int notifier_call_chain(struct no
 
 #define NETLINK_URELEASE 0x0001 /* Unicast netlink socket released */
 #define CPU_ONLINE 0x0002 /* CPU (unsigned)v coming up */
+#define CPU_OFFLINE 0x0005 /* CPU (unsigned)v offline (still scheduling) */
+#define CPU_DEAD 0x0006 /* CPU (unsigned)v dead */
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_NOTIFIER_H */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11268-linux-2.5.44/include/linux/sched.h .11268-linux-2.5.44.updated/include/linux/sched.h
--- .11268-linux-2.5.44/include/linux/sched.h 2002-10-19 17:48:10.000000000 +1000
+++ .11268-linux-2.5.44.updated/include/linux/sched.h 2002-10-28 17:54:13.000000000 +1100
@@ -438,11 +438,15 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define _STK_LIM (8*1024*1024)
 
 #if CONFIG_SMP
+/* You should be holding cpucontrol sem (or be at boot) when calling this. */
 extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
 #else
 # define set_cpus_allowed(p, new_mask) do { } while (0)
 #endif
 
+/* Move tasks off this (offline) CPU onto another. */
+extern void migrate_all_tasks(void);
+
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(task_t *p);
 extern int task_nice(task_t *p);
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11268-linux-2.5.44/kernel/cpu.c .11268-linux-2.5.44.updated/kernel/cpu.c
--- .11268-linux-2.5.44/kernel/cpu.c 2002-10-28 17:53:38.000000000 +1100
+++ .11268-linux-2.5.44.updated/kernel/cpu.c 2002-10-28 17:54:14.000000000 +1100
@@ -8,6 +8,8 @@
 #include
 #include
 #include
+#include /* for hotplug_path */
+#include
 #include
 #include
 #include
@@ -29,6 +31,158 @@ void unregister_cpu_notifier(struct noti
 notifier_chain_unregister(&cpu_chain,nb);
 }
 
+#ifdef CONFIG_HOTPLUG
+/* Notify userspace when a cpu event occurs, by running '/sbin/hotplug
+ * cpu' with certain environment variables set. */
+static int cpu_run_sbin_hotplug(unsigned int cpu, const char *action)
+{
+ char *argv[3], *envp[5], cpu_str[12], action_str[32];
+ int i;
+
+ sprintf(cpu_str, "CPU=%d", cpu);
+ sprintf(action_str, "ACTION=%s", action);
+
+ i = 0;
+ argv[i++] = hotplug_path;
+ argv[i++] = "cpu";
+ argv[i] = NULL;
+
+ i = 0;
+ /* minimal command environment */
+ envp [i++] = "HOME=/";
+ envp [i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp [i++] = cpu_str;
+ envp [i++] = action_str;
+ envp [i] = NULL;
+
+ return call_usermodehelper(argv [0], argv, envp);
+}
+
+int cpu_down(unsigned int cpu)
+{
+ int ret;
+
+ if ((ret = down_interruptible(&cpucontrol)) != 0)
+ return ret;
+
+ if (!cpu_online(cpu)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (num_online_cpus() == 1) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* Schedule ourselves on the dying CPU. */
+ set_cpus_allowed(current, 1UL << cpu);
+
+ /* Disable CPU. */
+ ret = __cpu_disable();
+ if (ret != 0) {
+ printk("CPU disable failed: %i\n", ret);
+ goto out;
+ }
+ BUG_ON(cpu_online(cpu));
+
+ /* Move other tasks off to other CPUs (simple since they are
+ not running now). */
+ migrate_all_tasks();
+
+ /* Move off dying CPU, which will revert to idle process. */
+ set_cpus_allowed(current, ~(1UL << cpu));
+
+ /* CPU has been disabled: tell everyone */
+ notifier_call_chain(&cpu_chain, CPU_OFFLINE, (void *)(long)cpu);
+
+ /* Die, CPU, die! */
+ __cpu_die(cpu);
+
+ /* CPU is completely dead: tell everyone */
+ notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu);
+
+ printk("Done DEAD notifier.\n");
+ cpu_run_sbin_hotplug(cpu, "remove");
+ up(&cpucontrol);
+
+ /* Debugging, mainly for kernel threads which didn't clean up. */
+ {
+ struct task_struct *p;
+
+ write_lock_irq(&tasklist_lock);
+ for_each_process(p) {
+ if (p->thread_info->cpu == cpu
+ && !(p->state & TASK_ZOMBIE))
+ printk("Left %s\n", p->comm);
+ }
+ write_unlock_irq(&tasklist_lock);
+ }
+ printk("Done cpu down: %i.\n", ret);
+ return ret;
+
+ out:
+ up(&cpucontrol);
+ return ret;
+}
+
+static ssize_t show_online(struct device *dev,
+ char *buf,
+ size_t count,
+ loff_t off)
+{
+ char out[3];
+ struct cpu *cpu = container_of(container_of(dev,struct sys_device,dev),
+ struct cpu, sysdev);
+
+ sprintf(out, "%i\n", !!cpu_online(cpu->sysdev.id));
+ if (off >= strlen(out)) return 0;
+ if (off + count > strlen(out)) count = strlen(out) - off;
+ memcpy(buf, out+off, count);
+ return (ssize_t)count;
+}
+
+static ssize_t store_online(struct device *dev,
+ const char *buf,
+ size_t count,
+ loff_t off)
+{
+ struct cpu *cpu = container_of(container_of(dev,struct sys_device,dev),
+ struct cpu, sysdev);
+ ssize_t ret;
+
+ if (off != 0)
+ return -EINVAL;
+ switch (buf[0]) {
+ case '0':
+ ret = cpu_down(cpu->sysdev.id);
+ break;
+ case '1':
+ ret = cpu_up(cpu->sysdev.id);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (ret == 0)
+ ret = count;
+ return ret;
+}
+
+static DEVICE_ATTR(online, 0600, show_online, store_online);
+
+#else /* !CONFIG_HOTPLUG */
+int cpu_down(unsigned int cpu)
+{
+ return -ENOSYS;
+}
+
+static inline int cpu_run_sbin_hotplug(unsigned int cpu, const char *action)
+{
+ return 0;
+}
+#endif
+
 int __devinit cpu_up(unsigned int cpu)
 {
 int ret;
@@ -68,6 +222,10 @@ int __devinit cpu_up(unsigned int cpu)
 {
 return -ENOSYS;
 }
+int cpu_down(unsigned int cpu)
+{
+ return -ENOSYS;
+}
 #endif /* CONFIG_SMP */
 
 static struct device_driver cpu_driver = {
@@ -92,6 +250,10 @@ static int __init register_cpus(void)
 continue;
 per_cpu(cpu_devices, i).sysdev.id = i;
 sys_device_register(&per_cpu(cpu_devices, i).sysdev);
+#ifdef CONFIG_HOTPLUG
+ device_create_file(&per_cpu(cpu_devices, i).sysdev.dev,
+ &dev_attr_online);
+#endif
 }
 return 0;
 }
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11268-linux-2.5.44/kernel/sched.c .11268-linux-2.5.44.updated/kernel/sched.c
--- .11268-linux-2.5.44/kernel/sched.c 2002-10-16 15:01:26.000000000 +1000
+++ .11268-linux-2.5.44.updated/kernel/sched.c 2002-10-28 17:54:14.000000000 +1100
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include
 
 /*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
@@ -417,7 +418,8 @@ repeat_lock_task:
 */
 if (unlikely(sync && !task_running(rq, p) &&
 (task_cpu(p) != smp_processor_id()) &&
- (p->cpus_allowed & (1UL << smp_processor_id())))) {
+ (p->cpus_allowed & (1UL << smp_processor_id())) &&
+ cpu_online(smp_processor_id()))) {
 
 set_task_cpu(p, smp_processor_id());
 task_rq_unlock(rq, &flags);
@@ -747,6 +749,11 @@ static void load_balance(runqueue_t *thi
 struct list_head *head, *curr;
 task_t *tmp;
 
+ /* CPU going down is a special case: we don't pull more tasks
+ onboard */
+ if (unlikely(!cpu_online(this_cpu)))
+ goto out;
+
 busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance);
 if (!busiest)
 goto out;
@@ -1584,15 +1591,21 @@ asmlinkage int sys_sched_setaffinity(pid
 if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
 return -EFAULT;
 
- new_mask &= cpu_online_map;
- if (!new_mask)
+ /* Stop CPUs going up and down. */
+ if (down_interruptible(&cpucontrol) != 0)
+ return -EINTR;
+
+ if (!(new_mask & cpu_online_map)) {
+ up(&cpucontrol);
 return -EINVAL;
+ }
 
 read_lock(&tasklist_lock);
 
 p = find_process_by_pid(pid);
 if (!p) {
 read_unlock(&tasklist_lock);
+ up(&cpucontrol);
 return -ESRCH;
 }
@@ -1613,6 +1626,7 @@ asmlinkage int sys_sched_setaffinity(pid
 set_cpus_allowed(p, new_mask);
 
 out_unlock:
+ up(&cpucontrol);
 put_task_struct(p);
 return retval;
 }
@@ -1643,7 +1657,7 @@ asmlinkage int sys_sched_getaffinity(pid
 goto out_unlock;
 
 retval = 0;
- mask = p->cpus_allowed & cpu_online_map;
+ mask = p->cpus_allowed;
 
 out_unlock:
 read_unlock(&tasklist_lock);
@@ -1952,11 +1966,8 @@ void set_cpus_allowed(task_t *p, unsigne
 migration_req_t req;
 runqueue_t *rq;
 
-#if 0 /* FIXME: Grab cpu_lock, return error on this case. --RR */
- new_mask &= cpu_online_map;
- if (!new_mask)
+ if (!(new_mask & cpu_online_map))
 BUG();
-#endif
 
 rq = task_rq_lock(p, &flags);
 p->cpus_allowed = new_mask;
@@ -1987,6 +1998,102 @@ void set_cpus_allowed(task_t *p, unsigne
 wait_for_completion(&req.done);
 }
 
+/* Move (not current) task off this cpu, onto dest cpu. Reference to
+ task must be held. */
+static void move_task_away(struct task_struct *p, unsigned int dest_cpu)
+{
+ runqueue_t *rq_dest;
+ unsigned long flags;
+
+ rq_dest = cpu_rq(dest_cpu);
+
+ if (task_cpu(p) != smp_processor_id())
+ return; /* Already moved */
+
+ local_irq_save(flags);
+ double_rq_lock(this_rq(), rq_dest);
+ if (task_cpu(p) != smp_processor_id())
+ goto out; /* Already moved */
+
+ set_task_cpu(p, dest_cpu);
+ if (p->array) {
+ deactivate_task(p, this_rq());
+ activate_task(p, rq_dest);
+ if (p->prio < rq_dest->curr->prio)
+ resched_task(rq_dest->curr);
+ }
+ out:
+ double_rq_unlock(this_rq(), rq_dest);
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_HOTPLUG
+/* Slow but sure. We don't fight against load_balance, new people
+ setting affinity, or try_to_wake_up's fast path pulling things in,
+ as cpu_online() no longer true. */
+static int move_all_tasks(unsigned int kill_it)
+{
+ unsigned int num_signalled = 0;
+ unsigned int dest_cpu;
+ struct task_struct *g, *t;
+ unsigned long cpus_allowed;
+
+ again:
+ read_lock(&tasklist_lock);
+ do_each_thread(g, t) {
+ if (t == current)
+ continue;
+
+ /* Kernel threads which are bound to specific
+ processors need to look after themselves
+ with their own callbacks */
+ if (t->mm == NULL && t->cpus_allowed != ~0UL)
+ continue;
+
+ if (task_cpu(t) == smp_processor_id()) {
+ get_task_struct(t);
+ goto move_one;
+ }
+ } while_each_thread(g, t);
+ read_unlock(&tasklist_lock);
+ return num_signalled;
+
+ move_one:
+ read_unlock(&tasklist_lock);
+ cpus_allowed = t->cpus_allowed & ~(1UL << smp_processor_id());
+ dest_cpu = any_online_cpu(cpus_allowed);
+ if (dest_cpu < 0) {
+ num_signalled++;
+ if (!kill_it) {
+ /* FIXME: New signal needed? --RR */
+ force_sig(SIGPWR, t);
+ goto again;
+ }
+ /* Kill it (it can die on any CPU). */
+ t->cpus_allowed = ~(1 << smp_processor_id());
+ dest_cpu = any_online_cpu(t->cpus_allowed);
+ force_sig(SIGKILL, t);
+ }
+ move_task_away(t, dest_cpu);
+ put_task_struct(t);
+ goto again;
+}
+
+/* Move non-kernel-thread tasks off this (offline) CPU, except us. */
+void migrate_all_tasks(void)
+{
+ if (move_all_tasks(0)) {
+ /* Wait for processes to react to signal */
+ schedule_timeout(30*HZ);
+ move_all_tasks(1);
+ }
+}
+#endif /* CONFIG_HOTPLUG */
+
+/* This is the CPU to stop, and who to wake about it */
+static int migration_stop = -1;
+static struct completion migration_stopped;
+
 /*
 * migration_thread - this is a highprio system thread that performs
 * thread migration by 'pulling' threads into the target runqueue.
@@ -2015,13 +2122,10 @@ static int migration_thread(void * data)
 
 sprintf(current->comm, "migration/%d", smp_processor_id());
 
- for (;;) {
- runqueue_t *rq_src, *rq_dest;
+ while (migration_stop != cpu) {
 struct list_head *head;
- int cpu_src, cpu_dest;
 migration_req_t *req;
 unsigned long flags;
- task_t *p;
 
 spin_lock_irqsave(&rq->lock, flags);
 head = &rq->migration_queue;
@@ -2035,34 +2139,34 @@ static int migration_thread(void * data)
 list_del_init(head->next);
 spin_unlock_irqrestore(&rq->lock, flags);
 
- p = req->task;
- cpu_dest = __ffs(p->cpus_allowed);
- rq_dest = cpu_rq(cpu_dest);
-repeat:
- cpu_src = task_cpu(p);
- rq_src = cpu_rq(cpu_src);
-
- local_irq_save(flags);
- double_rq_lock(rq_src, rq_dest);
- if (task_cpu(p) != cpu_src) {
- double_rq_unlock(rq_src, rq_dest);
- local_irq_restore(flags);
- goto repeat;
- }
- if (rq_src == rq) {
- set_task_cpu(p, cpu_dest);
- if (p->array) {
- deactivate_task(p, rq_src);
- activate_task(p, rq_dest);
- if (p->prio < rq_dest->curr->prio)
- resched_task(rq_dest->curr);
- }
- }
- double_rq_unlock(rq_src, rq_dest);
- local_irq_restore(flags);
-
+ move_task_away(req->task,
+ any_online_cpu(req->task->cpus_allowed));
 complete(&req->done);
 }
+ current->state = TASK_RUNNING;
+
+ printk("Migration thread for %u exiting\n", cpu);
+ rq->migration_thread = NULL;
+ complete(&migration_stopped);
+
+ return 0;
+}
+
+/* No locking required: CPU notifiers are serialized */
+static void stop_migration_thread(unsigned int cpu)
+{
+ /* We want to wake it, but it may exit first. */
+ struct task_struct *migthread = cpu_rq(cpu)->migration_thread;
+
+ get_task_struct(migthread);
+ init_completion(&migration_stopped);
+ /* They must not access completion until it's initialized. */
+ wmb();
+ migration_stop = cpu;
+ wake_up_process(cpu_rq(cpu)->migration_thread);
+ wait_for_completion(&migration_stopped);
+ put_task_struct(migthread);
+ migration_stop = -1;
 }
 
 /*
@@ -2081,6 +2185,9 @@ static int migration_call(struct notifie
 while (!cpu_rq((long)hcpu)->migration_thread)
 yield();
 break;
+ case CPU_OFFLINE:
+ stop_migration_thread((long)hcpu);
+ break;
 }
 return NOTIFY_OK;
 }
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11268-linux-2.5.44/kernel/softirq.c .11268-linux-2.5.44.updated/kernel/softirq.c
--- .11268-linux-2.5.44/kernel/softirq.c 2002-10-15 15:31:06.000000000 +1000
+++ .11268-linux-2.5.44.updated/kernel/softirq.c 2002-10-28 17:54:14.000000000 +1100
@@ -266,10 +266,18 @@ void __init softirq_init()
 open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
 }
 
+/* This is the CPU to stop, and who to wake about it */
+static int ksoftirq_stop = -1;
+static struct task_struct *ksoftirq_killer = NULL;
+
 static int ksoftirqd(void * __bind_cpu)
 {
 int cpu = (int) (long) __bind_cpu;
 
+ if (ksoftirqd_task(cpu))
+ BUG();
+
+ sprintf(current->comm, "ksoftirqd/%d", cpu);
 daemonize();
 set_user_nice(current, 19);
 current->flags |= PF_IOTHREAD;
@@ -280,14 +288,13 @@ static int ksoftirqd(void * __bind_cpu)
 if (smp_processor_id() != cpu)
 BUG();
 
- sprintf(current->comm, "ksoftirqd/%d", cpu);
-
 __set_current_state(TASK_INTERRUPTIBLE);
 mb();
 ksoftirqd_task(cpu) = current;
 
- for (;;) {
+ while (ksoftirq_stop != cpu) {
+ rmb();
 if (!softirq_pending(cpu))
 schedule();
@@ -300,13 +307,21 @@ static int ksoftirqd(void * __bind_cpu)
 __set_current_state(TASK_INTERRUPTIBLE);
 }
+ set_current_state(TASK_RUNNING);
+
+ printk("ksoftirqd for %i dying\n", cpu);
+ ksoftirqd_task(cpu) = NULL;
+ wmb();
+ wake_up_process(ksoftirq_killer);
+
+ return 0;
 }
 
 static int __devinit cpu_callback(struct notifier_block *nfb,
 unsigned long action,
 void *hcpu)
 {
- int hotcpu = (unsigned long)hcpu;
+ unsigned int hotcpu = (unsigned long)hcpu;
 
 if (action == CPU_ONLINE) {
 if (kernel_thread(ksoftirqd, hcpu, CLONE_KERNEL) < 0) {
@@ -316,9 +331,45 @@ static int __devinit cpu_callback(struct
 while (!ksoftirqd_task(hotcpu))
 yield();
- return NOTIFY_OK;
 }
- return NOTIFY_BAD;
+
+ if (action == CPU_OFFLINE) {
+ struct task_struct *kd_task;
+
+ /* Kill ksoftirqd: get ref in case it exits before we
+ wake it */
+ ksoftirq_killer = current;
+ kd_task = ksoftirqd_task(hotcpu);
+ get_task_struct(kd_task);
+ set_current_state(TASK_INTERRUPTIBLE);
+ ksoftirq_stop = hotcpu;
+ wake_up_process(kd_task);
+ while (ksoftirqd_task(hotcpu)) {
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ set_current_state(TASK_RUNNING);
+ put_task_struct(kd_task);
+ ksoftirq_stop = -1;
+ }
+
+ if (action == CPU_DEAD) {
+ struct tasklet_struct *i, *next;
+
+ /* Move pending softirqs from dead CPU to us. */
+ local_irq_disable();
+ for (i = per_cpu(tasklet_vec, hotcpu).list; i; i = next) {
+ next = i->next;
+ __tasklet_schedule(i);
+ }
+ for (i = per_cpu(tasklet_hi_vec, hotcpu).list; i; i = next) {
+ next = i->next;
+ __tasklet_hi_schedule(i);
+ }
+ local_irq_enable();
+ }
+
+ return NOTIFY_OK;
 }
 
 static struct notifier_block cpu_nfb = { &cpu_callback, NULL, 0 };
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11268-linux-2.5.44/kernel/workqueue.c .11268-linux-2.5.44.updated/kernel/workqueue.c
--- .11268-linux-2.5.44/kernel/workqueue.c 2002-10-15 15:30:05.000000000 +1000
+++ .11268-linux-2.5.44.updated/kernel/workqueue.c 2002-10-28 17:54:14.000000000 +1100
@@ -25,6 +25,8 @@
 #include
 #include
 #include
+#include
+#include
 
 /*
 * The per-CPU workqueue:
@@ -50,8 +52,13 @@ struct cpu_workqueue_struct {
 */
 struct workqueue_struct {
 struct cpu_workqueue_struct cpu_wq[NR_CPUS];
+ const char *name;
+ struct list_head list;
 };
 
+/* All the workqueues on the system: protected by cpucontrol mutex. */
+static LIST_HEAD(workqueues);
+
 /*
 * Queue work on a workqueue. Return non-zero if it was successfully
 * added.
@@ -161,7 +168,6 @@ static inline void run_workqueue(struct
 typedef struct startup_s {
 struct cpu_workqueue_struct *cwq;
 struct completion done;
- const char *name;
 } startup_t;
 
 static int worker_thread(void *__startup)
@@ -173,7 +179,7 @@ static int worker_thread(void *__startup
 struct k_sigaction sa;
 
 daemonize();
- sprintf(current->comm, "%s/%d", startup->name, cpu);
+ sprintf(current->comm, "%s/%d", cwq->wq->name, cpu);
 current->flags |= PF_IOTHREAD;
 cwq->thread = current;
@@ -264,44 +270,52 @@ void flush_workqueue(struct workqueue_st
 }
 }
 
-struct workqueue_struct *create_workqueue(const char *name)
+static int create_workqueue_thread(struct workqueue_struct *wq, int cpu)
 {
- int ret, cpu, destroy = 0;
- struct cpu_workqueue_struct *cwq;
 startup_t startup;
+ struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu;
+ int ret;
+
+ spin_lock_init(&cwq->lock);
+ cwq->wq = wq;
+ cwq->thread = NULL;
+ atomic_set(&cwq->nr_queued, 0);
+ INIT_LIST_HEAD(&cwq->worklist);
+ init_waitqueue_head(&cwq->more_work);
+ init_waitqueue_head(&cwq->work_done);
+
+ init_completion(&startup.done);
+ startup.cwq = cwq;
+ ret = kernel_thread(worker_thread, &startup, CLONE_FS | CLONE_FILES);
+ if (ret >= 0) {
+ wait_for_completion(&startup.done);
+ BUG_ON(!cwq->thread);
+ }
+ return ret;
+}
+
+struct workqueue_struct *create_workqueue(const char *name)
+{
+ int cpu, destroy = 0;
 struct workqueue_struct *wq;
 
 BUG_ON(strlen(name) > 10);
- startup.name = name;
 
 wq = kmalloc(sizeof(*wq), GFP_KERNEL);
 if (!wq)
 return NULL;
+ wq->name = name;
 
+ down(&cpucontrol);
 for (cpu = 0; cpu < NR_CPUS; cpu++) {
 if (!cpu_online(cpu))
 continue;
- cwq = wq->cpu_wq + cpu;
-
- spin_lock_init(&cwq->lock);
- cwq->wq = wq;
- cwq->thread = NULL;
- atomic_set(&cwq->nr_queued, 0);
- INIT_LIST_HEAD(&cwq->worklist);
- init_waitqueue_head(&cwq->more_work);
- init_waitqueue_head(&cwq->work_done);
-
- init_completion(&startup.done);
- startup.cwq = cwq;
- ret = kernel_thread(worker_thread, &startup,
- CLONE_FS | CLONE_FILES);
- if (ret < 0)
+ if (create_workqueue_thread(wq, cpu) < 0)
 destroy = 1;
- else {
- wait_for_completion(&startup.done);
- BUG_ON(!cwq->thread);
- }
 }
+
+ list_add(&wq->list, &workqueues);
+
 /*
 * Was there any error during startup? If yes then clean up:
 */
@@ -309,32 +323,78 @@ struct workqueue_struct *create_workqueu
 destroy_workqueue(wq);
 wq = NULL;
 }
+
+ up(&cpucontrol);
 return wq;
 }
 
-void destroy_workqueue(struct workqueue_struct *wq)
+static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
 {
 struct cpu_workqueue_struct *cwq;
+
+ cwq = wq->cpu_wq + cpu;
+ if (cwq->thread) {
+ /* Initiate an exit and wait for it: */
+ init_completion(&cwq->exit);
+ wmb();
+ cwq->thread = NULL;
+ wmb();
+ wake_up(&cwq->more_work);
+
+ wait_for_completion(&cwq->exit);
+ printk("Workqueue thread %s for cpu %i exited\n",
+ wq->name, cpu);
+ } else
+ printk("NO workqueue thread %s for cpu %i\n",
+ wq->name, cpu);
+}
+
+void destroy_workqueue(struct workqueue_struct *wq)
+{
 int cpu;
 
 flush_workqueue(wq);
 
+ down(&cpucontrol);
 for (cpu = 0; cpu < NR_CPUS; cpu++) {
 if (!cpu_online(cpu))
 continue;
- cwq = wq->cpu_wq + cpu;
- if (!cwq->thread)
- continue;
- /*
- * Initiate an exit and wait for it:
- */
- init_completion(&cwq->exit);
- cwq->thread = NULL;
- wake_up(&cwq->more_work);
-
- wait_for_completion(&cwq->exit);
+ cleanup_workqueue_thread(wq, cpu);
 }
+ list_del(&wq->list);
 kfree(wq);
+ up(&cpucontrol);
+}
+
+/* We're holding the cpucontrol mutex here */
+static int __devinit cpu_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int hotcpu = (unsigned long)hcpu;
+ struct workqueue_struct *wq;
+
+ switch (action) {
+ case CPU_ONLINE:
+ /* Start a new workqueue thread for it. */
+ list_for_each_entry(wq, &workqueues, list) {
+ if (create_workqueue_thread(wq, hotcpu) < 0) {
+ /* FIXME: Start workqueue at CPU_COMING_UP */
+ printk("workqueue for %i failed\n", hotcpu);
+ return NOTIFY_BAD;
+ }
+ }
+ return NOTIFY_OK;
+
+ case CPU_OFFLINE:
+ list_for_each_entry(wq, &workqueues, list) {
+ printk("Cleaning up workqueue for %s\n", wq->name);
+ cleanup_workqueue_thread(wq, hotcpu);
+ }
+ return NOTIFY_OK;
+ };
+
+ return NOTIFY_OK;
 }
 
 static struct workqueue_struct *keventd_wq;
@@ -371,8 +431,11 @@ int current_is_keventd(void)
 
 return 0;
 }
 
+static struct notifier_block cpu_nfb = { &cpu_callback, NULL, 0 };
+
 void init_workqueues(void)
 {
+ register_cpu_notifier(&cpu_nfb);
 keventd_wq = create_workqueue("events");
 BUG_ON(!keventd_wq);
 }
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11268-linux-2.5.44/mm/vmscan.c .11268-linux-2.5.44.updated/mm/vmscan.c
--- .11268-linux-2.5.44/mm/vmscan.c 2002-10-19 17:48:11.000000000 +1000
+++ .11268-linux-2.5.44.updated/mm/vmscan.c 2002-10-28 17:54:14.000000000 +1100
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -874,6 +875,7 @@ int kswapd(void *p)
 daemonize();
 set_cpus_allowed(tsk, __node_to_cpu_mask(pgdat->node_id));
 sprintf(tsk->comm, "kswapd%d", pgdat->node_id);
+ printk("Set %s affinity to %08lX\n", tsk->comm, tsk->cpus_allowed);
 sigfillset(&tsk->blocked);
 
 /*
@@ -930,6 +932,45 @@ int shrink_all_memory(int nr_pages)
 }
 #endif
 
+/* It's optimal to keep kswapds on the same CPUs as their memory, but
+ not required for correctness. So if the last cpu in a node goes
+ away, let them run anywhere, and as the first one comes back,
+ restore their cpu bindings. */
+static int __devinit cpu_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ pg_data_t *pgdat;
+ unsigned int hotcpu = (unsigned long)hcpu;
+ unsigned long mask;
+
+ if (action == CPU_OFFLINE) {
+ /* Make sure that kswapd never becomes unschedulable. */
+ for_each_pgdat(pgdat) {
+ mask = __node_to_cpu_mask(pgdat->node_id);
+ if (any_online_cpu(mask) < 0) {
+ mask = ~0UL;
+ set_cpus_allowed(pgdat->kswapd, mask);
+ }
+ }
+ }
+
+ if (action == CPU_ONLINE) {
+ for_each_pgdat(pgdat) {
+ mask = __node_to_cpu_mask(pgdat->node_id);
+ mask &= ~(1UL << hotcpu);
+ if (any_online_cpu(mask) < 0) {
+ mask |= (1UL << hotcpu);
+ /* One of our CPUs came back: restore mask */
+ set_cpus_allowed(pgdat->kswapd, mask);
+ }
+ }
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cpu_nfb = { &cpu_callback, NULL, 0 };
+
 static int __init kswapd_init(void)
 {
 pg_data_t *pgdat;
@@ -938,6 +979,7 @@ static int __init kswapd_init(void)
 for_each_pgdat(pgdat)
 kernel_thread(kswapd, pgdat, CLONE_KERNEL);
 total_memory = nr_free_pagecache_pages();
+ register_cpu_notifier(&cpu_nfb);
 return 0;
 }
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11268-linux-2.5.44/net/core/dev.c .11268-linux-2.5.44.updated/net/core/dev.c
--- .11268-linux-2.5.44/net/core/dev.c 2002-10-15 15:30:05.000000000 +1000
+++ .11268-linux-2.5.44.updated/net/core/dev.c 2002-10-28 17:54:13.000000000 +1100
@@ -105,6 +105,7 @@
 #include
 #include
 #include
+#include
 #if defined(CONFIG_NET_RADIO) || defined(CONFIG_NET_PCMCIA_RADIO)
 #include /* Note : will define WIRELESS_EXT */
 #include
@@ -2848,3 +2849,67 @@ static int net_run_sbin_hotplug(struct n
 return call_usermodehelper(argv [0], argv, envp);
 }
 #endif
+
+static int dev_cpu_callback(struct notifier_block *nfb, unsigned long action, void * ocpu)
+{
+ struct sk_buff *list_sk, *sk_head;
+ struct net_device *list_net, *net_head;
+ struct softnet_data *queue;
+ struct sk_buff *skb;
+ unsigned int cpu = smp_processor_id();
+ unsigned long oldcpu = (unsigned long) ocpu;
+ unsigned long flags;
+
+ if (action != CPU_OFFLINE)
+ return 0;
+
+ local_irq_save(flags);
+
+ /* Move completion queue */
+
+ list_sk = softnet_data[oldcpu].completion_queue;
+ if (list_sk != NULL) {
+ sk_head = list_sk;
+ while (list_sk->next != NULL)
+ list_sk = list_sk->next;
+ list_sk->next = softnet_data[cpu].completion_queue;
+ softnet_data[cpu].completion_queue = sk_head;
+ softnet_data[oldcpu].completion_queue = NULL;
+ }
+
+ /* Move output_queue */
+
+ list_net = softnet_data[oldcpu].output_queue;
+ if (list_net != NULL) {
+ net_head = list_net;
+ while (list_net->next_sched != NULL)
+ list_net = list_net->next_sched;
+ list_net->next_sched = softnet_data[cpu].output_queue;
+ softnet_data[cpu].output_queue = net_head;
+ softnet_data[oldcpu].output_queue = NULL;
+ }
+
+ local_irq_restore(flags);
+
+ /* Move input_pkt_queue */
+
+ queue = &softnet_data[oldcpu];
+ for (;;) {
+ skb = __skb_dequeue(&queue->input_pkt_queue);
+ if (skb == NULL)
+ break;
+ netif_rx(skb);
+ }
+
+ return 0;
+}
+
+static struct notifier_block cpu_callback_nfb = {&dev_cpu_callback, NULL, 0 };
+
+static int __init dev_cpu_callback_init(void)
+{
+ register_cpu_notifier(&cpu_callback_nfb);
+ return 0;
+}
+
+__initcall(dev_cpu_callback_init);
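
[Editor's illustration, not part of the patch.] Other subsystems with per-cpu state can follow the same pattern this patch uses in softirq.c, workqueue.c, vmscan.c and dev.c: register a notifier with register_cpu_notifier() and react to the new CPU_OFFLINE (cpu disabled, still scheduling) and CPU_DEAD (cpu gone) events. A minimal sketch, assuming only the interfaces added or touched above; the subsystem name "my_subsys" and its callback are hypothetical:

static int my_subsys_cpu_callback(struct notifier_block *nfb,
				  unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_OFFLINE:
		/* CPU is no longer online but may still be scheduling:
		   stop queueing new work for it (hypothetical step). */
		break;
	case CPU_DEAD:
		/* CPU is completely gone: drain any per-cpu data it
		   left behind (hypothetical step). */
		printk("my_subsys: cpu %u went away\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_subsys_cpu_nfb =
	{ &my_subsys_cpu_callback, NULL, 0 };

static int __init my_subsys_cpu_init(void)
{
	register_cpu_notifier(&my_subsys_cpu_nfb);
	return 0;
}
__initcall(my_subsys_cpu_init);

With CONFIG_HOTPLUG, the removal path itself can be driven from userspace: writing '0' to the per-cpu "online" attribute created in register_cpus() calls cpu_down(), and a successful removal runs /sbin/hotplug with CPU=<n> and ACTION=remove via cpu_run_sbin_hotplug().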