diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 05f0754..d8efcbc 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -129,6 +129,8 @@ void __log_wait_for_space(journal_t *journal) if (journal->j_flags & JFS_ABORT) return; spin_unlock(&journal->j_state_lock); + if (current->plug) + io_schedule(); mutex_lock(&journal->j_checkpoint_mutex); /* diff --git a/include/linux/cpu.h b/include/linux/cpu.h index a6bda1b..00d2f6f8 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -75,10 +75,8 @@ enum { /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, CPU_PRI_MIGRATION = 10, - - CPU_PRI_WORKQUEUE_ACTIVE = 5, /* prepare workqueues for others */ - CPU_PRI_NORMAL = 0, - CPU_PRI_WORKQUEUE_INACTIVE = -5, /* flush workqueues after others */ + /* prepare workqueues for other notifiers */ + CPU_PRI_WORKQUEUE = 5, }; #define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */ diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 0e37086..7408760 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -113,6 +113,9 @@ struct hrtimer { unsigned long state; struct list_head cb_entry; int irqsafe; +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + ktime_t praecox; +#endif #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; diff --git a/include/linux/sched.h b/include/linux/sched.h index 7fc8321..fb4f4ac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1629,7 +1629,7 @@ struct task_struct { #ifdef CONFIG_WAKEUP_LATENCY_HIST u64 preempt_timestamp_hist; #ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST - unsigned long timer_offset; + long timer_offset; #endif #endif #endif /* CONFIG_TRACING */ @@ -1973,6 +1973,10 @@ extern void do_set_cpus_allowed(struct task_struct *p, extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); +int migrate_me(void); +void tell_sched_cpu_down_begin(int cpu); +void tell_sched_cpu_down_done(int cpu); + #else static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) @@ -1985,6 +1989,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, return -EINVAL; return 0; } +static inline int migrate_me(void) { return 0; } +static inline void tell_sched_cpu_down_begin(int cpu) { } +static inline void tell_sched_cpu_down_done(int cpu) { } #endif #ifndef CONFIG_CPUMASK_OFFSTACK diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 9849be1..af15545 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -254,10 +254,9 @@ enum { WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ - WQ_NON_AFFINE = 1 << 6, /* free to move works around cpus */ - WQ_DRAINING = 1 << 7, /* internal: workqueue is draining */ - WQ_RESCUER = 1 << 8, /* internal: workqueue has rescuer */ + WQ_DRAINING = 1 << 6, /* internal: workqueue is draining */ + WQ_RESCUER = 1 << 7, /* internal: workqueue has rescuer */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? 
*/ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ diff --git a/init/Kconfig b/init/Kconfig index 7c0b369..c06208b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -746,6 +746,7 @@ config RT_GROUP_SCHED bool "Group scheduling for SCHED_RR/FIFO" depends on EXPERIMENTAL depends on CGROUP_SCHED + depends on !PREEMPT_RT_FULL default n help This feature lets you explicitly allocate real CPU bandwidth diff --git a/kernel/cpu.c b/kernel/cpu.c index d79d33a..3e722c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -46,12 +46,7 @@ static int cpu_hotplug_disabled; static struct { struct task_struct *active_writer; -#ifdef CONFIG_PREEMPT_RT_FULL - /* Makes the lock keep the task's state */ - spinlock_t lock; -#else struct mutex lock; /* Synchronizes accesses to refcount, */ -#endif /* * Also blocks the new readers during * an ongoing cpu hotplug operation. @@ -59,28 +54,46 @@ static struct { int refcount; } cpu_hotplug = { .active_writer = NULL, -#ifdef CONFIG_PREEMPT_RT_FULL - .lock = __SPIN_LOCK_UNLOCKED(cpu_hotplug.lock), -#else .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), -#endif .refcount = 0, }; -#ifdef CONFIG_PREEMPT_RT_FULL -# define hotplug_lock() rt_spin_lock(&cpu_hotplug.lock) -# define hotplug_unlock() rt_spin_unlock(&cpu_hotplug.lock) -#else -# define hotplug_lock() mutex_lock(&cpu_hotplug.lock) -# define hotplug_unlock() mutex_unlock(&cpu_hotplug.lock) -#endif - +/** + * hotplug_pcp - per cpu hotplug descriptor + * @unplug: set when pin_current_cpu() needs to sync tasks + * @sync_tsk: the task that waits for tasks to finish pinned sections + * @refcount: counter of tasks in pinned sections + * @grab_lock: set when the tasks entering pinned sections should wait + * @synced: notifier for @sync_tsk to tell cpu_down it's finished + * @mutex: the mutex to make tasks wait (used when @grab_lock is true) + * @mutex_init: zero if the mutex hasn't been initialized yet. + * + * Although @unplug and @sync_tsk may point to the same task, the @unplug + * is used as a flag and still exists after @sync_tsk has exited and + * @sync_tsk set to NULL. + */ struct hotplug_pcp { struct task_struct *unplug; + struct task_struct *sync_tsk; int refcount; + int grab_lock; struct completion synced; +#ifdef CONFIG_PREEMPT_RT_FULL + spinlock_t lock; +#else + struct mutex mutex; +#endif + int mutex_init; }; +#ifdef CONFIG_PREEMPT_RT_FULL +# define hotplug_lock(hp) rt_spin_lock(&(hp)->lock) +# define hotplug_unlock(hp) rt_spin_unlock(&(hp)->lock) +#else +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex) +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex) +#endif + static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp); /** @@ -94,18 +107,40 @@ static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp); void pin_current_cpu(void) { struct hotplug_pcp *hp; + int force = 0; retry: hp = &__get_cpu_var(hotplug_pcp); - if (!hp->unplug || hp->refcount || preempt_count() > 1 || + if (!hp->unplug || hp->refcount || force || preempt_count() > 1 || hp->unplug == current || (current->flags & PF_STOMPER)) { hp->refcount++; return; } - preempt_enable(); - hotplug_lock(); - hotplug_unlock(); + + if (hp->grab_lock) { + preempt_enable(); + hotplug_lock(hp); + hotplug_unlock(hp); + } else { + preempt_enable(); + /* + * Try to push this task off of this CPU. + */ + if (!migrate_me()) { + preempt_disable(); + hp = &__get_cpu_var(hotplug_pcp); + if (!hp->grab_lock) { + /* + * Just let it continue it's already pinned + * or about to sleep. 
+ */ + force = 1; + goto retry; + } + preempt_enable(); + } + } preempt_disable(); goto retry; } @@ -127,26 +162,84 @@ void unpin_current_cpu(void) wake_up_process(hp->unplug); } -/* - * FIXME: Is this really correct under all circumstances ? - */ +static void wait_for_pinned_cpus(struct hotplug_pcp *hp) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + while (hp->refcount) { + schedule_preempt_disabled(); + set_current_state(TASK_UNINTERRUPTIBLE); + } +} + static int sync_unplug_thread(void *data) { struct hotplug_pcp *hp = data; preempt_disable(); hp->unplug = current; + wait_for_pinned_cpus(hp); + + /* + * This thread will synchronize the cpu_down() with threads + * that have pinned the CPU. When the pinned CPU count reaches + * zero, we inform the cpu_down code to continue to the next step. + */ set_current_state(TASK_UNINTERRUPTIBLE); - while (hp->refcount) { - schedule_preempt_disabled(); + preempt_enable(); + complete(&hp->synced); + + /* + * If all succeeds, the next step will need tasks to wait till + * the CPU is offline before continuing. To do this, the grab_lock + * is set and tasks going into pin_current_cpu() will block on the + * mutex. But we still need to wait for those that are already in + * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop() + * will kick this thread out. + */ + while (!hp->grab_lock && !kthread_should_stop()) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + /* Make sure grab_lock is seen before we see a stale completion */ + smp_mb(); + + /* + * Now just before cpu_down() enters stop machine, we need to make + * sure all tasks that are in pinned CPU sections are out, and new + * tasks will now grab the lock, keeping them from entering pinned + * CPU sections. + */ + if (!kthread_should_stop()) { + preempt_disable(); + wait_for_pinned_cpus(hp); + preempt_enable(); + complete(&hp->synced); + } + + set_current_state(TASK_UNINTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); set_current_state(TASK_UNINTERRUPTIBLE); } set_current_state(TASK_RUNNING); - preempt_enable(); - complete(&hp->synced); + + /* + * Force this thread off this CPU as it's going down and + * we don't want any more work on this CPU. + */ + current->flags &= ~PF_THREAD_BOUND; + do_set_cpus_allowed(current, cpu_present_mask); + migrate_me(); return 0; } +static void __cpu_unplug_sync(struct hotplug_pcp *hp) +{ + wake_up_process(hp->sync_tsk); + wait_for_completion(&hp->synced); +} + /* * Start the sync_unplug_thread on the target cpu and wait for it to * complete. 
@@ -154,23 +247,83 @@ static int sync_unplug_thread(void *data) static int cpu_unplug_begin(unsigned int cpu) { struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); - struct task_struct *tsk; + int err; + + /* Protected by cpu_hotplug.lock */ + if (!hp->mutex_init) { +#ifdef CONFIG_PREEMPT_RT_FULL + spin_lock_init(&hp->lock); +#else + mutex_init(&hp->mutex); +#endif + hp->mutex_init = 1; + } + + /* Inform the scheduler to migrate tasks off this CPU */ + tell_sched_cpu_down_begin(cpu); init_completion(&hp->synced); - tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu); - if (IS_ERR(tsk)) - return (PTR_ERR(tsk)); - kthread_bind(tsk, cpu); - wake_up_process(tsk); - wait_for_completion(&hp->synced); + + hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu); + if (IS_ERR(hp->sync_tsk)) { + err = PTR_ERR(hp->sync_tsk); + hp->sync_tsk = NULL; + return err; + } + kthread_bind(hp->sync_tsk, cpu); + + /* + * Wait for tasks to get out of the pinned sections, + * it's still OK if new tasks enter. Some CPU notifiers will + * wait for tasks that are going to enter these sections and + * we must not have them block. + */ + __cpu_unplug_sync(hp); + return 0; } +static void cpu_unplug_sync(unsigned int cpu) +{ + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); + + init_completion(&hp->synced); + /* The completion needs to be initialzied before setting grab_lock */ + smp_wmb(); + + /* Grab the mutex before setting grab_lock */ + hotplug_lock(hp); + hp->grab_lock = 1; + + /* + * The CPU notifiers have been completed. + * Wait for tasks to get out of pinned CPU sections and have new + * tasks block until the CPU is completely down. + */ + __cpu_unplug_sync(hp); + + /* All done with the sync thread */ + kthread_stop(hp->sync_tsk); + hp->sync_tsk = NULL; +} + static void cpu_unplug_done(unsigned int cpu) { struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu); hp->unplug = NULL; + /* Let all tasks know cpu unplug is finished before cleaning up */ + smp_wmb(); + + if (hp->sync_tsk) + kthread_stop(hp->sync_tsk); + + if (hp->grab_lock) { + hotplug_unlock(hp); + /* protected by cpu_hotplug.lock */ + hp->grab_lock = 0; + } + tell_sched_cpu_down_done(cpu); } void get_online_cpus(void) @@ -178,9 +331,9 @@ void get_online_cpus(void) might_sleep(); if (cpu_hotplug.active_writer == current) return; - hotplug_lock(); + mutex_lock(&cpu_hotplug.lock); cpu_hotplug.refcount++; - hotplug_unlock(); + mutex_unlock(&cpu_hotplug.lock); } EXPORT_SYMBOL_GPL(get_online_cpus); @@ -189,10 +342,10 @@ void put_online_cpus(void) { if (cpu_hotplug.active_writer == current) return; - hotplug_lock(); + mutex_lock(&cpu_hotplug.lock); if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); - hotplug_unlock(); + mutex_unlock(&cpu_hotplug.lock); } EXPORT_SYMBOL_GPL(put_online_cpus); @@ -224,11 +377,11 @@ static void cpu_hotplug_begin(void) cpu_hotplug.active_writer = current; for (;;) { - hotplug_lock(); + mutex_lock(&cpu_hotplug.lock); if (likely(!cpu_hotplug.refcount)) break; __set_current_state(TASK_UNINTERRUPTIBLE); - hotplug_unlock(); + mutex_unlock(&cpu_hotplug.lock); schedule(); } } @@ -236,7 +389,7 @@ static void cpu_hotplug_begin(void) static void cpu_hotplug_done(void) { cpu_hotplug.active_writer = NULL; - hotplug_unlock(); + mutex_unlock(&cpu_hotplug.lock); } #else /* #if CONFIG_HOTPLUG_CPU */ @@ -370,6 +523,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) goto out_release; } + /* Notifiers are done. 
Don't let any more tasks pin this CPU. */ + cpu_unplug_sync(cpu); + err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ diff --git a/kernel/events/core.c b/kernel/events/core.c index fd126f8..451d452 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5403,6 +5403,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swevent_hrtimer; + hwc->hrtimer.irqsafe = 1; /* * Since hrtimers have a fixed rate, we can do a static freq->period diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 3991464..a080e62 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1021,6 +1021,17 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, #endif } +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + { + ktime_t now = new_base->get_time(); + + if (ktime_to_ns(tim) < ktime_to_ns(now)) + timer->praecox = now; + else + timer->praecox = ktime_set(0, 0); + } +#endif + hrtimer_set_expires_range_ns(timer, tim, delta_ns); timer_stats_hrtimer_set_start_info(timer); @@ -1458,8 +1469,9 @@ retry: timer = container_of(node, struct hrtimer, node); trace_hrtimer_interrupt(raw_smp_processor_id(), - ktime_to_ns(ktime_sub( - hrtimer_get_expires(timer), basenow)), + ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ? + timer->praecox : hrtimer_get_expires(timer), + basenow)), current, timer->function == hrtimer_wakeup ? container_of(timer, struct hrtimer_sleeper, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e8d5a10..f5ee392 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3184,7 +3184,7 @@ void migrate_disable(void) { struct task_struct *p = current; - if (in_atomic() || p->flags & PF_THREAD_BOUND) { + if (in_atomic()) { #ifdef CONFIG_SCHED_DEBUG p->migrate_disable_atomic++; #endif @@ -3215,7 +3215,7 @@ void migrate_enable(void) unsigned long flags; struct rq *rq; - if (in_atomic() || p->flags & PF_THREAD_BOUND) { + if (in_atomic()) { #ifdef CONFIG_SCHED_DEBUG p->migrate_disable_atomic--; #endif @@ -5114,6 +5114,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) cpumask_copy(&p->cpus_allowed, new_mask); } +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks); +static DEFINE_MUTEX(sched_down_mutex); +static cpumask_t sched_down_cpumask; + +void tell_sched_cpu_down_begin(int cpu) +{ + mutex_lock(&sched_down_mutex); + cpumask_set_cpu(cpu, &sched_down_cpumask); + mutex_unlock(&sched_down_mutex); +} + +void tell_sched_cpu_down_done(int cpu) +{ + mutex_lock(&sched_down_mutex); + cpumask_clear_cpu(cpu, &sched_down_cpumask); + mutex_unlock(&sched_down_mutex); +} + +/** + * migrate_me - try to move the current task off this cpu + * + * Used by the pin_current_cpu() code to try to get tasks + * to move off the current CPU as it is going down. + * It will only move the task if the task isn't pinned to + * the CPU (with migrate_disable, affinity or THREAD_BOUND) + * and the task has to be in a RUNNING state. Otherwise the + * movement of the task will wake it up (change its state + * to running) when the task did not expect it. + * + * Returns 1 if it succeeded in moving the current task + * 0 otherwise. 
+ */ +int migrate_me(void) +{ + struct task_struct *p = current; + struct migration_arg arg; + struct cpumask *cpumask; + struct cpumask *mask; + unsigned long flags; + unsigned int dest_cpu; + struct rq *rq; + + /* + * We can not migrate tasks bounded to a CPU or tasks not + * running. The movement of the task will wake it up. + */ + if (p->flags & PF_THREAD_BOUND || p->state) + return 0; + + mutex_lock(&sched_down_mutex); + rq = task_rq_lock(p, &flags); + + cpumask = &__get_cpu_var(sched_cpumasks); + mask = &p->cpus_allowed; + + cpumask_andnot(cpumask, mask, &sched_down_cpumask); + + if (!cpumask_weight(cpumask)) { + /* It's only on this CPU? */ + task_rq_unlock(rq, p, &flags); + mutex_unlock(&sched_down_mutex); + return 0; + } + + dest_cpu = cpumask_any_and(cpu_active_mask, cpumask); + + arg.task = p; + arg.dest_cpu = dest_cpu; + + task_rq_unlock(rq, p, &flags); + + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + tlb_migrate_finish(p->mm); + mutex_unlock(&sched_down_mutex); + + return 1; +} + /* * This is how migration works: * diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c index 9d49fcb..6a4c869 100644 --- a/kernel/trace/latency_hist.c +++ b/kernel/trace/latency_hist.c @@ -27,6 +27,8 @@ #include "trace.h" #include +#define NSECS_PER_USECS 1000L + #define CREATE_TRACE_POINTS #include @@ -46,11 +48,11 @@ enum { struct hist_data { atomic_t hist_mode; /* 0 log, 1 don't log */ long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */ - unsigned long min_lat; - unsigned long max_lat; + long min_lat; + long max_lat; unsigned long long below_hist_bound_samples; unsigned long long above_hist_bound_samples; - unsigned long long accumulate_lat; + long long accumulate_lat; unsigned long long total_samples; unsigned long long hist_array[MAX_ENTRY_NUM]; }; @@ -152,8 +154,8 @@ static struct enable_data timerandwakeup_enabled_data = { static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc); #endif -void notrace latency_hist(int latency_type, int cpu, unsigned long latency, - unsigned long timeroffset, cycle_t stop, +void notrace latency_hist(int latency_type, int cpu, long latency, + long timeroffset, cycle_t stop, struct task_struct *p) { struct hist_data *my_hist; @@ -224,7 +226,7 @@ void notrace latency_hist(int latency_type, int cpu, unsigned long latency, my_hist->hist_array[latency]++; if (unlikely(latency > my_hist->max_lat || - my_hist->min_lat == ULONG_MAX)) { + my_hist->min_lat == LONG_MAX)) { #if defined(CONFIG_WAKEUP_LATENCY_HIST) || \ defined(CONFIG_MISSED_TIMER_OFFSETS_HIST) if (latency_type == WAKEUP_LATENCY || @@ -263,15 +265,14 @@ static void *l_start(struct seq_file *m, loff_t *pos) atomic_dec(&my_hist->hist_mode); if (likely(my_hist->total_samples)) { - unsigned long avg = (unsigned long) - div64_u64(my_hist->accumulate_lat, + long avg = (long) div64_s64(my_hist->accumulate_lat, my_hist->total_samples); snprintf(minstr, sizeof(minstr), "%ld", - (long) my_hist->min_lat - my_hist->offset); + my_hist->min_lat - my_hist->offset); snprintf(avgstr, sizeof(avgstr), "%ld", - (long) avg - my_hist->offset); + avg - my_hist->offset); snprintf(maxstr, sizeof(maxstr), "%ld", - (long) my_hist->max_lat - my_hist->offset); + my_hist->max_lat - my_hist->offset); } else { strcpy(minstr, ""); strcpy(avgstr, minstr); @@ -376,10 +377,10 @@ static void hist_reset(struct hist_data *hist) memset(hist->hist_array, 0, sizeof(hist->hist_array)); hist->below_hist_bound_samples = 0ULL; hist->above_hist_bound_samples = 0ULL; - hist->min_lat = ULONG_MAX; - 
hist->max_lat = 0UL; + hist->min_lat = LONG_MAX; + hist->max_lat = LONG_MIN; hist->total_samples = 0ULL; - hist->accumulate_lat = 0ULL; + hist->accumulate_lat = 0LL; atomic_inc(&hist->hist_mode); } @@ -790,9 +791,9 @@ static notrace void probe_preemptirqsoff_hist(void *v, int reason, stop = ftrace_now(cpu); time_set++; - if (start && stop >= start) { - unsigned long latency = - nsecs_to_usecs(stop - start); + if (start) { + long latency = ((long) (stop - start)) / + NSECS_PER_USECS; latency_hist(IRQSOFF_LATENCY, cpu, latency, 0, stop, NULL); @@ -808,9 +809,9 @@ static notrace void probe_preemptirqsoff_hist(void *v, int reason, if (!(time_set++)) stop = ftrace_now(cpu); - if (start && stop >= start) { - unsigned long latency = - nsecs_to_usecs(stop - start); + if (start) { + long latency = ((long) (stop - start)) / + NSECS_PER_USECS; latency_hist(PREEMPTOFF_LATENCY, cpu, latency, 0, stop, NULL); @@ -827,9 +828,10 @@ static notrace void probe_preemptirqsoff_hist(void *v, int reason, if (!time_set) stop = ftrace_now(cpu); - if (start && stop >= start) { - unsigned long latency = - nsecs_to_usecs(stop - start); + if (start) { + long latency = ((long) (stop - start)) / + NSECS_PER_USECS; + latency_hist(PREEMPTIRQSOFF_LATENCY, cpu, latency, 0, stop, NULL); } @@ -908,7 +910,7 @@ static notrace void probe_wakeup_latency_hist_stop(void *v, { unsigned long flags; int cpu = task_cpu(next); - unsigned long latency; + long latency; cycle_t stop; struct task_struct *cpu_wakeup_task; @@ -933,13 +935,17 @@ static notrace void probe_wakeup_latency_hist_stop(void *v, goto out; } + if (current->prio == cpu_wakeup_task->prio) + per_cpu(wakeup_sharedprio, cpu) = 1; + /* * The task we are waiting for is about to be switched to. * Calculate latency and store it in histogram. 
*/ stop = ftrace_now(raw_smp_processor_id()); - latency = nsecs_to_usecs(stop - next->preempt_timestamp_hist); + latency = ((long) (stop - next->preempt_timestamp_hist)) / + NSECS_PER_USECS; if (per_cpu(wakeup_sharedprio, cpu)) { latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop, @@ -975,7 +981,7 @@ static notrace void probe_hrtimer_interrupt(void *v, int cpu, (task->prio < curr->prio || (task->prio == curr->prio && !cpumask_test_cpu(cpu, &task->cpus_allowed)))) { - unsigned long latency; + long latency; cycle_t now; if (missed_timer_offsets_pid) { @@ -985,7 +991,7 @@ static notrace void probe_hrtimer_interrupt(void *v, int cpu, } now = ftrace_now(cpu); - latency = (unsigned long) div_s64(-latency_ns, 1000); + latency = (long) div_s64(-latency_ns, NSECS_PER_USECS); latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now, task); #ifdef CONFIG_WAKEUP_LATENCY_HIST @@ -1026,7 +1032,7 @@ static __init int latency_hist_init(void) &per_cpu(irqsoff_hist, i), &latency_hist_fops); my_hist = &per_cpu(irqsoff_hist, i); atomic_set(&my_hist->hist_mode, 1); - my_hist->min_lat = 0xFFFFFFFFUL; + my_hist->min_lat = LONG_MAX; } entry = debugfs_create_file("reset", 0644, dentry, (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops); @@ -1041,7 +1047,7 @@ static __init int latency_hist_init(void) &per_cpu(preemptoff_hist, i), &latency_hist_fops); my_hist = &per_cpu(preemptoff_hist, i); atomic_set(&my_hist->hist_mode, 1); - my_hist->min_lat = 0xFFFFFFFFUL; + my_hist->min_lat = LONG_MAX; } entry = debugfs_create_file("reset", 0644, dentry, (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops); @@ -1056,7 +1062,7 @@ static __init int latency_hist_init(void) &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops); my_hist = &per_cpu(preemptirqsoff_hist, i); atomic_set(&my_hist->hist_mode, 1); - my_hist->min_lat = 0xFFFFFFFFUL; + my_hist->min_lat = LONG_MAX; } entry = debugfs_create_file("reset", 0644, dentry, (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops); @@ -1081,14 +1087,14 @@ static __init int latency_hist_init(void) &latency_hist_fops); my_hist = &per_cpu(wakeup_latency_hist, i); atomic_set(&my_hist->hist_mode, 1); - my_hist->min_lat = 0xFFFFFFFFUL; + my_hist->min_lat = LONG_MAX; entry = debugfs_create_file(name, 0444, dentry_sharedprio, &per_cpu(wakeup_latency_hist_sharedprio, i), &latency_hist_fops); my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i); atomic_set(&my_hist->hist_mode, 1); - my_hist->min_lat = 0xFFFFFFFFUL; + my_hist->min_lat = LONG_MAX; sprintf(name, cpufmt_maxlatproc, i); @@ -1122,7 +1128,7 @@ static __init int latency_hist_init(void) &per_cpu(missed_timer_offsets, i), &latency_hist_fops); my_hist = &per_cpu(missed_timer_offsets, i); atomic_set(&my_hist->hist_mode, 1); - my_hist->min_lat = 0xFFFFFFFFUL; + my_hist->min_lat = LONG_MAX; sprintf(name, cpufmt_maxlatproc, i); mp = &per_cpu(missed_timer_offsets_maxlatproc, i); @@ -1150,7 +1156,7 @@ static __init int latency_hist_init(void) &latency_hist_fops); my_hist = &per_cpu(timerandwakeup_latency_hist, i); atomic_set(&my_hist->hist_mode, 1); - my_hist->min_lat = 0xFFFFFFFFUL; + my_hist->min_lat = LONG_MAX; sprintf(name, cpufmt_maxlatproc, i); mp = &per_cpu(timerandwakeup_maxlatproc, i); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bc867e8..33d1095 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,7 +41,6 @@ #include #include #include -#include #include "workqueue_sched.h" @@ -58,10 +57,20 @@ enum { WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 
<< 3, /* preparing to run works */ - WORKER_CPU_INTENSIVE = 1 << 4, /* cpu intensive */ - WORKER_UNBOUND = 1 << 5, /* worker is unbound */ + WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ + WORKER_REBIND = 1 << 5, /* mom is home, come back */ + WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ + WORKER_UNBOUND = 1 << 7, /* worker is unbound */ - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE | WORKER_UNBOUND, + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | + WORKER_CPU_INTENSIVE | WORKER_UNBOUND, + + /* gcwq->trustee_state */ + TRUSTEE_START = 0, /* start */ + TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ + TRUSTEE_BUTCHER = 2, /* butcher workers */ + TRUSTEE_RELEASE = 3, /* release workers */ + TRUSTEE_DONE = 4, /* trustee is done */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, @@ -75,6 +84,7 @@ enum { (min two ticks) */ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ + TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ /* * Rescue workers are used only on emergencies and shared by @@ -126,6 +136,7 @@ struct worker { unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ + struct work_struct rebind_work; /* L: rebind worker to cpu */ int sleeping; /* None */ }; @@ -153,8 +164,10 @@ struct global_cwq { struct ida worker_ida; /* L: for worker IDs */ + struct task_struct *trustee; /* L: for gcwq shutdown */ + unsigned int trustee_state; /* L: trustee state */ + wait_queue_head_t trustee_wait; /* trustee wait */ struct worker *first_idle; /* L: first idle worker */ - wait_queue_head_t idle_wait; } ____cacheline_aligned_in_smp; /* @@ -956,38 +969,13 @@ static bool is_chained_work(struct workqueue_struct *wq) return false; } -static void ___queue_work(struct workqueue_struct *wq, struct global_cwq *gcwq, - struct work_struct *work) -{ - struct cpu_workqueue_struct *cwq; - struct list_head *worklist; - unsigned int work_flags; - - /* gcwq determined, get cwq and queue */ - cwq = get_cwq(gcwq->cpu, wq); - trace_workqueue_queue_work(gcwq->cpu, cwq, work); - - BUG_ON(!list_empty(&work->entry)); - - cwq->nr_in_flight[cwq->work_color]++; - work_flags = work_color_to_flags(cwq->work_color); - - if (likely(cwq->nr_active < cwq->max_active)) { - trace_workqueue_activate_work(work); - cwq->nr_active++; - worklist = gcwq_determine_ins_pos(gcwq, cwq); - } else { - work_flags |= WORK_STRUCT_DELAYED; - worklist = &cwq->delayed_works; - } - - insert_work(cwq, work, worklist, work_flags); -} - static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct global_cwq *gcwq; + struct cpu_workqueue_struct *cwq; + struct list_head *worklist; + unsigned int work_flags; unsigned long flags; debug_work_activate(work); @@ -1033,32 +1021,27 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, spin_lock_irqsave(&gcwq->lock, flags); } - ___queue_work(wq, gcwq, work); + /* gcwq determined, get cwq and queue */ + cwq = get_cwq(gcwq->cpu, wq); + trace_workqueue_queue_work(cpu, cwq, work); - spin_unlock_irqrestore(&gcwq->lock, flags); -} + BUG_ON(!list_empty(&work->entry)); -/** - * queue_work_on - queue work on specific cpu - * @cpu: CPU number to execute work on - * @wq: workqueue to use - * @work: work to queue - * - * Returns 0 if @work was already on a queue, non-zero otherwise. 
- * - * We queue the work to a specific CPU, the caller must ensure it - * can't go away. - */ -static int -__queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) -{ - int ret = 0; + cwq->nr_in_flight[cwq->work_color]++; + work_flags = work_color_to_flags(cwq->work_color); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { - __queue_work(cpu, wq, work); - ret = 1; + if (likely(cwq->nr_active < cwq->max_active)) { + trace_workqueue_activate_work(work); + cwq->nr_active++; + worklist = gcwq_determine_ins_pos(gcwq, cwq); + } else { + work_flags |= WORK_STRUCT_DELAYED; + worklist = &cwq->delayed_works; } - return ret; + + insert_work(cwq, work, worklist, work_flags); + + spin_unlock_irqrestore(&gcwq->lock, flags); } /** @@ -1075,19 +1058,34 @@ int queue_work(struct workqueue_struct *wq, struct work_struct *work) { int ret; - ret = __queue_work_on(get_cpu_light(), wq, work); + ret = queue_work_on(get_cpu_light(), wq, work); put_cpu_light(); return ret; } EXPORT_SYMBOL_GPL(queue_work); +/** + * queue_work_on - queue work on specific cpu + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @work: work to queue + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + * + * We queue the work to a specific CPU, the caller must ensure it + * can't go away. + */ int queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { - WARN_ON(wq->flags & WQ_NON_AFFINE); + int ret = 0; - return __queue_work_on(cpu, wq, work); + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + __queue_work(cpu, wq, work); + ret = 1; + } + return ret; } EXPORT_SYMBOL_GPL(queue_work_on); @@ -1133,8 +1131,6 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; - WARN_ON((wq->flags & WQ_NON_AFFINE) && cpu != -1); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { unsigned int lcpu; @@ -1200,13 +1196,12 @@ static void worker_enter_idle(struct worker *worker) /* idle_list is LIFO */ list_add(&worker->entry, &gcwq->idle_list); - if (gcwq->nr_idle == gcwq->nr_workers) - wake_up_all(&gcwq->idle_wait); - - if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) { - mod_timer(&gcwq->idle_timer, - jiffies + IDLE_WORKER_TIMEOUT); - } + if (likely(!(worker->flags & WORKER_ROGUE))) { + if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) + mod_timer(&gcwq->idle_timer, + jiffies + IDLE_WORKER_TIMEOUT); + } else + wake_up_all(&gcwq->trustee_wait); /* sanity check nr_running */ WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && @@ -1283,14 +1278,8 @@ __acquires(&gcwq->lock) return false; if (task_cpu(task) == gcwq->cpu && cpumask_equal(¤t->cpus_allowed, - get_cpu_mask(gcwq->cpu))) { - /* - * Since we're binding to a particular cpu and need to - * stay there for correctness, mark us PF_THREAD_BOUND. - */ - task->flags |= PF_THREAD_BOUND; + get_cpu_mask(gcwq->cpu))) return true; - } spin_unlock_irq(&gcwq->lock); /* @@ -1304,15 +1293,20 @@ __acquires(&gcwq->lock) } } -static void worker_unbind_and_unlock(struct worker *worker) +/* + * Function for worker->rebind_work used to rebind rogue busy workers + * to the associated cpu which is coming back online. This is + * scheduled by cpu up but can race with other cpu hotplug operations + * and may be executed twice without intervening cpu down. 
+ */
+static void worker_rebind_fn(struct work_struct *work)
 {
+	struct worker *worker = container_of(work, struct worker, rebind_work);
 	struct global_cwq *gcwq = worker->gcwq;
-	struct task_struct *task = worker->task;
 
-	/*
-	 * Its no longer required we're PF_THREAD_BOUND, the work is done.
-	 */
-	task->flags &= ~PF_THREAD_BOUND;
+	if (worker_maybe_bind_and_lock(worker))
+		worker_clr_flags(worker, WORKER_REBIND);
+
 	spin_unlock_irq(&gcwq->lock);
 }
 
@@ -1324,6 +1318,7 @@ static struct worker *alloc_worker(void)
 	if (worker) {
 		INIT_LIST_HEAD(&worker->entry);
 		INIT_LIST_HEAD(&worker->scheduled);
+		INIT_WORK(&worker->rebind_work, worker_rebind_fn);
 		/* on creation a worker is in !idle && prep state */
 		worker->flags = WORKER_PREP;
 	}
@@ -1378,9 +1373,15 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
 	if (IS_ERR(worker->task))
 		goto fail;
 
+	/*
+	 * A rogue worker will become a regular one if CPU comes
+	 * online later on.  Make sure every worker has
+	 * PF_THREAD_BOUND set.
+	 */
 	if (bind && !on_unbound_cpu)
 		kthread_bind(worker->task, gcwq->cpu);
 	else {
+		worker->task->flags |= PF_THREAD_BOUND;
 		if (on_unbound_cpu)
 			worker->flags |= WORKER_UNBOUND;
 	}
@@ -1657,6 +1658,13 @@ static bool manage_workers(struct worker *worker)
 
 	gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
 
+	/*
+	 * The trustee might be waiting to take over the manager
+	 * position, tell it we're done.
+	 */
+	if (unlikely(gcwq->trustee))
+		wake_up_all(&gcwq->trustee_wait);
+
 	return ret;
 }
 
@@ -2057,7 +2065,7 @@ repeat:
 		if (keep_working(gcwq))
 			wake_up_worker(gcwq);
 
-		worker_unbind_and_unlock(rescuer);
+		spin_unlock_irq(&gcwq->lock);
 	}
 
 	schedule();
@@ -3007,6 +3015,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 		if (IS_ERR(rescuer->task))
 			goto err;
 
+		rescuer->task->flags |= PF_THREAD_BOUND;
 		wake_up_process(rescuer->task);
 	}
 
@@ -3196,76 +3205,171 @@ EXPORT_SYMBOL_GPL(work_busy);
  * gcwqs serve mix of short, long and very long running works making
  * blocked draining impractical.
  *
+ * This is solved by allowing a gcwq to be detached from CPU, running
+ * it with unbound (rogue) workers and allowing it to be reattached
+ * later if the cpu comes back online.  A separate thread is created
+ * to govern a gcwq in such state and is called the trustee of the
+ * gcwq.
+ *
+ * Trustee states and their descriptions.
+ *
+ * START	Command state used on startup.  On CPU_DOWN_PREPARE, a
+ *		new trustee is started with this state.
+ *
+ * IN_CHARGE	Once started, trustee will enter this state after
+ *		assuming the manager role and making all existing
+ *		workers rogue.  DOWN_PREPARE waits for trustee to
+ *		enter this state.  After reaching IN_CHARGE, trustee
+ *		tries to execute the pending worklist until it's empty
+ *		and the state is set to BUTCHER, or the state is set
+ *		to RELEASE.
+ *
+ * BUTCHER	Command state which is set by the cpu callback after
+ *		the cpu has went down.  Once this state is set trustee
+ *		knows that there will be no new works on the worklist
+ *		and once the worklist is empty it can proceed to
+ *		killing idle workers.
+ *
+ * RELEASE	Command state which is set by the cpu callback if the
+ *		cpu down has been canceled or it has come online
+ *		again.  After recognizing this state, trustee stops
+ *		trying to drain or butcher and clears ROGUE, rebinds
+ *		all remaining workers back to the cpu and releases
+ *		manager role.
+ *
+ * DONE		Trustee will enter this state after BUTCHER or RELEASE
+ *		is complete.
+ *
+ *          trustee                 CPU                draining
+ *         took over                down               complete
+ * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
+ *                        |                     |                  ^
+ *                        | CPU is back online  v   return workers |
+ *                         ----------------> RELEASE --------------
  */
 
-static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
-						unsigned long action,
-						void *hcpu)
-{
-	unsigned int cpu = (unsigned long)hcpu;
-	struct global_cwq *gcwq = get_gcwq(cpu);
-	struct worker *uninitialized_var(new_worker);
-	unsigned long flags;
 
+/**
+ * trustee_wait_event_timeout - timed event wait for trustee
+ * @cond: condition to wait for
+ * @timeout: timeout in jiffies
+ *
+ * wait_event_timeout() for trustee to use.  Handles locking and
+ * checks for RELEASE request.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times.  To be used by trustee.
+ *
+ * RETURNS:
+ * Positive indicating left time if @cond is satisfied, 0 if timed
+ * out, -1 if canceled.
+ */
+#define trustee_wait_event_timeout(cond, timeout) ({			\
+	long __ret = (timeout);						\
+	while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) &&	\
+	       __ret) {							\
+		spin_unlock_irq(&gcwq->lock);				\
+		__wait_event_timeout(gcwq->trustee_wait, (cond) ||	\
+			(gcwq->trustee_state == TRUSTEE_RELEASE),	\
+			__ret);						\
+		spin_lock_irq(&gcwq->lock);				\
+	}								\
+	gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);		\
+})
 
-	action &= ~CPU_TASKS_FROZEN;
+/**
+ * trustee_wait_event - event wait for trustee
+ * @cond: condition to wait for
+ *
+ * wait_event() for trustee to use.  Automatically handles locking and
+ * checks for CANCEL request.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times.  To be used by trustee.
+ *
+ * RETURNS:
+ * 0 if @cond is satisfied, -1 if canceled.
+ */
+#define trustee_wait_event(cond) ({					\
+	long __ret1;							\
+	__ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
+	__ret1 < 0 ? -1 : 0;						\
+})
 
-	switch (action) {
-	case CPU_UP_PREPARE:
-		BUG_ON(gcwq->first_idle);
-		new_worker = create_worker(gcwq, false);
-		if (!new_worker)
-			return NOTIFY_BAD;
-	case CPU_UP_CANCELED:
-	case CPU_ONLINE:
-		break;
-	default:
-		return notifier_from_errno(0);
-	}
+static int __cpuinit trustee_thread(void *__gcwq)
+{
+	struct global_cwq *gcwq = __gcwq;
+	struct worker *worker;
+	struct work_struct *work;
+	struct hlist_node *pos;
+	long rc;
+	int i;
 
-	/* some are called w/ irq disabled, don't disturb irq status */
-	spin_lock_irqsave(&gcwq->lock, flags);
+	BUG_ON(gcwq->cpu != smp_processor_id());
 
-	switch (action) {
-	case CPU_UP_PREPARE:
-		BUG_ON(gcwq->first_idle);
-		gcwq->first_idle = new_worker;
-		break;
+	spin_lock_irq(&gcwq->lock);
+	/*
+	 * Claim the manager position and make all workers rogue.
+	 * Trustee must be bound to the target cpu and can't be
+	 * cancelled.
+ */ + BUG_ON(gcwq->cpu != smp_processor_id()); + rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); + BUG_ON(rc < 0); - case CPU_UP_CANCELED: - destroy_worker(gcwq->first_idle); - gcwq->first_idle = NULL; - break; + gcwq->flags |= GCWQ_MANAGING_WORKERS; - case CPU_ONLINE: - spin_unlock_irq(&gcwq->lock); - kthread_bind(gcwq->first_idle->task, cpu); - spin_lock_irq(&gcwq->lock); - gcwq->flags |= GCWQ_MANAGE_WORKERS; - start_worker(gcwq->first_idle); - gcwq->first_idle = NULL; - break; - } + list_for_each_entry(worker, &gcwq->idle_list, entry) + worker->flags |= WORKER_ROGUE; - spin_unlock_irqrestore(&gcwq->lock, flags); + for_each_busy_worker(worker, i, pos, gcwq) + worker->flags |= WORKER_ROGUE; - return notifier_from_errno(0); -} + /* + * Call schedule() so that we cross rq->lock and thus can + * guarantee sched callbacks see the rogue flag. This is + * necessary as scheduler callbacks may be invoked from other + * cpus. + */ + spin_unlock_irq(&gcwq->lock); + schedule(); + spin_lock_irq(&gcwq->lock); -static void flush_gcwq(struct global_cwq *gcwq) -{ - struct work_struct *work, *nw; - struct worker *worker, *n; - LIST_HEAD(non_affine_works); + /* + * Sched callbacks are disabled now. Zap nr_running. After + * this, nr_running stays zero and need_more_worker() and + * keep_working() are always true as long as the worklist is + * not empty. + */ + atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); + spin_unlock_irq(&gcwq->lock); + del_timer_sync(&gcwq->idle_timer); spin_lock_irq(&gcwq->lock); - list_for_each_entry_safe(work, nw, &gcwq->worklist, entry) { - struct workqueue_struct *wq = get_work_cwq(work)->wq; - if (wq->flags & WQ_NON_AFFINE) - list_move(&work->entry, &non_affine_works); - } + /* + * We're now in charge. Notify and proceed to drain. We need + * to keep the gcwq running during the whole CPU down + * procedure as other cpu hotunplug callbacks may need to + * flush currently running tasks. + */ + gcwq->trustee_state = TRUSTEE_IN_CHARGE; + wake_up_all(&gcwq->trustee_wait); - while (!list_empty(&gcwq->worklist)) { + /* + * The original cpu is in the process of dying and may go away + * anytime now. When that happens, we and all workers would + * be migrated to other cpus. Try draining any left work. We + * want to get it over with ASAP - spam rescuers, wake up as + * many idlers as necessary and create new ones till the + * worklist is empty. Note that if the gcwq is frozen, there + * may be frozen works in freezable cwqs. Don't declare + * completion while frozen. + */ + while (gcwq->nr_workers != gcwq->nr_idle || + gcwq->flags & GCWQ_FREEZING || + gcwq->trustee_state == TRUSTEE_IN_CHARGE) { int nr_works = 0; list_for_each_entry(work, &gcwq->worklist, entry) { @@ -3279,55 +3383,200 @@ static void flush_gcwq(struct global_cwq *gcwq) wake_up_process(worker->task); } - spin_unlock_irq(&gcwq->lock); - if (need_to_create_worker(gcwq)) { - worker = create_worker(gcwq, true); - if (worker) + spin_unlock_irq(&gcwq->lock); + worker = create_worker(gcwq, false); + spin_lock_irq(&gcwq->lock); + if (worker) { + worker->flags |= WORKER_ROGUE; start_worker(worker); + } } - wait_event_timeout(gcwq->idle_wait, - gcwq->nr_idle == gcwq->nr_workers, HZ/10); - - spin_lock_irq(&gcwq->lock); + /* give a breather */ + if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) + break; } - WARN_ON(gcwq->nr_workers != gcwq->nr_idle); + /* + * Either all works have been scheduled and cpu is down, or + * cpu down has already been canceled. 
Wait for and butcher + * all workers till we're canceled. + */ + do { + rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); + while (!list_empty(&gcwq->idle_list)) + destroy_worker(list_first_entry(&gcwq->idle_list, + struct worker, entry)); + } while (gcwq->nr_workers && rc >= 0); - list_for_each_entry_safe(worker, n, &gcwq->idle_list, entry) - destroy_worker(worker); + /* + * At this point, either draining has completed and no worker + * is left, or cpu down has been canceled or the cpu is being + * brought back up. There shouldn't be any idle one left. + * Tell the remaining busy ones to rebind once it finishes the + * currently scheduled works by scheduling the rebind_work. + */ + WARN_ON(!list_empty(&gcwq->idle_list)); - WARN_ON(gcwq->nr_workers || gcwq->nr_idle); + for_each_busy_worker(worker, i, pos, gcwq) { + struct work_struct *rebind_work = &worker->rebind_work; - spin_unlock_irq(&gcwq->lock); + /* + * Rebind_work may race with future cpu hotplug + * operations. Use a separate flag to mark that + * rebinding is scheduled. + */ + worker->flags |= WORKER_REBIND; + worker->flags &= ~WORKER_ROGUE; - gcwq = get_gcwq(get_cpu_light()); - spin_lock_irq(&gcwq->lock); - list_for_each_entry_safe(work, nw, &non_affine_works, entry) { - list_del_init(&work->entry); - ___queue_work(get_work_cwq(work)->wq, gcwq, work); + /* queue rebind_work, wq doesn't matter, use the default one */ + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + debug_work_activate(rebind_work); + insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); } + + /* relinquish manager role */ + gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + + /* notify completion */ + gcwq->trustee = NULL; + gcwq->trustee_state = TRUSTEE_DONE; + wake_up_all(&gcwq->trustee_wait); spin_unlock_irq(&gcwq->lock); - put_cpu_light(); + return 0; } -static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, +/** + * wait_trustee_state - wait for trustee to enter the specified state + * @gcwq: gcwq the trustee of interest belongs to + * @state: target state to wait for + * + * Wait for the trustee to reach @state. DONE is already matched. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by cpu_callback. 
+ */ +static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) +__releases(&gcwq->lock) +__acquires(&gcwq->lock) +{ + if (!(gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE)) { + spin_unlock_irq(&gcwq->lock); + __wait_event(gcwq->trustee_wait, + gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE); + spin_lock_irq(&gcwq->lock); + } +} + +static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct global_cwq *gcwq = get_gcwq(cpu); + struct task_struct *new_trustee = NULL; + struct worker *uninitialized_var(new_worker); + unsigned long flags; action &= ~CPU_TASKS_FROZEN; - switch (action) { - case CPU_DOWN_PREPARE: - flush_gcwq(gcwq); - break; - } + switch (action) { + case CPU_DOWN_PREPARE: + new_trustee = kthread_create(trustee_thread, gcwq, + "workqueue_trustee/%d\n", cpu); + if (IS_ERR(new_trustee)) + return notifier_from_errno(PTR_ERR(new_trustee)); + kthread_bind(new_trustee, cpu); + /* fall through */ + case CPU_UP_PREPARE: + BUG_ON(gcwq->first_idle); + new_worker = create_worker(gcwq, false); + if (!new_worker) { + if (new_trustee) + kthread_stop(new_trustee); + return NOTIFY_BAD; + } + break; + case CPU_POST_DEAD: + case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: + case CPU_ONLINE: + break; + case CPU_DYING: + /* + * We access this lockless. We are on the dying CPU + * and called from stomp machine. + * + * Before this, the trustee and all workers except for + * the ones which are still executing works from + * before the last CPU down must be on the cpu. After + * this, they'll all be diasporas. + */ + gcwq->flags |= GCWQ_DISASSOCIATED; + default: + goto out; + } + + /* some are called w/ irq disabled, don't disturb irq status */ + spin_lock_irqsave(&gcwq->lock, flags); + + switch (action) { + case CPU_DOWN_PREPARE: + /* initialize trustee and tell it to acquire the gcwq */ + BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); + gcwq->trustee = new_trustee; + gcwq->trustee_state = TRUSTEE_START; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); + /* fall through */ + case CPU_UP_PREPARE: + BUG_ON(gcwq->first_idle); + gcwq->first_idle = new_worker; + break; + + case CPU_POST_DEAD: + gcwq->trustee_state = TRUSTEE_BUTCHER; + /* fall through */ + case CPU_UP_CANCELED: + destroy_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; + case CPU_DOWN_FAILED: + case CPU_ONLINE: + gcwq->flags &= ~GCWQ_DISASSOCIATED; + if (gcwq->trustee_state != TRUSTEE_DONE) { + gcwq->trustee_state = TRUSTEE_RELEASE; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_DONE); + } + + /* + * Trustee is done and there might be no worker left. + * Put the first_idle in and request a real manager to + * take a look. 
+ */ + spin_unlock_irq(&gcwq->lock); + kthread_bind(gcwq->first_idle->task, cpu); + spin_lock_irq(&gcwq->lock); + gcwq->flags |= GCWQ_MANAGE_WORKERS; + start_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; + } + + spin_unlock_irqrestore(&gcwq->lock, flags); +out: return notifier_from_errno(0); } @@ -3524,8 +3773,7 @@ static int __init init_workqueues(void) unsigned int cpu; int i; - cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_ACTIVE); - hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_INACTIVE); + cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { @@ -3548,7 +3796,9 @@ static int __init init_workqueues(void) (unsigned long)gcwq); ida_init(&gcwq->worker_ida); - init_waitqueue_head(&gcwq->idle_wait); + + gcwq->trustee_state = TRUSTEE_DONE; + init_waitqueue_head(&gcwq->trustee_wait); } /* create the initial worker */ diff --git a/localversion-rt b/localversion-rt index 9f7d0bd..08b3e75 100644 --- a/localversion-rt +++ b/localversion-rt @@ -1 +1 @@ --rt13 +-rt14 diff --git a/mm/slab.c b/mm/slab.c index 15bce6b..64eb636 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -743,8 +743,26 @@ slab_on_each_cpu(void (*func)(void *arg, int this_cpu), void *arg) { unsigned int i; + get_cpu_light(); for_each_online_cpu(i) func(arg, i); + put_cpu_light(); +} + +static void lock_slab_on(unsigned int cpu) +{ + if (cpu == smp_processor_id()) + local_lock_irq(slab_lock); + else + local_spin_lock_irq(slab_lock, &per_cpu(slab_lock, cpu).lock); +} + +static void unlock_slab_on(unsigned int cpu) +{ + if (cpu == smp_processor_id()) + local_unlock_irq(slab_lock); + else + local_spin_unlock_irq(slab_lock, &per_cpu(slab_lock, cpu).lock); } #endif @@ -2692,10 +2710,10 @@ static void do_drain(void *arg, int cpu) { LIST_HEAD(tmp); - spin_lock_irq(&per_cpu(slab_lock, cpu).lock); + lock_slab_on(cpu); __do_drain(arg, cpu); list_splice_init(&per_cpu(slab_free_list, cpu), &tmp); - spin_unlock_irq(&per_cpu(slab_lock, cpu).lock); + unlock_slab_on(cpu); free_delayed(&tmp); } #endif @@ -4163,9 +4181,9 @@ static void do_ccupdate_local(void *info) #else static void do_ccupdate_local(void *info, int cpu) { - spin_lock_irq(&per_cpu(slab_lock, cpu).lock); + lock_slab_on(cpu); __do_ccupdate_local(info, cpu); - spin_unlock_irq(&per_cpu(slab_lock, cpu).lock); + unlock_slab_on(cpu); } #endif
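A note on the -rt hotplug scheme that the kernel/cpu.c hunks above extend: pin_current_cpu() and unpin_current_cpu() bracket sections that must not race with the current CPU going away. A minimal usage sketch follows; it is illustrative only, the per-CPU variable is made up, and on -rt the usual entry point is migrate_disable() rather than a direct call.

static DEFINE_PER_CPU(unsigned long, example_stat);	/* hypothetical */

static void example_pinned_update(void)
{
	preempt_disable();
	pin_current_cpu();	/* blocks on the per-CPU hotplug lock, or migrates
				 * this task away, if the CPU is in cpu_down() */
	__get_cpu_var(example_stat)++;
	unpin_current_cpu();	/* may wake the sync_unplug/%d helper thread */
	preempt_enable();
}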
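The kernel/cpu.c changes split CPU unplug into two synchronization points instead of one. A condensed sketch of the ordering _cpu_down() ends up with is shown below; it is simplified, error paths are omitted, and everything except cpu_unplug_sync() is assumed to come from the surrounding mainline/-rt code rather than this patch.

	err = cpu_unplug_begin(cpu);	/* spawn sync_unplug/%d, drain currently
					 * pinned sections, tell_sched_cpu_down_begin() */
	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
	/* ... notifiers may flush work, take sleeping locks, etc. ... */
	cpu_unplug_sync(cpu);		/* set grab_lock: new pin_current_cpu() callers
					 * now block; wait for the rest to drain out */
	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
	/* ... CPU_DEAD / CPU_POST_DEAD notifiers ... */
out_release:
	cpu_unplug_done(cpu);		/* drop grab_lock, reap sync_tsk,
					 * tell_sched_cpu_down_done() */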
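migrate_me(), declared in the sched.h hunk and implemented in kernel/sched/core.c above, is deliberately best-effort: it refuses to move a task that is THREAD_BOUND or not currently running, because migrating a sleeping task would wake it. A hypothetical caller therefore has to be prepared for either outcome, exactly as pin_current_cpu() is.

	if (!migrate_me()) {
		/*
		 * Still on the (dying) CPU: we are bound here, our cpumask
		 * allows no other online CPU, or we raced with hotplug.
		 * Fall back to pinning/blocking instead of moving.
		 */
	}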
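hrtimer::praecox (added under CONFIG_MISSED_TIMER_OFFSETS_HIST) records the enqueue time whenever a timer is started with an expiry that already lies in the past, so the missed-timer histogram is not charged for lateness the caller asked for. Unrolled for readability, the reporting side in hrtimer_interrupt() picks its reference like this (the final task argument of the tracepoint is elided in the sketch):

	ktime_t ref;

	if (ktime_to_ns(timer->praecox))	/* started with an expiry in the past */
		ref = timer->praecox;		/* measure from the start time */
	else
		ref = hrtimer_get_expires(timer); /* normal case: programmed expiry */

	trace_hrtimer_interrupt(raw_smp_processor_id(),
				ktime_to_ns(ktime_sub(ref, basenow)),
				current, NULL /* task argument as in the real call */);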
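The latency_hist.c conversion from unsigned to signed latencies replaces nsecs_to_usecs() with a plain signed division so that negative deltas (a timer or wakeup that fires early) stay negative instead of wrapping. The pattern, isolated; div_s64() is from <linux/math64.h> and the helper name here is made up:

#define NSECS_PER_USECS	1000L

static inline long example_ns_to_us(s64 delta_ns)
{
	/* e.g. -1500 ns becomes -1 us; an unsigned conversion would wrap */
	return (long)div_s64(delta_ns, NSECS_PER_USECS);
}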
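The re-added workqueue trustee is driven entirely from workqueue_cpu_callback(); the state machine documented in the big comment above maps onto notifier events roughly as follows. This is a condensed sketch, not the literal callback body.

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		/* kthread_create(trustee_thread), state = TRUSTEE_START,
		 * wake it, wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE) */
		/* fall through */
	case CPU_UP_PREPARE:
		/* park a first_idle worker for when the CPU is (back) up */
		break;
	case CPU_DYING:
		/* gcwq->flags |= GCWQ_DISASSOCIATED, from stop_machine context */
		break;
	case CPU_POST_DEAD:
		/* state = TRUSTEE_BUTCHER: worklist drained, idle workers die */
		break;
	case CPU_DOWN_FAILED:
	case CPU_ONLINE:
		/* state = TRUSTEE_RELEASE: clear ROGUE, queue rebind_work on the
		 * busy workers, wait_trustee_state(gcwq, TRUSTEE_DONE), then put
		 * first_idle back in and let a real manager take over */
		break;
	}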
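Finally, the mm/slab.c hunks make cross-CPU cache maintenance explicit about lock ownership: on the local CPU the operation must go through the local_lock so -rt ordering against the hot paths is preserved, while for a remote CPU it takes that CPU's lock directly. A sketch of a per-CPU operation written against this helper pair, following the do_drain() pattern above; the operation itself is hypothetical.

static void example_cache_op(void *arg, int cpu)
{
	lock_slab_on(cpu);	/* local_lock_irq() locally, spin_lock_irq() remotely */
	/* ... manipulate cpu's array caches under the slab lock ... */
	unlock_slab_on(cpu);
}

/* called as: slab_on_each_cpu(example_cache_op, NULL);
 * the walker pins the caller with get_cpu_light() around the loop */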