diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pic.c b/arch/powerpc/platforms/52xx/mpc52xx_pic.c index b89ef65..b69221b 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_pic.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_pic.c @@ -373,8 +373,9 @@ static int mpc52xx_irqhost_map(struct irq_domain *h, unsigned int virq, case MPC52xx_IRQ_L1_PERP: irqchip = &mpc52xx_periph_irqchip; break; case MPC52xx_IRQ_L1_SDMA: irqchip = &mpc52xx_sdma_irqchip; break; case MPC52xx_IRQ_L1_CRIT: + default: pr_warn("%s: Critical IRQ #%d is unsupported! Nopping it.\n", - __func__, l2irq); + __func__, l1irq); irq_set_chip(virq, &no_irq_chip); return 0; } diff --git a/drivers/misc/hwlat_detector.c b/drivers/misc/hwlat_detector.c index b7b7c90..6f61d5f 100644 --- a/drivers/misc/hwlat_detector.c +++ b/drivers/misc/hwlat_detector.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -51,6 +50,7 @@ #include #include #include +#include #define BUF_SIZE_DEFAULT 262144UL /* 8K*(sizeof(entry)) */ #define BUF_FLAGS (RB_FL_OVERWRITE) /* no block on full */ @@ -106,7 +106,6 @@ struct data; /* Global state */ /* Sampling functions */ static int __buffer_add_sample(struct sample *sample); static struct sample *buffer_get_sample(struct sample *sample); -static int get_sample(void *unused); /* Threading and state */ static int kthread_fn(void *unused); @@ -143,11 +142,12 @@ static void detector_exit(void); struct sample { u64 seqnum; /* unique sequence */ u64 duration; /* ktime delta */ + u64 outer_duration; /* ktime delta (outer loop) */ struct timespec timestamp; /* wall time */ unsigned long lost; }; -/* keep the global state somewhere. Mostly used under stop_machine. */ +/* keep the global state somewhere. */ static struct data { struct mutex lock; /* protect changes */ @@ -170,7 +170,7 @@ static struct data { * @sample: The new latency sample value * * This receives a new latency sample and records it in a global ring buffer. - * No additional locking is used in this case - suited for stop_machine use. + * No additional locking is used in this case. */ static int __buffer_add_sample(struct sample *sample) { @@ -210,29 +210,60 @@ static struct sample *buffer_get_sample(struct sample *sample) return sample; } +#ifndef CONFIG_TRACING +#define time_type ktime_t +#define time_get() ktime_get() +#define time_to_us(x) ktime_to_us(x) +#define time_sub(a, b) ktime_sub(a, b) +#define init_time(a, b) (a).tv64 = b +#define time_u64(a) (a).tv64 +#else +#define time_type u64 +#define time_get() trace_clock_local() +#define time_to_us(x) div_u64(x, 1000) +#define time_sub(a, b) ((a) - (b)) +#define init_time(a, b) a = b +#define time_u64(a) a +#endif /** * get_sample - sample the CPU TSC and look for likely hardware latencies - * @unused: This is not used but is a part of the stop_machine API * * Used to repeatedly capture the CPU TSC (or similar), looking for potential - * hardware-induced latency. Called under stop_machine, with data.lock held. + * hardware-induced latency. Called with interrupts disabled and with data.lock held. 
*/ -static int get_sample(void *unused) +static int get_sample(void) { - ktime_t start, t1, t2; + time_type start, t1, t2, last_t2; s64 diff, total = 0; u64 sample = 0; - int ret = 1; + u64 outer_sample = 0; + int ret = -1; - start = ktime_get(); /* start timestamp */ + init_time(last_t2, 0); + start = time_get(); /* start timestamp */ do { - t1 = ktime_get(); /* we'll look for a discontinuity */ - t2 = ktime_get(); + t1 = time_get(); /* we'll look for a discontinuity */ + t2 = time_get(); + + if (time_u64(last_t2)) { + /* Check the delta from the outer loop (t2 to next t1) */ + diff = time_to_us(time_sub(t1, last_t2)); + /* This shouldn't happen */ + if (diff < 0) { + printk(KERN_ERR BANNER "time running backwards\n"); + goto out; + } + if (diff > outer_sample) + outer_sample = diff; + } + last_t2 = t2; + + total = time_to_us(time_sub(t2, start)); /* sample width */ - total = ktime_to_us(ktime_sub(t2, start)); /* sample width */ - diff = ktime_to_us(ktime_sub(t2, t1)); /* current diff */ + /* This checks the inner loop (t1 to t2) */ + diff = time_to_us(time_sub(t2, t1)); /* current diff */ /* This shouldn't happen */ if (diff < 0) { @@ -245,13 +276,18 @@ static int get_sample(void *unused) } while (total <= data.sample_width); + ret = 0; + /* If we exceed the threshold value, we have found a hardware latency */ - if (sample > data.threshold) { + if (sample > data.threshold || outer_sample > data.threshold) { struct sample s; + ret = 1; + data.count++; s.seqnum = data.count; s.duration = sample; + s.outer_duration = outer_sample; s.timestamp = CURRENT_TIME; __buffer_add_sample(&s); @@ -260,7 +296,6 @@ static int get_sample(void *unused) data.max_sample = sample; } - ret = 0; out: return ret; } @@ -270,32 +305,30 @@ out: * @unused: A required part of the kthread API. * * Used to periodically sample the CPU TSC via a call to get_sample. We - * use stop_machine, whith does (intentionally) introduce latency since we + * disable interrupts, which does (intentionally) introduce latency since we * need to ensure nothing else might be running (and thus pre-empting). * Obviously this should never be used in production environments. * - * stop_machine will schedule us typically only on CPU0 which is fine for - * almost every real-world hardware latency situation - but we might later - * generalize this if we find there are any actualy systems with alternate - * SMI delivery or other non CPU0 hardware latencies. + * Currently this runs on whichever CPU it was scheduled on, but most + * real-world hardware latency situations occur across several CPUs anyway. + * We might later generalize this if we find there actually are + * systems with alternate SMI delivery or other hardware latencies.
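The sampling loop in get_sample() above takes back-to-back timestamps t1/t2 inside a sampling window: the inner delta (t2 - t1) catches a stall that lands between the two reads, while the new outer delta (t1 minus the previous iteration's t2) catches a stall that lands between loop iterations. A minimal user-space analogue of that logic, using clock_gettime() and entirely hypothetical names (an illustration of the sampling idea, not the driver code):

/*
 * Hypothetical user-space analogue of the inner/outer sampling loop.
 * Build with: gcc -O2 -o hwlat_sketch hwlat_sketch.c
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

static uint64_t now_us(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000ULL + (uint64_t)ts.tv_nsec / 1000;
}

int main(void)
{
	const uint64_t width_us = 500000;	/* sampling window: 0.5 s */
	uint64_t start, t1, t2, last_t2 = 0;
	uint64_t inner_max = 0, outer_max = 0;

	start = now_us();
	do {
		t1 = now_us();
		t2 = now_us();

		/* outer delta: previous iteration's t2 to this iteration's t1 */
		if (last_t2 && t1 - last_t2 > outer_max)
			outer_max = t1 - last_t2;
		last_t2 = t2;

		/* inner delta: the gap between the two back-to-back reads */
		if (t2 - t1 > inner_max)
			inner_max = t2 - t1;
	} while (t2 - start <= width_us);

	printf("inner max: %llu us, outer max: %llu us\n",
	       (unsigned long long)inner_max, (unsigned long long)outer_max);
	return 0;
}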
*/ static int kthread_fn(void *unused) { - int err = 0; - u64 interval = 0; + int ret; + u64 interval; while (!kthread_should_stop()) { mutex_lock(&data.lock); - err = stop_machine(get_sample, unused, 0); - if (err) { - /* Houston, we have a problem */ - mutex_unlock(&data.lock); - goto err_out; - } + local_irq_disable(); + ret = get_sample(); + local_irq_enable(); - wake_up(&data.wq); /* wake up reader(s) */ + if (ret > 0) + wake_up(&data.wq); /* wake up reader(s) */ interval = data.sample_window - data.sample_width; do_div(interval, USEC_PER_MSEC); /* modifies interval value */ @@ -303,15 +336,10 @@ static int kthread_fn(void *unused) mutex_unlock(&data.lock); if (msleep_interruptible(interval)) - goto out; + break; } - goto out; -err_out: - printk(KERN_ERR BANNER "could not call stop_machine, disabling\n"); - enabled = 0; -out: - return err; + return 0; } /** @@ -407,8 +435,7 @@ out: * This function provides a generic read implementation for the global state * "data" structure debugfs filesystem entries. It would be nice to use * simple_attr_read directly, but we need to make sure that the data.lock - * spinlock is held during the actual read (even though we likely won't ever - * actually race here as the updater runs under a stop_machine context). + * is held during the actual read. */ static ssize_t simple_data_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos, const u64 *entry) @@ -443,8 +470,7 @@ static ssize_t simple_data_read(struct file *filp, char __user *ubuf, * This function provides a generic write implementation for the global state * "data" structure debugfs filesystem entries. It would be nice to use * simple_attr_write directly, but we need to make sure that the data.lock - * spinlock is held during the actual write (even though we likely won't ever - * actually race here as the updater runs under a stop_machine context). + * is held during the actual write. 
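With outer_duration added to struct sample and to the snprintf() format changed in the debug_sample_fread() hunk below, each record read from the detector's debugfs sample file carries three columns: a seconds.nanoseconds timestamp plus the inner and outer durations in microseconds. A hypothetical user-space reader for that format, assuming debugfs is mounted at /sys/kernel/debug and the directory is named hwlat_detector:

/*
 * Hypothetical reader for the three-column sample format.
 * Assumes debugfs at /sys/kernel/debug and the hwlat_detector directory name.
 */
#include <stdio.h>

int main(void)
{
	unsigned long sec, nsec;
	unsigned long long inner_us, outer_us;
	FILE *f = fopen("/sys/kernel/debug/hwlat_detector/sample", "r");

	if (!f) {
		perror("sample");
		return 1;
	}
	/* reads block while the detector is enabled and no sample is pending */
	while (fscanf(f, "%lu.%lu %llu %llu",
		      &sec, &nsec, &inner_us, &outer_us) == 4)
		printf("ts=%lu.%09lu inner=%llu us outer=%llu us\n",
		       sec, nsec, inner_us, outer_us);
	fclose(f);
	return 0;
}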
*/ static ssize_t simple_data_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos, u64 *entry) @@ -738,10 +764,11 @@ static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf, } } - len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\n", - sample->timestamp.tv_sec, - sample->timestamp.tv_nsec, - sample->duration); + len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\t%llu\n", + sample->timestamp.tv_sec, + sample->timestamp.tv_nsec, + sample->duration, + sample->outer_duration); /* handling partial reads is more trouble than it's worth */ diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index 4f33806..80a44b1 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c @@ -583,7 +583,7 @@ static void set_performant_mode(struct ctlr_info *h, struct CommandList *c) c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1); if (likely(h->msix_vector)) c->Header.ReplyQueue = - smp_processor_id() % h->nreply_queues; + raw_smp_processor_id() % h->nreply_queues; } } diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 11bdb1e..838d4bd 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -263,6 +263,7 @@ struct irq_affinity_notify { unsigned int irq; struct kref kref; struct work_struct work; + struct list_head list; void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask); void (*release)(struct kref *ref); }; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 81a28dd..cf5b5f7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -163,6 +163,62 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } +#ifdef CONFIG_PREEMPT_RT_FULL +static void _irq_affinity_notify(struct irq_affinity_notify *notify); +static struct task_struct *set_affinity_helper; +static LIST_HEAD(affinity_list); +static DEFINE_RAW_SPINLOCK(affinity_list_lock); + +static int set_affinity_thread(void *unused) +{ + while (1) { + struct irq_affinity_notify *notify; + int empty; + + set_current_state(TASK_INTERRUPTIBLE); + + raw_spin_lock_irq(&affinity_list_lock); + empty = list_empty(&affinity_list); + raw_spin_unlock_irq(&affinity_list_lock); + + if (empty) + schedule(); + if (kthread_should_stop()) + break; + set_current_state(TASK_RUNNING); +try_next: + notify = NULL; + + raw_spin_lock_irq(&affinity_list_lock); + if (!list_empty(&affinity_list)) { + notify = list_first_entry(&affinity_list, + struct irq_affinity_notify, list); + list_del_init(¬ify->list); + } + raw_spin_unlock_irq(&affinity_list_lock); + + if (!notify) + continue; + _irq_affinity_notify(notify); + goto try_next; + } + return 0; +} + +static void init_helper_thread(void) +{ + if (set_affinity_helper) + return; + set_affinity_helper = kthread_run(set_affinity_thread, NULL, + "affinity-cb"); + WARN_ON(IS_ERR(set_affinity_helper)); +} +#else + +static inline void init_helper_thread(void) { } + +#endif + int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) { struct irq_chip *chip = irq_data_get_irq_chip(data); @@ -181,7 +237,17 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); + +#ifdef CONFIG_PREEMPT_RT_FULL + raw_spin_lock(&affinity_list_lock); + if (list_empty(&desc->affinity_notify->list)) + list_add_tail(&affinity_list, + &desc->affinity_notify->list); + raw_spin_unlock(&affinity_list_lock); + wake_up_process(set_affinity_helper); +#else schedule_work(&desc->affinity_notify->work); 
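For context on what is being deferred here: irq_affinity_notify is the callback object a driver registers with irq_set_affinity_notifier() to learn when an interrupt's CPU affinity changes; the hunk above only changes how that callback gets invoked on PREEMPT_RT_FULL (queued to the affinity-cb kthread instead of schedule_work()). A minimal, hypothetical registration sketch (the my_* names are invented):

#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kref.h>

struct my_irq_ctx {
	struct irq_affinity_notify notify;
	/* per-IRQ driver state that cares about CPU placement */
};

/* Runs in process context after the IRQ's affinity mask has changed. */
static void my_affinity_notify(struct irq_affinity_notify *notify,
			       const cpumask_t *mask)
{
	struct my_irq_ctx *ctx =
		container_of(notify, struct my_irq_ctx, notify);

	pr_info("mydrv: irq %u affinity changed\n", notify->irq);
	/* rebind queues/buffers in ctx according to *mask here */
	(void)ctx;
}

/* Runs when the last reference to the notifier is dropped. */
static void my_affinity_release(struct kref *kref)
{
	struct irq_affinity_notify *notify =
		container_of(kref, struct irq_affinity_notify, kref);

	pr_info("mydrv: irq %u notifier released\n", notify->irq);
}

static int my_register_affinity_notifier(struct my_irq_ctx *ctx,
					 unsigned int irq)
{
	ctx->notify.notify = my_affinity_notify;
	ctx->notify.release = my_affinity_release;
	return irq_set_affinity_notifier(irq, &ctx->notify);
}

Unregistering is done by calling irq_set_affinity_notifier() again with a NULL notifier for the same IRQ, which drops the reference via the release callback.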
+#endif } irqd_set(data, IRQD_AFFINITY_SET); @@ -222,10 +288,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) } EXPORT_SYMBOL_GPL(irq_set_affinity_hint); -static void irq_affinity_notify(struct work_struct *work) +static void _irq_affinity_notify(struct irq_affinity_notify *notify) { - struct irq_affinity_notify *notify = - container_of(work, struct irq_affinity_notify, work); struct irq_desc *desc = irq_to_desc(notify->irq); cpumask_var_t cpumask; unsigned long flags; @@ -247,6 +311,13 @@ out: kref_put(¬ify->kref, notify->release); } +static void irq_affinity_notify(struct work_struct *work) +{ + struct irq_affinity_notify *notify = + container_of(work, struct irq_affinity_notify, work); + _irq_affinity_notify(notify); +} + /** * irq_set_affinity_notifier - control notification of IRQ affinity changes * @irq: Interrupt for which to enable/disable notification @@ -276,6 +347,8 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) notify->irq = irq; kref_init(¬ify->kref); INIT_WORK(¬ify->work, irq_affinity_notify); + INIT_LIST_HEAD(¬ify->list); + init_helper_thread(); } raw_spin_lock_irqsave(&desc->lock, flags); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 505e08f..d7c1471 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1321,10 +1321,6 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); p->on_rq = 1; - - /* if a worker is waking up, notify workqueue */ - if (p->flags & PF_WQ_WORKER) - wq_worker_waking_up(p, cpu_of(rq)); } /* @@ -1551,42 +1547,6 @@ out: } /** - * try_to_wake_up_local - try to wake up a local task with rq lock held - * @p: the thread to be awakened - * - * Put @p on the run-queue if it's not already there. The caller must - * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. - */ -static void try_to_wake_up_local(struct task_struct *p) -{ - struct rq *rq = task_rq(p); - - if (WARN_ON_ONCE(rq != this_rq()) || - WARN_ON_ONCE(p == current)) - return; - - lockdep_assert_held(&rq->lock); - - if (!raw_spin_trylock(&p->pi_lock)) { - raw_spin_unlock(&rq->lock); - raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); - } - - if (!(p->state & TASK_NORMAL)) - goto out; - - if (!p->on_rq) - ttwu_activate(rq, p, ENQUEUE_WAKEUP); - - ttwu_do_wakeup(rq, p, 0); - ttwu_stat(p, smp_processor_id(), 0); -out: - raw_spin_unlock(&p->pi_lock); -} - -/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -3139,21 +3099,6 @@ need_resched: } else { deactivate_task(rq, prev, DEQUEUE_SLEEP); prev->on_rq = 0; - - /* - * If a worker went to sleep, notify and ask workqueue - * whether it wants to wake up a task to maintain - * concurrency. - * Only call wake up if prev isn't blocked on a sleeping - * spin lock. - */ - if (prev->flags & PF_WQ_WORKER && !prev->saved_state) { - struct task_struct *to_wakeup; - - to_wakeup = wq_worker_sleeping(prev, cpu); - if (to_wakeup) - try_to_wake_up_local(to_wakeup); - } } switch_count = &prev->nvcsw; } @@ -3197,6 +3142,14 @@ static inline void sched_submit_work(struct task_struct *tsk) { if (!tsk->state || tsk_is_pi_blocked(tsk)) return; + + /* + * If a worker went to sleep, notify and ask workqueue whether + * it wants to wake up a task to maintain concurrency. + */ + if (tsk->flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. 
@@ -3205,12 +3158,19 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } +static inline void sched_update_worker(struct task_struct *tsk) +{ + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); +} + asmlinkage void __sched schedule(void) { struct task_struct *tsk = current; sched_submit_work(tsk); __schedule(); + sched_update_worker(tsk); } EXPORT_SYMBOL(schedule); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index bb1edfa..3e93516 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -529,10 +530,49 @@ static void sync_cmos_clock(struct work_struct *work) schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); } +#ifdef CONFIG_PREEMPT_RT_FULL +/* + * RT can not call schedule_delayed_work from real interrupt context. + * Need to make a thread to do the real work. + */ +static struct task_struct *cmos_delay_thread; +static bool do_cmos_delay; + +static int run_cmos_delay(void *ignore) +{ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (do_cmos_delay) { + do_cmos_delay = false; + schedule_delayed_work(&sync_cmos_work, 0); + } + schedule(); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static void notify_cmos_timer(void) +{ + do_cmos_delay = true; + /* Make visible before waking up process */ + smp_wmb(); + wake_up_process(cmos_delay_thread); +} + +static __init int create_cmos_delay_thread(void) +{ + cmos_delay_thread = kthread_run(run_cmos_delay, NULL, "kcmosdelayd"); + BUG_ON(!cmos_delay_thread); + return 0; +} +early_initcall(create_cmos_delay_thread); +#else static void notify_cmos_timer(void) { schedule_delayed_work(&sync_cmos_work, 0); } +#endif /* CONFIG_PREEMPT_RT_FULL */ #else static inline void notify_cmos_timer(void) { } diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 3947835..1bbb1b2 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -44,6 +44,7 @@ u64 notrace trace_clock_local(void) return clock; } +EXPORT_SYMBOL_GPL(trace_clock_local); /* * trace_clock(): 'between' trace clock. Not completely serialized, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 99855b3..788b017 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -149,6 +149,7 @@ struct worker { unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ + int sleeping; /* None */ /* for rebinding worker to CPU */ struct work_struct rebind_work; /* L: for busy worker */ @@ -730,54 +731,45 @@ static void wake_up_worker(struct worker_pool *pool) } /** - * wq_worker_waking_up - a worker is waking up - * @task: task waking up - * @cpu: CPU @task is waking up to + * wq_worker_running - a worker is running again + * @task: task returning from sleep * - * This function is called during try_to_wake_up() when a worker is - * being awoken. 
- * - * CONTEXT: - * spin_lock_irq(rq->lock) + * This function is called when a worker returns from schedule() */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +void wq_worker_running(struct task_struct *task) { struct worker *worker = kthread_data(task); - if (!(worker->flags & WORKER_NOT_RUNNING)) { - WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu); + if (!worker->sleeping) + return; + if (!(worker->flags & WORKER_NOT_RUNNING)) atomic_inc(get_pool_nr_running(worker->pool)); - } + worker->sleeping = 0; } /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep - * @cpu: CPU in question, must be the current CPU number - * - * This function is called during schedule() when a busy worker is - * going to sleep. Worker on the same cpu can be woken up by - * returning pointer to its task. - * - * CONTEXT: - * spin_lock_irq(rq->lock) - * - * RETURNS: - * Worker task on @cpu to wake up, %NULL if none. + * This function is called from schedule() when a busy worker is + * going to sleep. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu) +void wq_worker_sleeping(struct task_struct *task) { - struct worker *worker = kthread_data(task), *to_wakeup = NULL; - struct worker_pool *pool = worker->pool; - atomic_t *nr_running = get_pool_nr_running(pool); + struct worker *next, *worker = kthread_data(task); + struct worker_pool *pool; + atomic_t *nr_running; if (worker->flags & WORKER_NOT_RUNNING) - return NULL; + return; + + if (WARN_ON_ONCE(worker->sleeping)) + return; - /* this can only happen on the local cpu */ - BUG_ON(cpu != raw_smp_processor_id()); + pool = worker->pool; + nr_running = get_pool_nr_running(pool); + worker->sleeping = 1; + spin_lock_irq(&pool->gcwq->lock); /* * The counterpart of the following dec_and_test, implied mb, * worklist not empty test sequence is in insert_work(). @@ -789,9 +781,12 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, * manipulating idle_list, so dereferencing idle_list without gcwq * lock is safe. */ - if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) - to_wakeup = first_worker(pool); - return to_wakeup ? to_wakeup->task : NULL; + if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) { + next = first_worker(pool); + if (next) + wake_up_process(next->task); + } + spin_unlock_irq(&pool->gcwq->lock); } /** diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h index 2d10fc9..3bf73e2 100644 --- a/kernel/workqueue_sched.h +++ b/kernel/workqueue_sched.h @@ -4,6 +4,5 @@ * Scheduler hooks for concurrency managed workqueue. Only to be * included from sched.c and workqueue.c. */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu); +void wq_worker_running(struct task_struct *task); +void wq_worker_sleeping(struct task_struct *task); diff --git a/localversion-rt b/localversion-rt index 18777ec..1199eba 100644 --- a/localversion-rt +++ b/localversion-rt @@ -1 +1 @@ --rt15 +-rt16
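The notify_cmos_timer() rework and the affinity-cb helper earlier in this diff follow the same PREEMPT_RT_FULL pattern: work that a non-RT kernel would queue directly from hard interrupt context is instead recorded (a flag or a list entry) and a dedicated kthread is woken to perform it from process context. A stripped-down sketch of that pattern with invented names, kept deliberately close to the ntp.c code above:

#include <linux/err.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/printk.h>
#include <linux/sched.h>

static struct task_struct *deferred_thread;
static bool deferred_pending;

/* Stand-in for whatever must not be queued from hard interrupt context on RT. */
static void do_deferred_work(void)
{
	pr_info("deferred work running in process context\n");
}

static int deferred_fn(void *unused)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (deferred_pending) {
			deferred_pending = false;
			__set_current_state(TASK_RUNNING);
			do_deferred_work();
			continue;
		}
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

/* Callable from hard interrupt context, even with PREEMPT_RT_FULL. */
static void kick_deferred_work(void)
{
	deferred_pending = true;
	/* make the flag visible before waking the thread */
	smp_wmb();
	wake_up_process(deferred_thread);
}

static int __init deferred_init(void)
{
	deferred_thread = kthread_run(deferred_fn, NULL, "kdeferredd");
	return IS_ERR(deferred_thread) ? PTR_ERR(deferred_thread) : 0;
}
early_initcall(deferred_init);

An interrupt handler would then call kick_deferred_work() where it previously would have called schedule_delayed_work() or a similar softirq/workqueue entry point.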