Index: linux-2.6/drivers/rtc/interface.c =================================================================== --- linux-2.6.orig/drivers/rtc/interface.c +++ linux-2.6/drivers/rtc/interface.c @@ -636,6 +636,29 @@ void rtc_irq_unregister(struct rtc_devic } EXPORT_SYMBOL_GPL(rtc_irq_unregister); +static int rtc_update_hrtimer(struct rtc_device *rtc, int enabled) +{ + /* + * We unconditionally cancel the timer here, because otherwise + * we could run into BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); + * when we manage to start the timer before the callback + * returns HRTIMER_RESTART. + * + * We cannot use hrtimer_cancel() here as a running callback + * could be blocked on rtc->irq_task_lock and hrtimer_cancel() + * would spin forever. + */ + if (hrtimer_try_to_cancel(&rtc->pie_timer) < 0) + return -1; + + if (enabled) { + ktime_t period = ktime_set(0, NSEC_PER_SEC / rtc->irq_freq); + + hrtimer_start(&rtc->pie_timer, period, HRTIMER_MODE_REL); + } + return 0; +} + /** * rtc_irq_set_state - enable/disable 2^N Hz periodic IRQs * @rtc: the rtc device @@ -651,21 +674,21 @@ int rtc_irq_set_state(struct rtc_device int err = 0; unsigned long flags; +retry: spin_lock_irqsave(&rtc->irq_task_lock, flags); if (rtc->irq_task != NULL && task == NULL) err = -EBUSY; if (rtc->irq_task != task) err = -EACCES; - - if (enabled) { - ktime_t period = ktime_set(0, NSEC_PER_SEC/rtc->irq_freq); - hrtimer_start(&rtc->pie_timer, period, HRTIMER_MODE_REL); - } else { - hrtimer_cancel(&rtc->pie_timer); + if (!err) { + if (rtc_update_hrtimer(rtc, enabled) < 0) { + spin_unlock_irqrestore(&rtc->irq_task_lock, flags); + cpu_relax(); + goto retry; + } + rtc->pie_enabled = enabled; } - rtc->pie_enabled = enabled; spin_unlock_irqrestore(&rtc->irq_task_lock, flags); - return err; } EXPORT_SYMBOL_GPL(rtc_irq_set_state); @@ -685,22 +708,20 @@ int rtc_irq_set_freq(struct rtc_device * int err = 0; unsigned long flags; - if (freq <= 0) + if (freq <= 0 || freq > 5000) return -EINVAL; - +retry: spin_lock_irqsave(&rtc->irq_task_lock, flags); if (rtc->irq_task != NULL && task == NULL) err = -EBUSY; if (rtc->irq_task != task) err = -EACCES; - if (err == 0) { + if (!err) { rtc->irq_freq = freq; - if (rtc->pie_enabled) { - ktime_t period; - hrtimer_cancel(&rtc->pie_timer); - period = ktime_set(0, NSEC_PER_SEC/rtc->irq_freq); - hrtimer_start(&rtc->pie_timer, period, - HRTIMER_MODE_REL); + if (rtc->pie_enabled && rtc_update_hrtimer(rtc, 1) < 0) { + spin_unlock_irqrestore(&rtc->irq_task_lock, flags); + cpu_relax(); + goto retry; } } spin_unlock_irqrestore(&rtc->irq_task_lock, flags); Index: linux-2.6/kernel/trace/ftrace.c =================================================================== --- linux-2.6.orig/kernel/trace/ftrace.c +++ linux-2.6/kernel/trace/ftrace.c @@ -1182,8 +1182,14 @@ alloc_and_copy_ftrace_hash(int size_bits return NULL; } +static void +ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); +static void +ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); + static int -ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) +ftrace_hash_move(struct ftrace_ops *ops, int enable, + struct ftrace_hash **dst, struct ftrace_hash *src) { struct ftrace_func_entry *entry; struct hlist_node *tp, *tn; @@ -1193,9 +1199,16 @@ ftrace_hash_move(struct ftrace_hash **ds unsigned long key; int size = src->count; int bits = 0; + int ret; int i; /* + * Remove the current set, update the hash and add + * them back. 
+ */ + ftrace_hash_rec_disable(ops, enable); + + /* * If the new source is empty, just free dst and assign it * the empty_hash. */ @@ -1215,9 +1228,10 @@ ftrace_hash_move(struct ftrace_hash **ds if (bits > FTRACE_HASH_MAX_BITS) bits = FTRACE_HASH_MAX_BITS; + ret = -ENOMEM; new_hash = alloc_ftrace_hash(bits); if (!new_hash) - return -ENOMEM; + goto out; size = 1 << src->size_bits; for (i = 0; i < size; i++) { @@ -1236,7 +1250,16 @@ ftrace_hash_move(struct ftrace_hash **ds rcu_assign_pointer(*dst, new_hash); free_ftrace_hash_rcu(old_hash); - return 0; + ret = 0; + out: + /* + * Enable regardless of ret: + * On success, we enable the new hash. + * On failure, we re-enable the original hash. + */ + ftrace_hash_rec_enable(ops, enable); + + return ret; } /* @@ -2857,7 +2880,7 @@ ftrace_set_regex(struct ftrace_ops *ops, ftrace_match_records(hash, buf, len); mutex_lock(&ftrace_lock); - ret = ftrace_hash_move(orig_hash, hash); + ret = ftrace_hash_move(ops, enable, orig_hash, hash); mutex_unlock(&ftrace_lock); mutex_unlock(&ftrace_regex_lock); @@ -3040,18 +3063,12 @@ ftrace_regex_release(struct inode *inode orig_hash = &iter->ops->notrace_hash; mutex_lock(&ftrace_lock); - /* - * Remove the current set, update the hash and add - * them back. - */ - ftrace_hash_rec_disable(iter->ops, filter_hash); - ret = ftrace_hash_move(orig_hash, iter->hash); - if (!ret) { - ftrace_hash_rec_enable(iter->ops, filter_hash); - if (iter->ops->flags & FTRACE_OPS_FL_ENABLED - && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); - } + ret = ftrace_hash_move(iter->ops, filter_hash, + orig_hash, iter->hash); + if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) + && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftrace_lock); } free_ftrace_hash(iter->hash); Index: linux-2.6/drivers/block/floppy.c =================================================================== --- linux-2.6.orig/drivers/block/floppy.c +++ linux-2.6/drivers/block/floppy.c @@ -4250,7 +4250,7 @@ static int __init floppy_init(void) use_virtual_dma = can_use_virtual_dma & 1; fdc_state[0].address = FDC1; if (fdc_state[0].address == -1) { - del_timer(&fd_timeout); + del_timer_sync(&fd_timeout); err = -ENODEV; goto out_unreg_region; } @@ -4261,7 +4261,7 @@ static int __init floppy_init(void) fdc = 0; /* reset fdc in case of unexpected interrupt */ err = floppy_grab_irq_and_dma(); if (err) { - del_timer(&fd_timeout); + del_timer_sync(&fd_timeout); err = -EBUSY; goto out_unreg_region; } @@ -4318,7 +4318,7 @@ static int __init floppy_init(void) user_reset_fdc(-1, FD_RESET_ALWAYS, false); } fdc = 0; - del_timer(&fd_timeout); + del_timer_sync(&fd_timeout); current_drive = 0; initialized = true; if (have_no_fdc) { @@ -4368,7 +4368,7 @@ out_unreg_blkdev: unregister_blkdev(FLOPPY_MAJOR, "fd"); out_put_disk: while (dr--) { - del_timer(&motor_off_timer[dr]); + del_timer_sync(&motor_off_timer[dr]); if (disks[dr]->queue) blk_cleanup_queue(disks[dr]->queue); put_disk(disks[dr]); Index: linux-2.6/drivers/gpu/drm/drm_irq.c =================================================================== --- linux-2.6.orig/drivers/gpu/drm/drm_irq.c +++ linux-2.6/drivers/gpu/drm/drm_irq.c @@ -109,10 +109,7 @@ static void vblank_disable_and_save(stru /* Prevent vblank irq processing while disabling vblank irqs, * so no updates of timestamps or count can happen after we've * disabled. Needed to prevent races in case of delayed irq's. 
- * Disable preemption, so vblank_time_lock is held as short as - * possible, even under a kernel with PREEMPT_RT patches. */ - preempt_disable(); spin_lock_irqsave(&dev->vblank_time_lock, irqflags); dev->driver->disable_vblank(dev, crtc); @@ -163,7 +160,6 @@ static void vblank_disable_and_save(stru clear_vblank_timestamps(dev, crtc); spin_unlock_irqrestore(&dev->vblank_time_lock, irqflags); - preempt_enable(); } static void vblank_disable_fn(unsigned long arg) @@ -875,10 +871,6 @@ int drm_vblank_get(struct drm_device *de spin_lock_irqsave(&dev->vbl_lock, irqflags); /* Going from 0->1 means we have to enable interrupts again */ if (atomic_add_return(1, &dev->vblank_refcount[crtc]) == 1) { - /* Disable preemption while holding vblank_time_lock. Do - * it explicitely to guard against PREEMPT_RT kernel. - */ - preempt_disable(); spin_lock_irqsave(&dev->vblank_time_lock, irqflags2); if (!dev->vblank_enabled[crtc]) { /* Enable vblank irqs under vblank_time_lock protection. @@ -898,7 +890,6 @@ int drm_vblank_get(struct drm_device *de } } spin_unlock_irqrestore(&dev->vblank_time_lock, irqflags2); - preempt_enable(); } else { if (!dev->vblank_enabled[crtc]) { atomic_dec(&dev->vblank_refcount[crtc]); Index: linux-2.6/arch/x86/kernel/kprobes.c =================================================================== --- linux-2.6.orig/arch/x86/kernel/kprobes.c +++ linux-2.6/arch/x86/kernel/kprobes.c @@ -475,7 +475,6 @@ static void __kprobes setup_singlestep(s * stepping. */ regs->ip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); return; } #endif Index: linux-2.6/drivers/ide/ide_platform.c =================================================================== --- linux-2.6.orig/drivers/ide/ide_platform.c +++ linux-2.6/drivers/ide/ide_platform.c @@ -95,7 +95,7 @@ static int __devinit plat_ide_probe(stru plat_ide_setup_ports(&hw, base, alt_base, pdata, res_irq->start); hw.dev = &pdev->dev; - d.irq_flags = res_irq->flags; + d.irq_flags = 0; if (mmio) d.host_flags |= IDE_HFLAG_MMIO; Index: linux-2.6/kernel/sched.c =================================================================== --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -185,6 +185,7 @@ void init_rt_bandwidth(struct rt_bandwid hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.irqsafe = 1; rt_b->rt_period_timer.function = sched_rt_period_timer; } @@ -800,7 +801,11 @@ late_initcall(sched_init_debug); * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ +#ifndef CONFIG_PREEMPT_RT_FULL const_debug unsigned int sysctl_sched_nr_migrate = 32; +#else +const_debug unsigned int sysctl_sched_nr_migrate = 8; +#endif /* * period over which we average the RT time consumption, measured @@ -1136,6 +1141,7 @@ static void init_rq_hrtick(struct rq *rq hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.irqsafe = 1; } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@ -2378,11 +2384,11 @@ static int select_fallback_rq(int cpu, s /* Look for allowed, online CPU in same node. */ for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; /* Any allowed, online CPU? 
*/ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); + dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); if (dest_cpu < nr_cpu_ids) return dest_cpu; @@ -2419,7 +2425,7 @@ int select_task_rq(struct task_struct *p * [ this allows ->select_task() to simply return task_cpu(p) and * not worry about this generic constraint ] */ - if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || + if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || !cpu_online(cpu))) cpu = select_fallback_rq(task_cpu(p), p); @@ -2477,10 +2483,6 @@ static void ttwu_activate(struct rq *rq, { activate_task(rq, p, en_flags); p->on_rq = 1; - - /* if a worker is waking up, notify workqueue */ - if (p->flags & PF_WQ_WORKER) - wq_worker_waking_up(p, cpu_of(rq)); } /* @@ -2678,8 +2680,25 @@ try_to_wake_up(struct task_struct *p, un smp_wmb(); raw_spin_lock_irqsave(&p->pi_lock, flags); - if (!(p->state & state)) + if (!(p->state & state)) { + /* + * The task might be running due to a spinlock sleeper + * wakeup. Check the saved state and set it to running + * if the wakeup condition is true. + */ + if (!(wake_flags & WF_LOCK_SLEEPER)) { + if (p->saved_state & state) + p->saved_state = TASK_RUNNING; + } goto out; + } + + /* + * If this is a regular wakeup, then we can unconditionally + * clear the saved state of a "lock sleeper". + */ + if (!(wake_flags & WF_LOCK_SLEEPER)) + p->saved_state = TASK_RUNNING; success = 1; /* we're going to change ->state */ cpu = task_cpu(p); @@ -2735,40 +2754,6 @@ out: } /** - * try_to_wake_up_local - try to wake up a local task with rq lock held - * @p: the thread to be awakened - * - * Put @p on the run-queue if it's not already there. The caller must - * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. - */ -static void try_to_wake_up_local(struct task_struct *p) -{ - struct rq *rq = task_rq(p); - - BUG_ON(rq != this_rq()); - BUG_ON(p == current); - lockdep_assert_held(&rq->lock); - - if (!raw_spin_trylock(&p->pi_lock)) { - raw_spin_unlock(&rq->lock); - raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); - } - - if (!(p->state & TASK_NORMAL)) - goto out; - - if (!p->on_rq) - ttwu_activate(rq, p, ENQUEUE_WAKEUP); - - ttwu_do_wakeup(rq, p, 0); - ttwu_stat(p, smp_processor_id(), 0); -out: - raw_spin_unlock(&p->pi_lock); -} - -/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -2785,6 +2770,18 @@ int wake_up_process(struct task_struct * } EXPORT_SYMBOL(wake_up_process); +/** + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock" + * @p: The process to be woken up. + * + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate + * the nature of the wakeup. + */ +int wake_up_lock_sleeper(struct task_struct *p) +{ + return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER); +} + int wake_up_state(struct task_struct *p, unsigned int state) { return try_to_wake_up(p, state, 0); @@ -2825,7 +2822,7 @@ static void __sched_fork(struct task_str void sched_fork(struct task_struct *p) { unsigned long flags; - int cpu = get_cpu(); + int cpu; __sched_fork(p); /* @@ -2865,6 +2862,7 @@ void sched_fork(struct task_struct *p) if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; + cpu = get_cpu(); if (p->sched_class->task_fork) p->sched_class->task_fork(p); @@ -2876,8 +2874,9 @@ void sched_fork(struct task_struct *p) * Silence PROVE_RCU. 
*/ raw_spin_lock_irqsave(&p->pi_lock, flags); - set_task_cpu(p, cpu); + set_task_cpu(p, smp_processor_id()); raw_spin_unlock_irqrestore(&p->pi_lock, flags); + put_cpu(); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -2893,8 +2892,6 @@ void sched_fork(struct task_struct *p) #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); #endif - - put_cpu(); } /* @@ -3060,8 +3057,12 @@ static void finish_task_switch(struct rq finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); + /* + * We use mmdrop_delayed() here so we don't have to do the + * full __mmdrop() when we are the last user. + */ if (mm) - mmdrop(mm); + mmdrop_delayed(mm); if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this @@ -4242,9 +4243,9 @@ pick_next_task(struct rq *rq) } /* - * schedule() is the main scheduler function. + * __schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +static void __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -4272,29 +4273,6 @@ need_resched: } else { deactivate_task(rq, prev, DEQUEUE_SLEEP); prev->on_rq = 0; - - /* - * If a worker went to sleep, notify and ask workqueue - * whether it wants to wake up a task to maintain - * concurrency. - */ - if (prev->flags & PF_WQ_WORKER) { - struct task_struct *to_wakeup; - - to_wakeup = wq_worker_sleeping(prev, cpu); - if (to_wakeup) - try_to_wake_up_local(to_wakeup); - } - - /* - * If we are going to sleep and we have plugged IO - * queued, make sure to submit it to avoid deadlocks. - */ - if (blk_needs_flush_plug(prev)) { - raw_spin_unlock(&rq->lock); - blk_schedule_flush_plug(prev); - raw_spin_lock(&rq->lock); - } } switch_count = &prev->nvcsw; } @@ -4328,12 +4306,62 @@ need_resched: post_schedule(rq); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); if (need_resched()) goto need_resched; } + +static inline void sched_submit_work(struct task_struct *tsk) +{ + if (!tsk->state || tsk_is_pi_blocked(tsk)) + return; + + /* + * If a worker went to sleep, notify and ask workqueue whether + * it wants to wake up a task to maintain concurrency. + */ + if (tsk->flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + + /* + * If we are going to sleep and we have plugged IO queued, + * make sure to submit it to avoid deadlocks. + */ + if (blk_needs_flush_plug(tsk)) + blk_schedule_flush_plug(tsk); +} + +static inline void sched_update_worker(struct task_struct *tsk) +{ + if (tsk_is_pi_blocked(tsk)) + return; + + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); +} + +asmlinkage void schedule(void) +{ + struct task_struct *tsk = current; + + sched_submit_work(tsk); + __schedule(); + sched_update_worker(tsk); +} EXPORT_SYMBOL(schedule); +/** + * schedule_preempt_disabled - called with preemption disabled + * + * Returns with preemption disabled. 
Note: preempt_count must be 1 + */ +void __sched schedule_preempt_disabled(void) +{ + __preempt_enable_no_resched(); + schedule(); + preempt_disable(); +} + #ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline bool owner_running(struct mutex *lock, struct task_struct *owner) @@ -4405,7 +4433,7 @@ asmlinkage void __sched notrace preempt_ do { add_preempt_count_notrace(PREEMPT_ACTIVE); - schedule(); + __schedule(); sub_preempt_count_notrace(PREEMPT_ACTIVE); /* @@ -4433,7 +4461,7 @@ asmlinkage void __sched preempt_schedule do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); - schedule(); + __schedule(); local_irq_disable(); sub_preempt_count(PREEMPT_ACTIVE); @@ -4828,9 +4856,8 @@ long __sched sleep_on_timeout(wait_queue EXPORT_SYMBOL(sleep_on_timeout); #ifdef CONFIG_RT_MUTEXES - /* - * rt_mutex_setprio - set the current priority of a task + * task_setprio - set the current priority of a task * @p: task * @prio: prio value (kernel-internal form) * @@ -4839,7 +4866,7 @@ EXPORT_SYMBOL(sleep_on_timeout); * * Used by the rt_mutex code to implement priority inheritance logic. */ -void rt_mutex_setprio(struct task_struct *p, int prio) +void task_setprio(struct task_struct *p, int prio) { int oldprio, on_rq, running; struct rq *rq; @@ -4849,6 +4876,24 @@ void rt_mutex_setprio(struct task_struct rq = __task_rq_lock(p); + /* + * Idle task boosting is a nono in general. There is one + * exception, when PREEMPT_RT and NOHZ is active: + * + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. + */ + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; @@ -4872,9 +4917,9 @@ void rt_mutex_setprio(struct task_struct enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); check_class_changed(rq, p, prev_class, oldprio); +out_unlock: __task_rq_unlock(rq); } - #endif void set_user_nice(struct task_struct *p, long nice) @@ -5543,7 +5588,7 @@ SYSCALL_DEFINE0(sched_yield) __release(rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); do_raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); schedule(); @@ -5557,9 +5602,17 @@ static inline int should_resched(void) static void __cond_resched(void) { - add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + do { + add_preempt_count(PREEMPT_ACTIVE); + __schedule(); + sub_preempt_count(PREEMPT_ACTIVE); + /* + * Check again in case we missed a preemption + * opportunity between schedule and now. + */ + barrier(); + + } while (need_resched()); } int __sched _cond_resched(void) @@ -5600,6 +5653,7 @@ int __cond_resched_lock(spinlock_t *lock } EXPORT_SYMBOL(__cond_resched_lock); +#ifndef CONFIG_PREEMPT_RT_FULL int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); @@ -5613,6 +5667,7 @@ int __sched __cond_resched_softirq(void) return 0; } EXPORT_SYMBOL(__cond_resched_softirq); +#endif /** * yield - yield the current processor to other threads. 
@@ -5859,7 +5914,7 @@ void show_state_filter(unsigned long sta printk(KERN_INFO " task PC stack pid father\n"); #endif - read_lock(&tasklist_lock); + rcu_read_lock(); do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow @@ -5875,7 +5930,7 @@ void show_state_filter(unsigned long sta #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); #endif - read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * Only show locks if all tasks are dumped: */ @@ -5997,12 +6052,12 @@ static inline void sched_init_granularit #ifdef CONFIG_SMP void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (p->sched_class && p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - else { - cpumask_copy(&p->cpus_allowed, new_mask); + if (!p->migrate_disable) { + if (p->sched_class && p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, new_mask); p->rt.nr_cpus_allowed = cpumask_weight(new_mask); } + cpumask_copy(&p->cpus_allowed, new_mask); } /* @@ -6053,7 +6108,7 @@ int set_cpus_allowed_ptr(struct task_str do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) + if (cpumask_test_cpu(task_cpu(p), new_mask) || p->migrate_disable) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); @@ -6072,6 +6127,83 @@ out: } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +void migrate_disable(void) +{ + struct task_struct *p = current; + const struct cpumask *mask; + unsigned long flags; + struct rq *rq; + + preempt_disable(); + if (p->migrate_disable) { + p->migrate_disable++; + preempt_enable(); + return; + } + + pin_current_cpu(); + if (unlikely(!scheduler_running)) { + p->migrate_disable = 1; + preempt_enable(); + return; + } + rq = task_rq_lock(p, &flags); + p->migrate_disable = 1; + mask = tsk_cpus_allowed(p); + + WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + + if (!cpumask_equal(&p->cpus_allowed, mask)) { + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->rt.nr_cpus_allowed = cpumask_weight(mask); + } + task_rq_unlock(rq, p, &flags); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + const struct cpumask *mask; + unsigned long flags; + struct rq *rq; + + WARN_ON_ONCE(p->migrate_disable <= 0); + + preempt_disable(); + if (p->migrate_disable > 1) { + p->migrate_disable--; + preempt_enable(); + return; + } + + if (unlikely(!scheduler_running)) { + p->migrate_disable = 0; + unpin_current_cpu(); + preempt_enable(); + return; + } + + rq = task_rq_lock(p, &flags); + p->migrate_disable = 0; + mask = tsk_cpus_allowed(p); + + WARN_ON(!cpumask_test_cpu(smp_processor_id(), mask)); + + if (!cpumask_equal(&p->cpus_allowed, mask)) { + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->rt.nr_cpus_allowed = cpumask_weight(mask); + } + + task_rq_unlock(rq, p, &flags); + unpin_current_cpu(); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(migrate_enable); + /* * Move (not current) task off this cpu, onto dest cpu. We're doing * this because either it can't run here any more (set_cpus_allowed() @@ -6100,7 +6232,7 @@ static int __migrate_task(struct task_st if (task_cpu(p) != src_cpu) goto done; /* Affinity changed (again). 
*/ - if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) goto fail; /* @@ -6142,6 +6274,8 @@ static int migration_cpu_stop(void *data #ifdef CONFIG_HOTPLUG_CPU +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm); + /* * Ensures that the idle task is using init_mm right before its cpu goes * offline. @@ -6154,7 +6288,12 @@ void idle_task_exit(void) if (mm != &init_mm) switch_mm(mm, &init_mm, current); - mmdrop(mm); + + /* + * Defer the cleanup to an alive cpu. On RT we can neither + * call mmdrop() nor mmdrop_delayed() from here. + */ + per_cpu(idle_last_mm, smp_processor_id()) = mm; } /* @@ -6472,6 +6611,12 @@ migration_call(struct notifier_block *nf migrate_nr_uninterruptible(rq); calc_global_load_remove(rq); break; + case CPU_DEAD: + if (per_cpu(idle_last_mm, cpu)) { + mmdrop(per_cpu(idle_last_mm, cpu)); + per_cpu(idle_last_mm, cpu) = NULL; + } + break; #endif } @@ -8188,7 +8333,8 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + int nested = (preempt_count() & ~PREEMPT_ACTIVE) + + sched_rcu_preempt_depth(); return (nested == preempt_offset); } Index: linux-2.6/block/blk-core.c =================================================================== --- linux-2.6.orig/block/blk-core.c +++ linux-2.6/block/blk-core.c @@ -236,7 +236,7 @@ EXPORT_SYMBOL(blk_delay_queue); **/ void blk_start_queue(struct request_queue *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); queue_flag_clear(QUEUE_FLAG_STOPPED, q); __blk_run_queue(q); @@ -301,7 +301,11 @@ void __blk_run_queue(struct request_queu { if (unlikely(blk_queue_stopped(q))) return; - + /* + * q->request_fn() can drop q->queue_lock and reenable + * interrupts, but must return with q->queue_lock held and + * interrupts disabled. + */ q->request_fn(q); } EXPORT_SYMBOL(__blk_run_queue); @@ -2667,11 +2671,11 @@ static void queue_unplugged(struct reque * this lock). */ if (from_schedule) { - spin_unlock(q->queue_lock); + spin_unlock_irq(q->queue_lock); blk_run_queue_async(q); } else { __blk_run_queue(q); - spin_unlock(q->queue_lock); + spin_unlock_irq(q->queue_lock); } } @@ -2697,7 +2701,6 @@ static void flush_plug_callbacks(struct void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request_queue *q; - unsigned long flags; struct request *rq; LIST_HEAD(list); unsigned int depth; @@ -2718,11 +2721,6 @@ void blk_flush_plug_list(struct blk_plug q = NULL; depth = 0; - /* - * Save and disable interrupts here, to avoid doing it for every - * queue lock we have to take. 
- */ - local_irq_save(flags); while (!list_empty(&list)) { rq = list_entry_rq(list.next); list_del_init(&rq->queuelist); @@ -2735,7 +2733,7 @@ void blk_flush_plug_list(struct blk_plug queue_unplugged(q, depth, from_schedule); q = rq->q; depth = 0; - spin_lock(q->queue_lock); + spin_lock_irq(q->queue_lock); } /* * rq is already accounted, so use raw insert @@ -2753,8 +2751,6 @@ void blk_flush_plug_list(struct blk_plug */ if (q) queue_unplugged(q, depth, from_schedule); - - local_irq_restore(flags); } void blk_finish_plug(struct blk_plug *plug) Index: linux-2.6/kernel/workqueue.c =================================================================== --- linux-2.6.orig/kernel/workqueue.c +++ linux-2.6/kernel/workqueue.c @@ -137,6 +137,7 @@ struct worker { unsigned int flags; /* X: flags */ int id; /* I: worker id */ struct work_struct rebind_work; /* L: rebind worker to cpu */ + int sleeping; /* None */ }; /* @@ -657,66 +658,58 @@ static void wake_up_worker(struct global } /** - * wq_worker_waking_up - a worker is waking up - * @task: task waking up - * @cpu: CPU @task is waking up to + * wq_worker_running - a worker is running again + * @task: task returning from sleep * - * This function is called during try_to_wake_up() when a worker is - * being awoken. - * - * CONTEXT: - * spin_lock_irq(rq->lock) + * This function is called when a worker returns from schedule() */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +void wq_worker_running(struct task_struct *task) { struct worker *worker = kthread_data(task); + if (!worker->sleeping) + return; if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(get_gcwq_nr_running(cpu)); + atomic_inc(get_gcwq_nr_running(smp_processor_id())); + worker->sleeping = 0; } /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep - * @cpu: CPU in question, must be the current CPU number - * - * This function is called during schedule() when a busy worker is - * going to sleep. Worker on the same cpu can be woken up by - * returning pointer to its task. - * - * CONTEXT: - * spin_lock_irq(rq->lock) * - * RETURNS: - * Worker task on @cpu to wake up, %NULL if none. + * This function is called from schedule() when a busy worker is + * going to sleep. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu) +void wq_worker_sleeping(struct task_struct *task) { - struct worker *worker = kthread_data(task), *to_wakeup = NULL; - struct global_cwq *gcwq = get_gcwq(cpu); - atomic_t *nr_running = get_gcwq_nr_running(cpu); + struct worker *worker = kthread_data(task); + struct global_cwq *gcwq; + int cpu; if (worker->flags & WORKER_NOT_RUNNING) - return NULL; + return; + + if (WARN_ON_ONCE(worker->sleeping)) + return; - /* this can only happen on the local cpu */ - BUG_ON(cpu != raw_smp_processor_id()); + worker->sleeping = 1; + cpu = smp_processor_id(); + gcwq = get_gcwq(cpu); + spin_lock_irq(&gcwq->lock); /* * The counterpart of the following dec_and_test, implied mb, * worklist not empty test sequence is in insert_work(). * Please read comment there. - * - * NOT_RUNNING is clear. This means that trustee is not in - * charge and we're running on the local cpu w/ rq lock held - * and preemption disabled, which in turn means that none else - * could be manipulating idle_list, so dereferencing idle_list - * without gcwq lock is safe. - */ - if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) - to_wakeup = first_worker(gcwq); - return to_wakeup ? 
to_wakeup->task : NULL; + */ + if (atomic_dec_and_test(get_gcwq_nr_running(cpu)) && + !list_empty(&gcwq->worklist)) { + worker = first_worker(gcwq); + if (worker) + wake_up_process(worker->task); + } + spin_unlock_irq(&gcwq->lock); } /** @@ -1067,8 +1060,8 @@ int queue_work(struct workqueue_struct * { int ret; - ret = queue_work_on(get_cpu(), wq, work); - put_cpu(); + ret = queue_work_on(get_cpu_light(), wq, work); + put_cpu_light(); return ret; } @@ -3484,6 +3477,25 @@ static int __devinit workqueue_cpu_callb kthread_stop(new_trustee); return NOTIFY_BAD; } + break; + case CPU_POST_DEAD: + case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: + case CPU_ONLINE: + break; + case CPU_DYING: + /* + * We access this lockless. We are on the dying CPU + * and called from stomp machine. + * + * Before this, the trustee and all workers except for + * the ones which are still executing works from + * before the last CPU down must be on the cpu. After + * this, they'll all be diasporas. + */ + gcwq->flags |= GCWQ_DISASSOCIATED; + default: + goto out; } /* some are called w/ irq disabled, don't disturb irq status */ @@ -3503,16 +3515,6 @@ static int __devinit workqueue_cpu_callb gcwq->first_idle = new_worker; break; - case CPU_DYING: - /* - * Before this, the trustee and all workers except for - * the ones which are still executing works from - * before the last CPU down must be on the cpu. After - * this, they'll all be diasporas. - */ - gcwq->flags |= GCWQ_DISASSOCIATED; - break; - case CPU_POST_DEAD: gcwq->trustee_state = TRUSTEE_BUTCHER; /* fall through */ @@ -3546,6 +3548,7 @@ static int __devinit workqueue_cpu_callb spin_unlock_irqrestore(&gcwq->lock, flags); +out: return notifier_from_errno(0); } Index: linux-2.6/kernel/workqueue_sched.h =================================================================== --- linux-2.6.orig/kernel/workqueue_sched.h +++ linux-2.6/kernel/workqueue_sched.h @@ -4,6 +4,5 @@ * Scheduler hooks for concurrency managed workqueue. Only to be * included from sched.c and workqueue.c. 
*/ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu); +void wq_worker_running(struct task_struct *task); +void wq_worker_sleeping(struct task_struct *task); Index: linux-2.6/arch/x86/kernel/cpu/intel_cacheinfo.c =================================================================== --- linux-2.6.orig/arch/x86/kernel/cpu/intel_cacheinfo.c +++ linux-2.6/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -151,28 +151,17 @@ union _cpuid4_leaf_ecx { u32 full; }; -struct amd_l3_cache { - struct amd_northbridge *nb; - unsigned indices; - u8 subcaches[4]; -}; - -struct _cpuid4_info { +struct _cpuid4_info_regs { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - struct amd_l3_cache *l3; - DECLARE_BITMAP(shared_cpu_map, NR_CPUS); + struct amd_northbridge *nb; }; -/* subset of above _cpuid4_info w/o shared_cpu_map */ -struct _cpuid4_info_regs { - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; - unsigned long size; - struct amd_l3_cache *l3; +struct _cpuid4_info { + struct _cpuid4_info_regs base; + DECLARE_BITMAP(shared_cpu_map, NR_CPUS); }; unsigned short num_cache_leaves; @@ -314,12 +303,13 @@ struct _cache_attr { /* * L3 cache descriptors */ -static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) +static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb) { + struct amd_l3_cache *l3 = &nb->l3_cache; unsigned int sc0, sc1, sc2, sc3; u32 val = 0; - pci_read_config_dword(l3->nb->misc, 0x1C4, &val); + pci_read_config_dword(nb->misc, 0x1C4, &val); /* calculate subcache sizes */ l3->subcaches[0] = sc0 = !(val & BIT(0)); @@ -333,33 +323,16 @@ static void __cpuinit amd_calc_l3_indice static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) { - static struct amd_l3_cache *__cpuinitdata l3_caches; int node; /* only for L3, and not in virtualized environments */ - if (index < 3 || amd_nb_num() == 0) + if (index < 3) return; - /* - * Strictly speaking, the amount in @size below is leaked since it is - * never freed but this is done only on shutdown so it doesn't matter. - */ - if (!l3_caches) { - int size = amd_nb_num() * sizeof(struct amd_l3_cache); - - l3_caches = kzalloc(size, GFP_ATOMIC); - if (!l3_caches) - return; - } - node = amd_get_nb_id(smp_processor_id()); - - if (!l3_caches[node].nb) { - l3_caches[node].nb = node_to_amd_nb(node); - amd_calc_l3_indices(&l3_caches[node]); - } - - this_leaf->l3 = &l3_caches[node]; + this_leaf->nb = node_to_amd_nb(node); + if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) + amd_calc_l3_indices(this_leaf->nb); } /* @@ -369,11 +342,11 @@ static void __cpuinit amd_init_l3_cache( * * @returns: the disabled index if used or negative value if slot free. 
*/ -int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) +int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) { unsigned int reg = 0; - pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, ®); + pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); /* check whether this slot is activated already */ if (reg & (3UL << 30)) @@ -387,11 +360,10 @@ static ssize_t show_cache_disable(struct { int index; - if (!this_leaf->l3 || - !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; - index = amd_get_l3_disable_slot(this_leaf->l3, slot); + index = amd_get_l3_disable_slot(this_leaf->base.nb, slot); if (index >= 0) return sprintf(buf, "%d\n", index); @@ -408,7 +380,7 @@ show_cache_disable_##slot(struct _cpuid4 SHOW_CACHE_DISABLE(0) SHOW_CACHE_DISABLE(1) -static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, +static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, unsigned slot, unsigned long idx) { int i; @@ -421,10 +393,10 @@ static void amd_l3_disable_index(struct for (i = 0; i < 4; i++) { u32 reg = idx | (i << 20); - if (!l3->subcaches[i]) + if (!nb->l3_cache.subcaches[i]) continue; - pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); /* * We need to WBINVD on a core on the node containing the L3 @@ -434,7 +406,7 @@ static void amd_l3_disable_index(struct wbinvd_on_cpu(cpu); reg |= BIT(31); - pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); } } @@ -448,24 +420,24 @@ static void amd_l3_disable_index(struct * * @return: 0 on success, error status on failure */ -int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, +int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot, unsigned long index) { int ret = 0; /* check if @slot is already used or the index is already disabled */ - ret = amd_get_l3_disable_slot(l3, slot); + ret = amd_get_l3_disable_slot(nb, slot); if (ret >= 0) return -EINVAL; - if (index > l3->indices) + if (index > nb->l3_cache.indices) return -EINVAL; /* check whether the other slot has disabled the same index already */ - if (index == amd_get_l3_disable_slot(l3, !slot)) + if (index == amd_get_l3_disable_slot(nb, !slot)) return -EINVAL; - amd_l3_disable_index(l3, cpu, slot, index); + amd_l3_disable_index(nb, cpu, slot, index); return 0; } @@ -480,8 +452,7 @@ static ssize_t store_cache_disable(struc if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->l3 || - !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); @@ -489,7 +460,7 @@ static ssize_t store_cache_disable(struc if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val); + err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); if (err) { if (err == -EEXIST) printk(KERN_WARNING "L3 disable slot %d in use!\n", @@ -518,7 +489,7 @@ static struct _cache_attr cache_disable_ static ssize_t show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) { - if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return -EINVAL; return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); @@ -533,7 
+504,7 @@ store_subcaches(struct _cpuid4_info *thi if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return -EINVAL; if (strict_strtoul(buf, 16, &val) < 0) @@ -769,7 +740,7 @@ static void __cpuinit cache_shared_cpu_m return; } this_leaf = CPUID4_INFO_IDX(cpu, index); - num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; + num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; if (num_threads_sharing == 1) cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map)); @@ -820,29 +791,19 @@ static void __cpuinit free_cache_attribu for (i = 0; i < num_cache_leaves; i++) cache_remove_shared_cpu_map(cpu, i); - kfree(per_cpu(ici_cpuid4_info, cpu)->l3); kfree(per_cpu(ici_cpuid4_info, cpu)); per_cpu(ici_cpuid4_info, cpu) = NULL; } -static int -__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) -{ - struct _cpuid4_info_regs *leaf_regs = - (struct _cpuid4_info_regs *)this_leaf; - - return cpuid4_cache_lookup_regs(index, leaf_regs); -} - static void __cpuinit get_cpu_leaves(void *_retval) { int j, *retval = _retval, cpu = smp_processor_id(); /* Do cpuid and store the results */ for (j = 0; j < num_cache_leaves; j++) { - struct _cpuid4_info *this_leaf; - this_leaf = CPUID4_INFO_IDX(cpu, j); - *retval = cpuid4_cache_lookup(j, this_leaf); + struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j); + + *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base); if (unlikely(*retval < 0)) { int i; @@ -900,16 +861,16 @@ static ssize_t show_##file_name(struct _ return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ } -show_one_plus(level, eax.split.level, 0); -show_one_plus(coherency_line_size, ebx.split.coherency_line_size, 1); -show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1); -show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); -show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); +show_one_plus(level, base.eax.split.level, 0); +show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1); +show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1); +show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1); +show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1); static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) { - return sprintf(buf, "%luK\n", this_leaf->size / 1024); + return sprintf(buf, "%luK\n", this_leaf->base.size / 1024); } static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, @@ -946,7 +907,7 @@ static inline ssize_t show_shared_cpu_li static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) { - switch (this_leaf->eax.split.type) { + switch (this_leaf->base.eax.split.type) { case CACHE_TYPE_DATA: return sprintf(buf, "Data\n"); case CACHE_TYPE_INST: @@ -1135,7 +1096,7 @@ static int __cpuinit cache_add_dev(struc ktype_cache.default_attrs = default_attrs; #ifdef CONFIG_AMD_NB - if (this_leaf->l3) + if (this_leaf->base.nb) ktype_cache.default_attrs = amd_l3_attrs(); #endif retval = kobject_init_and_add(&(this_object->kobj), Index: linux-2.6/arch/x86/include/asm/amd_nb.h =================================================================== --- linux-2.6.orig/arch/x86/include/asm/amd_nb.h +++ linux-2.6/arch/x86/include/asm/amd_nb.h @@ -19,9 +19,15 @@ extern int amd_numa_init(void); extern int 
amd_get_subcaches(int); extern int amd_set_subcaches(int, int); +struct amd_l3_cache { + unsigned indices; + u8 subcaches[4]; +}; + struct amd_northbridge { struct pci_dev *misc; struct pci_dev *link; + struct amd_l3_cache l3_cache; }; struct amd_northbridge_info { Index: linux-2.6/arch/mips/sibyte/sb1250/irq.c =================================================================== --- linux-2.6.orig/arch/mips/sibyte/sb1250/irq.c +++ linux-2.6/arch/mips/sibyte/sb1250/irq.c @@ -178,7 +178,7 @@ static void ack_sb1250_irq(struct irq_da static struct irq_chip sb1250_irq_type = { .name = "SB1250-IMR", - .irq_mask_ack = ack_sb1250_irq, + .irq_mask = ack_sb1250_irq, .irq_unmask = enable_sb1250_irq, #ifdef CONFIG_SMP .irq_set_affinity = sb1250_set_affinity Index: linux-2.6/arch/mips/kernel/ftrace.c =================================================================== --- linux-2.6.orig/arch/mips/kernel/ftrace.c +++ linux-2.6/arch/mips/kernel/ftrace.c @@ -19,6 +19,26 @@ #include +#if defined(KBUILD_MCOUNT_RA_ADDRESS) && defined(CONFIG_32BIT) +#define MCOUNT_OFFSET_INSNS 5 +#else +#define MCOUNT_OFFSET_INSNS 4 +#endif + +/* + * Check if the address is in kernel space + * + * Clone core_kernel_text() from kernel/extable.c, but doesn't call + * init_kernel_text() for Ftrace doesn't trace functions in init sections. + */ +static inline int in_kernel_space(unsigned long ip) +{ + if (ip >= (unsigned long)_stext && + ip <= (unsigned long)_etext) + return 1; + return 0; +} + #ifdef CONFIG_DYNAMIC_FTRACE #define JAL 0x0c000000 /* jump & link: ip --> ra, jump to target */ @@ -54,20 +74,6 @@ static inline void ftrace_dyn_arch_init_ #endif } -/* - * Check if the address is in kernel space - * - * Clone core_kernel_text() from kernel/extable.c, but doesn't call - * init_kernel_text() for Ftrace doesn't trace functions in init sections. 
- */ -static inline int in_kernel_space(unsigned long ip) -{ - if (ip >= (unsigned long)_stext && - ip <= (unsigned long)_etext) - return 1; - return 0; -} - static int ftrace_modify_code(unsigned long ip, unsigned int new_code) { int faulted; @@ -112,11 +118,6 @@ static int ftrace_modify_code(unsigned l * 1: offset = 4 instructions */ -#if defined(KBUILD_MCOUNT_RA_ADDRESS) && defined(CONFIG_32BIT) -#define MCOUNT_OFFSET_INSNS 5 -#else -#define MCOUNT_OFFSET_INSNS 4 -#endif #define INSN_B_1F (0x10000000 | MCOUNT_OFFSET_INSNS) int ftrace_make_nop(struct module *mod, Index: linux-2.6/arch/mips/loongson/fuloong-2e/irq.c =================================================================== --- linux-2.6.orig/arch/mips/loongson/fuloong-2e/irq.c +++ linux-2.6/arch/mips/loongson/fuloong-2e/irq.c @@ -42,6 +42,7 @@ asmlinkage void mach_irq_dispatch(unsign static struct irqaction cascade_irqaction = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; void __init mach_init_irq(void) Index: linux-2.6/arch/mips/loongson/lemote-2f/irq.c =================================================================== --- linux-2.6.orig/arch/mips/loongson/lemote-2f/irq.c +++ linux-2.6/arch/mips/loongson/lemote-2f/irq.c @@ -96,12 +96,13 @@ static irqreturn_t ip6_action(int cpl, v struct irqaction ip6_irqaction = { .handler = ip6_action, .name = "cascade", - .flags = IRQF_SHARED, + .flags = IRQF_SHARED | IRQF_NO_THREAD, }; struct irqaction cascade_irqaction = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; void __init mach_init_irq(void) Index: linux-2.6/arch/mips/ar7/irq.c =================================================================== --- linux-2.6.orig/arch/mips/ar7/irq.c +++ linux-2.6/arch/mips/ar7/irq.c @@ -98,7 +98,8 @@ static struct irq_chip ar7_sec_irq_type static struct irqaction ar7_cascade_action = { .handler = no_action, - .name = "AR7 cascade interrupt" + .name = "AR7 cascade interrupt", + .flags = IRQF_NO_THREAD, }; static void __init ar7_irq_init(int base) Index: linux-2.6/arch/mips/bcm63xx/irq.c =================================================================== --- linux-2.6.orig/arch/mips/bcm63xx/irq.c +++ linux-2.6/arch/mips/bcm63xx/irq.c @@ -222,6 +222,7 @@ static struct irq_chip bcm63xx_external_ static struct irqaction cpu_ip2_cascade_action = { .handler = no_action, .name = "cascade_ip2", + .flags = IRQF_NO_THREAD, }; void __init arch_init_irq(void) Index: linux-2.6/arch/mips/cobalt/irq.c =================================================================== --- linux-2.6.orig/arch/mips/cobalt/irq.c +++ linux-2.6/arch/mips/cobalt/irq.c @@ -48,6 +48,7 @@ asmlinkage void plat_irq_dispatch(void) static struct irqaction cascade = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; void __init arch_init_irq(void) Index: linux-2.6/arch/mips/dec/setup.c =================================================================== --- linux-2.6.orig/arch/mips/dec/setup.c +++ linux-2.6/arch/mips/dec/setup.c @@ -101,20 +101,24 @@ int cpu_fpu_mask = DEC_CPU_IRQ_MASK(DEC_ static struct irqaction ioirq = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; static struct irqaction fpuirq = { .handler = no_action, .name = "fpu", + .flags = IRQF_NO_THREAD, }; static struct irqaction busirq = { .flags = IRQF_DISABLED, .name = "bus error", + .flags = IRQF_NO_THREAD, }; static struct irqaction haltirq = { .handler = dec_intr_halt, .name = "halt", + .flags = IRQF_NO_THREAD, }; Index: linux-2.6/arch/mips/emma/markeins/irq.c 
=================================================================== --- linux-2.6.orig/arch/mips/emma/markeins/irq.c +++ linux-2.6/arch/mips/emma/markeins/irq.c @@ -169,7 +169,7 @@ void emma2rh_gpio_irq_init(void) static struct irqaction irq_cascade = { .handler = no_action, - .flags = 0, + .flags = IRQF_NO_THREAD, .name = "cascade", .dev_id = NULL, .next = NULL, Index: linux-2.6/arch/mips/lasat/interrupt.c =================================================================== --- linux-2.6.orig/arch/mips/lasat/interrupt.c +++ linux-2.6/arch/mips/lasat/interrupt.c @@ -105,6 +105,7 @@ asmlinkage void plat_irq_dispatch(void) static struct irqaction cascade = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; void __init arch_init_irq(void) Index: linux-2.6/arch/mips/mti-malta/malta-int.c =================================================================== --- linux-2.6.orig/arch/mips/mti-malta/malta-int.c +++ linux-2.6/arch/mips/mti-malta/malta-int.c @@ -350,12 +350,14 @@ unsigned int plat_ipi_resched_int_xlate( static struct irqaction i8259irq = { .handler = no_action, - .name = "XT-PIC cascade" + .name = "XT-PIC cascade", + .flags = IRQF_NO_THREAD, }; static struct irqaction corehi_irqaction = { .handler = no_action, - .name = "CoreHi" + .name = "CoreHi", + .flags = IRQF_NO_THREAD, }; static msc_irqmap_t __initdata msc_irqmap[] = { Index: linux-2.6/arch/mips/pmc-sierra/msp71xx/msp_irq.c =================================================================== --- linux-2.6.orig/arch/mips/pmc-sierra/msp71xx/msp_irq.c +++ linux-2.6/arch/mips/pmc-sierra/msp71xx/msp_irq.c @@ -109,11 +109,13 @@ asmlinkage void plat_irq_dispatch(struct static struct irqaction cic_cascade_msp = { .handler = no_action, .name = "MSP CIC cascade" + .flags = IRQF_NO_THREAD, }; static struct irqaction per_cascade_msp = { .handler = no_action, .name = "MSP PER cascade" + .flags = IRQF_NO_THREAD, }; void __init arch_init_irq(void) Index: linux-2.6/arch/mips/pnx8550/common/int.c =================================================================== --- linux-2.6.orig/arch/mips/pnx8550/common/int.c +++ linux-2.6/arch/mips/pnx8550/common/int.c @@ -167,7 +167,7 @@ static struct irq_chip level_irq_type = static struct irqaction gic_action = { .handler = no_action, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NO_THREAD, .name = "GIC", }; Index: linux-2.6/arch/mips/sgi-ip22/ip22-int.c =================================================================== --- linux-2.6.orig/arch/mips/sgi-ip22/ip22-int.c +++ linux-2.6/arch/mips/sgi-ip22/ip22-int.c @@ -155,32 +155,32 @@ static void __irq_entry indy_buserror_ir static struct irqaction local0_cascade = { .handler = no_action, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NO_THREAD, .name = "local0 cascade", }; static struct irqaction local1_cascade = { .handler = no_action, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NO_THREAD, .name = "local1 cascade", }; static struct irqaction buserr = { .handler = no_action, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NO_THREAD, .name = "Bus Error", }; static struct irqaction map0_cascade = { .handler = no_action, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NO_THREAD, .name = "mapable0 cascade", }; #ifdef USE_LIO3_IRQ static struct irqaction map1_cascade = { .handler = no_action, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NO_THREAD, .name = "mapable1 cascade", }; #define SGI_INTERRUPTS SGINT_END Index: linux-2.6/arch/mips/sni/rm200.c 
=================================================================== --- linux-2.6.orig/arch/mips/sni/rm200.c +++ linux-2.6/arch/mips/sni/rm200.c @@ -359,6 +359,7 @@ void sni_rm200_init_8259A(void) static struct irqaction sni_rm200_irq2 = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; static struct resource sni_rm200_pic1_resource = { Index: linux-2.6/arch/mips/vr41xx/common/irq.c =================================================================== --- linux-2.6.orig/arch/mips/vr41xx/common/irq.c +++ linux-2.6/arch/mips/vr41xx/common/irq.c @@ -34,6 +34,7 @@ static irq_cascade_t irq_cascade[NR_IRQS static struct irqaction cascade_irqaction = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; int cascade_irq(unsigned int irq, int (*get_irq)(unsigned int)) Index: linux-2.6/arch/mips/Kconfig =================================================================== --- linux-2.6.orig/arch/mips/Kconfig +++ linux-2.6/arch/mips/Kconfig @@ -24,6 +24,7 @@ config MIPS select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW select HAVE_ARCH_JUMP_LABEL + select IRQ_FORCED_THREADING menu "Machine selection" @@ -2038,7 +2039,7 @@ config CPU_R4400_WORKAROUNDS # config HIGHMEM bool "High Memory Support" - depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM + depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !PREEMPT_RT_FULL config CPU_SUPPORTS_HIGHMEM bool Index: linux-2.6/arch/mips/kernel/traps.c =================================================================== --- linux-2.6.orig/arch/mips/kernel/traps.c +++ linux-2.6/arch/mips/kernel/traps.c @@ -364,7 +364,7 @@ static int regs_to_trapnr(struct pt_regs return (regs->cp0_cause >> 2) & 0x1f; } -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); void __noreturn die(const char *str, struct pt_regs *regs) { @@ -378,7 +378,7 @@ void __noreturn die(const char *str, str sig = 0; console_verbose(); - spin_lock_irq(&die_lock); + raw_spin_lock_irq(&die_lock); bust_spinlocks(1); #ifdef CONFIG_MIPS_MT_SMTC mips_mt_regdump(dvpret); @@ -387,7 +387,7 @@ void __noreturn die(const char *str, str printk("%s[#%d]:\n", str, ++die_counter); show_registers(regs); add_taint(TAINT_DIE); - spin_unlock_irq(&die_lock); + raw_spin_unlock_irq(&die_lock); if (in_interrupt()) panic("Fatal exception in interrupt"); Index: linux-2.6/arch/mips/kernel/signal.c =================================================================== --- linux-2.6.orig/arch/mips/kernel/signal.c +++ linux-2.6/arch/mips/kernel/signal.c @@ -603,6 +603,9 @@ static void do_signal(struct pt_regs *re if (!user_mode(regs)) return; + local_irq_enable(); + preempt_check_resched(); + if (test_thread_flag(TIF_RESTORE_SIGMASK)) oldset = ¤t->saved_sigmask; else Index: linux-2.6/arch/arm/kernel/signal.c =================================================================== --- linux-2.6.orig/arch/arm/kernel/signal.c +++ linux-2.6/arch/arm/kernel/signal.c @@ -673,6 +673,9 @@ static void do_signal(struct pt_regs *re if (!user_mode(regs)) return; + local_irq_enable(); + preempt_check_resched(); + /* * If we were from a system call, check for system call restarting... 
*/ Index: linux-2.6/kernel/time/clocksource.c =================================================================== --- linux-2.6.orig/kernel/time/clocksource.c +++ linux-2.6/kernel/time/clocksource.c @@ -186,6 +186,7 @@ static struct timer_list watchdog_timer; static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; +static atomic_t watchdog_reset_pending; static int clocksource_watchdog_kthread(void *data); static void __clocksource_change_rating(struct clocksource *cs, int rating); @@ -247,12 +248,14 @@ static void clocksource_watchdog(unsigne struct clocksource *cs; cycle_t csnow, wdnow; int64_t wd_nsec, cs_nsec; - int next_cpu; + int next_cpu, reset_pending; spin_lock(&watchdog_lock); if (!watchdog_running) goto out; + reset_pending = atomic_read(&watchdog_reset_pending); + list_for_each_entry(cs, &watchdog_list, wd_list) { /* Clocksource already marked unstable? */ @@ -268,7 +271,8 @@ static void clocksource_watchdog(unsigne local_irq_enable(); /* Clocksource initialized ? */ - if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || + atomic_read(&watchdog_reset_pending)) { cs->flags |= CLOCK_SOURCE_WATCHDOG; cs->wd_last = wdnow; cs->cs_last = csnow; @@ -283,8 +287,11 @@ static void clocksource_watchdog(unsigne cs->cs_last = csnow; cs->wd_last = wdnow; + if (atomic_read(&watchdog_reset_pending)) + continue; + /* Check the deviation from the watchdog clocksource. */ - if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { clocksource_unstable(cs, cs_nsec - wd_nsec); continue; } @@ -303,6 +310,13 @@ static void clocksource_watchdog(unsigne } /* + * We only clear the watchdog_reset_pending, when we did a + * full cycle through all clocksources. + */ + if (reset_pending) + atomic_dec(&watchdog_reset_pending); + + /* * Cycle through CPUs to check if the CPUs stay synchronized * to each other. */ @@ -344,23 +358,7 @@ static inline void clocksource_reset_wat static void clocksource_resume_watchdog(void) { - unsigned long flags; - - /* - * We use trylock here to avoid a potential dead lock when - * kgdb calls this code after the kernel has been stopped with - * watchdog_lock held. When watchdog_lock is held we just - * return and accept, that the watchdog might trigger and mark - * the monitored clock source (usually TSC) unstable. - * - * This does not affect the other caller clocksource_resume() - * because at this point the kernel is UP, interrupts are - * disabled and nothing can hold watchdog_lock. 
- */ - if (!spin_trylock_irqsave(&watchdog_lock, flags)) - return; - clocksource_reset_watchdog(); - spin_unlock_irqrestore(&watchdog_lock, flags); + atomic_inc(&watchdog_reset_pending); } static void clocksource_enqueue_watchdog(struct clocksource *cs) Index: linux-2.6/kernel/watchdog.c =================================================================== --- linux-2.6.orig/kernel/watchdog.c +++ linux-2.6/kernel/watchdog.c @@ -208,6 +208,8 @@ static struct perf_event_attr wd_hw_attr .disabled = 1, }; +static DEFINE_RAW_SPINLOCK(watchdog_output_lock); + /* Callback function for perf event subsystem */ static void watchdog_overflow_callback(struct perf_event *event, int nmi, struct perf_sample_data *data, @@ -234,10 +236,19 @@ static void watchdog_overflow_callback(s if (__this_cpu_read(hard_watchdog_warn) == true) return; - if (hardlockup_panic) + /* + * If early-printk is enabled then make sure we do not + * lock up in printk() and kill console logging: + */ + printk_kill(); + + if (hardlockup_panic) { panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - else + } else { + raw_spin_lock(&watchdog_output_lock); WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + raw_spin_unlock(&watchdog_output_lock); + } __this_cpu_write(hard_watchdog_warn, true); return; @@ -320,7 +331,7 @@ static enum hrtimer_restart watchdog_tim */ static int watchdog(void *unused) { - static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); sched_setscheduler(current, SCHED_FIFO, ¶m); @@ -349,7 +360,8 @@ static int watchdog(void *unused) set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); - + param.sched_priority = 0; + sched_setscheduler(current, SCHED_NORMAL, ¶m); return 0; } @@ -422,6 +434,7 @@ static void watchdog_prepare_cpu(int cpu WARN_ON(per_cpu(softlockup_watchdog, cpu)); hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; + hrtimer->irqsafe = 1; } static int watchdog_enable(int cpu) Index: linux-2.6/kernel/rtmutex-debug.c =================================================================== --- linux-2.6.orig/kernel/rtmutex-debug.c +++ linux-2.6/kernel/rtmutex-debug.c @@ -29,61 +29,6 @@ #include "rtmutex_common.h" -# define TRACE_WARN_ON(x) WARN_ON(x) -# define TRACE_BUG_ON(x) BUG_ON(x) - -# define TRACE_OFF() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - if (raw_spin_is_locked(¤t->pi_lock)) \ - raw_spin_unlock(¤t->pi_lock); \ - } \ -} while (0) - -# define TRACE_OFF_NOLOCK() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - } \ -} while (0) - -# define TRACE_BUG_LOCKED() \ -do { \ - TRACE_OFF(); \ - BUG(); \ -} while (0) - -# define TRACE_WARN_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) { \ - TRACE_OFF(); \ - WARN_ON(1); \ - } \ -} while (0) - -# define TRACE_BUG_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) \ - TRACE_BUG_LOCKED(); \ -} while (0) - -#ifdef CONFIG_SMP -# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) -#else -# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) -#endif - -/* - * deadlock detection flag. 
We turn it off when we detect - * the first problem because we dont want to recurse back - * into the tracing code when doing error printk or - * executing a BUG(): - */ -static int rt_trace_on = 1; - static void printk_task(struct task_struct *p) { if (p) @@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex void rt_mutex_debug_task_free(struct task_struct *task) { - WARN_ON(!plist_head_empty(&task->pi_waiters)); - WARN_ON(task->pi_blocked_on); + DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } /* @@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, { struct task_struct *task; - if (!rt_trace_on || detect || !act_waiter) + if (!debug_locks || detect || !act_waiter) return; task = rt_mutex_owner(act_waiter->lock); @@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struc { struct task_struct *task; - if (!waiter->deadlock_lock || !rt_trace_on) + if (!waiter->deadlock_lock || !debug_locks) return; rcu_read_lock(); @@ -149,7 +94,8 @@ void debug_rt_mutex_print_deadlock(struc return; } - TRACE_OFF_NOLOCK(); + if (!debug_locks_off()) + return; printk("\n============================================\n"); printk( "[ BUG: circular locking deadlock detected! ]\n"); @@ -180,7 +126,6 @@ void debug_rt_mutex_print_deadlock(struc printk("[ turning off deadlock detection." "Please report this trace. ]\n\n"); - local_irq_disable(); } void debug_rt_mutex_lock(struct rt_mutex *lock) @@ -189,7 +134,7 @@ void debug_rt_mutex_lock(struct rt_mutex void debug_rt_mutex_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); + DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); } void @@ -199,7 +144,7 @@ debug_rt_mutex_proxy_lock(struct rt_mute void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); + DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); } void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) @@ -213,8 +158,8 @@ void debug_rt_mutex_init_waiter(struct r void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { put_pid(waiter->deadlock_task_pid); - TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); - TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); memset(waiter, 0x22, sizeof(*waiter)); } Index: linux-2.6/arch/arm/kernel/perf_event.c =================================================================== --- linux-2.6.orig/arch/arm/kernel/perf_event.c +++ linux-2.6/arch/arm/kernel/perf_event.c @@ -420,7 +420,7 @@ armpmu_reserve_hardware(void) continue; err = request_irq(irq, handle_irq, - IRQF_DISABLED | IRQF_NOBALANCING, + IRQF_DISABLED | IRQF_NOBALANCING | IRQF_NO_THREAD, "armpmu", NULL); if (err) { pr_warning("unable to request IRQ%d for ARM perf " Index: linux-2.6/arch/arm/Kconfig =================================================================== --- linux-2.6.orig/arch/arm/Kconfig +++ linux-2.6/arch/arm/Kconfig @@ -29,6 +29,7 @@ config ARM select HAVE_GENERIC_HARDIRQS select HAVE_SPARSE_IRQ select GENERIC_IRQ_SHOW + select IRQ_FORCED_THREADING help The ARM series is a line of low-power-consumption RISC chip designs licensed by ARM Ltd and targeted at embedded applications and @@ -1510,7 +1511,7 @@ config HAVE_ARCH_PFN_VALID config HIGHMEM bool "High Memory Support" - depends on MMU + depends on MMU && !PREEMPT_RT_FULL help The address space of ARM processors is only 4 Gigabytes large and it has to 
accommodate user address space, kernel address Index: linux-2.6/arch/powerpc/platforms/85xx/mpc85xx_cds.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/85xx/mpc85xx_cds.c +++ linux-2.6/arch/powerpc/platforms/85xx/mpc85xx_cds.c @@ -178,7 +178,7 @@ static irqreturn_t mpc85xx_8259_cascade_ static struct irqaction mpc85xxcds_8259_irqaction = { .handler = mpc85xx_8259_cascade_action, - .flags = IRQF_SHARED, + .flags = IRQF_SHARED | IRQF_NO_THREAD, .name = "8259 cascade", }; #endif /* PPC_I8259 */ Index: linux-2.6/arch/powerpc/Kconfig =================================================================== --- linux-2.6.orig/arch/powerpc/Kconfig +++ linux-2.6/arch/powerpc/Kconfig @@ -69,10 +69,11 @@ config LOCKDEP_SUPPORT config RWSEM_GENERIC_SPINLOCK bool + default y if PREEMPT_RT_FULL config RWSEM_XCHGADD_ALGORITHM bool - default y + default y if !PREEMPT_RT_FULL config GENERIC_LOCKBREAK bool @@ -134,6 +135,7 @@ config PPC select GENERIC_IRQ_SHOW_LEVEL select HAVE_RCU_TABLE_FREE if SMP select HAVE_SYSCALL_TRACEPOINTS + select IRQ_FORCED_THREADING config EARLY_PRINTK bool @@ -271,7 +273,7 @@ menu "Kernel options" config HIGHMEM bool "High memory support" - depends on PPC32 + depends on PPC32 && !PREEMPT_RT_FULL source kernel/time/Kconfig source kernel/Kconfig.hz Index: linux-2.6/include/linux/sched.h =================================================================== --- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -359,6 +359,7 @@ extern signed long schedule_timeout_inte extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); +extern void schedule_preempt_disabled(void); extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; @@ -510,7 +511,7 @@ struct task_cputime { struct thread_group_cputimer { struct task_cputime cputime; int running; - spinlock_t lock; + raw_spinlock_t lock; }; #include @@ -1070,6 +1071,7 @@ struct sched_domain; #define WF_SYNC 0x01 /* waker goes to sleep after wakup */ #define WF_FORK 0x02 /* child wakeup after fork */ #define WF_MIGRATED 0x04 /* internal use, task got migrated */ +#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */ #define ENQUEUE_WAKEUP 1 #define ENQUEUE_HEAD 2 @@ -1219,6 +1221,7 @@ enum perf_event_task_context { struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ + volatile long saved_state; /* saved state for "spinlock sleepers" */ void *stack; atomic_t usage; unsigned int flags; /* per process flags, defined below */ @@ -1255,6 +1258,7 @@ struct task_struct { #endif unsigned int policy; + int migrate_disable; cpumask_t cpus_allowed; #ifdef CONFIG_PREEMPT_RCU @@ -1356,6 +1360,7 @@ struct task_struct { struct task_cputime cputime_expires; struct list_head cpu_timers[3]; + struct task_struct *posix_timer_list; /* process credentials */ const struct cred __rcu *real_cred; /* objective and real subjective task @@ -1389,6 +1394,7 @@ struct task_struct { /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; + struct sigqueue *sigqueue_cache; sigset_t blocked, real_blocked; sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ @@ -1432,6 +1438,7 @@ struct task_struct { /* mutex deadlock detection */ struct mutex_waiter *blocked_on; #endif + int pagefault_disabled; #ifdef CONFIG_TRACE_IRQFLAGS unsigned int irq_events; unsigned 
long hardirq_enable_ip; @@ -1558,6 +1565,12 @@ struct task_struct { unsigned long trace; /* bitmask and counter of trace recursion */ unsigned long trace_recursion; +#ifdef CONFIG_WAKEUP_LATENCY_HIST + u64 preempt_timestamp_hist; +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + unsigned long timer_offset; +#endif +#endif #endif /* CONFIG_TRACING */ #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */ struct memcg_batch_info { @@ -1570,11 +1583,12 @@ struct task_struct { #ifdef CONFIG_HAVE_HW_BREAKPOINT atomic_t ptrace_bp_refcnt; #endif +#ifdef CONFIG_PREEMPT_RT_BASE + struct rcu_head put_rcu; + int softirq_nestcnt; +#endif }; -/* Future-safe accessor for struct task_struct's cpus_allowed. */ -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) - /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH @@ -1743,6 +1757,15 @@ extern struct pid *cad_pid; extern void free_task(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +#ifdef CONFIG_PREEMPT_RT_BASE +extern void __put_task_struct_cb(struct rcu_head *rhp); + +static inline void put_task_struct(struct task_struct *t) +{ + if (atomic_dec_and_test(&t->usage)) + call_rcu(&t->put_rcu, __put_task_struct_cb); +} +#else extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) @@ -1750,6 +1773,7 @@ static inline void put_task_struct(struc if (atomic_dec_and_test(&t->usage)) __put_task_struct(t); } +#endif extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); @@ -1774,6 +1798,7 @@ extern void thread_group_times(struct ta #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ #define PF_KSWAPD 0x00040000 /* I am kswapd */ +#define PF_STOMPER 0x00080000 /* I am a stomp machine thread */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ @@ -2022,15 +2047,27 @@ static inline void sched_autogroup_exit( #endif #ifdef CONFIG_RT_MUTEXES +extern void task_setprio(struct task_struct *p, int prio); extern int rt_mutex_getprio(struct task_struct *p); -extern void rt_mutex_setprio(struct task_struct *p, int prio); +static inline void rt_mutex_setprio(struct task_struct *p, int prio) +{ + task_setprio(p, prio); +} extern void rt_mutex_adjust_pi(struct task_struct *p); +static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +{ + return tsk->pi_blocked_on != NULL; +} #else static inline int rt_mutex_getprio(struct task_struct *p) { return p->normal_prio; } # define rt_mutex_adjust_pi(p) do { } while (0) +static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +{ + return false; +} #endif extern bool yield_to(struct task_struct *p, bool preempt); @@ -2110,6 +2147,7 @@ extern void xtime_update(unsigned long t extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); +extern int wake_up_lock_sleeper(struct task_struct * tsk); extern void wake_up_new_task(struct task_struct *tsk); #ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); @@ -2199,12 +2237,24 @@ extern struct mm_struct * mm_alloc(void) /* mmdrop drops the mm and the page tables */ extern void 
__mmdrop(struct mm_struct *); + static inline void mmdrop(struct mm_struct * mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } +#ifdef CONFIG_PREEMPT_RT_BASE +extern void __mmdrop_delayed(struct rcu_head *rhp); +static inline void mmdrop_delayed(struct mm_struct *mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + call_rcu(&mm->delayed_drop, __mmdrop_delayed); +} +#else +# define mmdrop_delayed(mm) mmdrop(mm) +#endif + /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); /* Grab a reference to a task's mm, if it is not already going away */ @@ -2510,7 +2560,7 @@ extern int _cond_resched(void); extern int __cond_resched_lock(spinlock_t *lock); -#ifdef CONFIG_PREEMPT +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RT_FULL) #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET #else #define PREEMPT_LOCK_OFFSET 0 @@ -2521,12 +2571,16 @@ extern int __cond_resched_lock(spinlock_ __cond_resched_lock(lock); \ }) +#ifndef CONFIG_PREEMPT_RT_FULL extern int __cond_resched_softirq(void); #define cond_resched_softirq() ({ \ __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ __cond_resched_softirq(); \ }) +#else +# define cond_resched_softirq() cond_resched() +#endif /* * Does a critical section need to be broken due to another @@ -2550,7 +2604,7 @@ void thread_group_cputimer(struct task_s static inline void thread_group_cputime_init(struct signal_struct *sig) { - spin_lock_init(&sig->cputimer.lock); + raw_spin_lock_init(&sig->cputimer.lock); } /* @@ -2589,6 +2643,15 @@ static inline void set_task_cpu(struct t #endif /* CONFIG_SMP */ +/* Future-safe accessor for struct task_struct's cpus_allowed. */ +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p) +{ + if (p->migrate_disable) + return cpumask_of(task_cpu(p)); + + return &p->cpus_allowed; +} + extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); Index: linux-2.6/arch/arm/kernel/process.c =================================================================== --- linux-2.6.orig/arch/arm/kernel/process.c +++ linux-2.6/arch/arm/kernel/process.c @@ -209,9 +209,7 @@ void cpu_idle(void) } leds_event(led_idle_end); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/avr32/kernel/process.c =================================================================== --- linux-2.6.orig/arch/avr32/kernel/process.c +++ linux-2.6/arch/avr32/kernel/process.c @@ -38,9 +38,7 @@ void cpu_idle(void) while (!need_resched()) cpu_idle_sleep(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/blackfin/kernel/process.c =================================================================== --- linux-2.6.orig/arch/blackfin/kernel/process.c +++ linux-2.6/arch/blackfin/kernel/process.c @@ -92,9 +92,7 @@ void cpu_idle(void) while (!need_resched()) idle(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/cris/kernel/process.c =================================================================== --- linux-2.6.orig/arch/cris/kernel/process.c +++ linux-2.6/arch/cris/kernel/process.c @@ -115,9 +115,7 @@ void cpu_idle (void) idle = default_idle; idle(); } - preempt_enable_no_resched(); - schedule(); - 
preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/frv/kernel/process.c =================================================================== --- linux-2.6.orig/arch/frv/kernel/process.c +++ linux-2.6/arch/frv/kernel/process.c @@ -92,9 +92,7 @@ void cpu_idle(void) idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/h8300/kernel/process.c =================================================================== --- linux-2.6.orig/arch/h8300/kernel/process.c +++ linux-2.6/arch/h8300/kernel/process.c @@ -81,9 +81,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/ia64/kernel/process.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/process.c +++ linux-2.6/arch/ia64/kernel/process.c @@ -330,9 +330,7 @@ cpu_idle (void) normal_xtp(); #endif } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); if (cpu_is_offline(cpu)) play_dead(); Index: linux-2.6/arch/m32r/kernel/process.c =================================================================== --- linux-2.6.orig/arch/m32r/kernel/process.c +++ linux-2.6/arch/m32r/kernel/process.c @@ -90,9 +90,7 @@ void cpu_idle (void) idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/m68k/kernel/process_mm.c =================================================================== --- linux-2.6.orig/arch/m68k/kernel/process_mm.c +++ linux-2.6/arch/m68k/kernel/process_mm.c @@ -94,9 +94,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/m68k/kernel/process_no.c =================================================================== --- linux-2.6.orig/arch/m68k/kernel/process_no.c +++ linux-2.6/arch/m68k/kernel/process_no.c @@ -73,9 +73,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/microblaze/kernel/process.c =================================================================== --- linux-2.6.orig/arch/microblaze/kernel/process.c +++ linux-2.6/arch/microblaze/kernel/process.c @@ -108,9 +108,7 @@ void cpu_idle(void) idle(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } Index: linux-2.6/arch/mips/kernel/process.c =================================================================== --- linux-2.6.orig/arch/mips/kernel/process.c +++ linux-2.6/arch/mips/kernel/process.c @@ -78,9 +78,7 @@ void __noreturn cpu_idle(void) play_dead(); #endif tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/mn10300/kernel/process.c =================================================================== --- linux-2.6.orig/arch/mn10300/kernel/process.c +++ linux-2.6/arch/mn10300/kernel/process.c @@ -123,9 +123,7 @@ void cpu_idle(void) idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: 
linux-2.6/arch/parisc/kernel/process.c =================================================================== --- linux-2.6.orig/arch/parisc/kernel/process.c +++ linux-2.6/arch/parisc/kernel/process.c @@ -71,9 +71,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) barrier(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } Index: linux-2.6/arch/powerpc/kernel/idle.c =================================================================== --- linux-2.6.orig/arch/powerpc/kernel/idle.c +++ linux-2.6/arch/powerpc/kernel/idle.c @@ -94,11 +94,11 @@ void cpu_idle(void) HMT_medium(); ppc64_runlatch_on(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - if (cpu_should_die()) + if (cpu_should_die()) { + __preempt_enable_no_resched(); cpu_die(); - schedule(); - preempt_disable(); + } + schedule_preempt_disabled(); } } Index: linux-2.6/arch/powerpc/platforms/iseries/setup.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/iseries/setup.c +++ linux-2.6/arch/powerpc/platforms/iseries/setup.c @@ -581,9 +581,7 @@ static void iseries_shared_idle(void) if (hvlpevent_is_pending()) process_iSeries_events(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } @@ -610,9 +608,7 @@ static void iseries_dedicated_idle(void) ppc64_runlatch_on(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/s390/kernel/process.c =================================================================== --- linux-2.6.orig/arch/s390/kernel/process.c +++ linux-2.6/arch/s390/kernel/process.c @@ -94,9 +94,7 @@ void cpu_idle(void) while (!need_resched()) default_idle(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/score/kernel/process.c =================================================================== --- linux-2.6.orig/arch/score/kernel/process.c +++ linux-2.6/arch/score/kernel/process.c @@ -53,9 +53,7 @@ void __noreturn cpu_idle(void) while (!need_resched()) barrier(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/sh/kernel/idle.c =================================================================== --- linux-2.6.orig/arch/sh/kernel/idle.c +++ linux-2.6/arch/sh/kernel/idle.c @@ -110,9 +110,7 @@ void cpu_idle(void) } tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/sparc/kernel/process_32.c =================================================================== --- linux-2.6.orig/arch/sparc/kernel/process_32.c +++ linux-2.6/arch/sparc/kernel/process_32.c @@ -113,9 +113,7 @@ void cpu_idle(void) while (!need_resched()) cpu_relax(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } @@ -138,9 +136,7 @@ void cpu_idle(void) while (!need_resched()) cpu_relax(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } Index: linux-2.6/arch/sparc/kernel/process_64.c =================================================================== --- linux-2.6.orig/arch/sparc/kernel/process_64.c +++ linux-2.6/arch/sparc/kernel/process_64.c @@ 
-102,15 +102,13 @@ void cpu_idle(void) tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - #ifdef CONFIG_HOTPLUG_CPU - if (cpu_is_offline(cpu)) + if (cpu_is_offline(cpu)) { + __preempt_enable_no_resched(); cpu_play_dead(); + } #endif - - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/tile/kernel/process.c =================================================================== --- linux-2.6.orig/arch/tile/kernel/process.c +++ linux-2.6/arch/tile/kernel/process.c @@ -106,9 +106,7 @@ void cpu_idle(void) current_thread_info()->status |= TS_POLLING; } tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/x86/kernel/process_32.c =================================================================== --- linux-2.6.orig/arch/x86/kernel/process_32.c +++ linux-2.6/arch/x86/kernel/process_32.c @@ -113,9 +113,7 @@ void cpu_idle(void) start_critical_timings(); } tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/x86/kernel/process_64.c =================================================================== --- linux-2.6.orig/arch/x86/kernel/process_64.c +++ linux-2.6/arch/x86/kernel/process_64.c @@ -146,9 +146,7 @@ void cpu_idle(void) } tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/arch/xtensa/kernel/process.c =================================================================== --- linux-2.6.orig/arch/xtensa/kernel/process.c +++ linux-2.6/arch/xtensa/kernel/process.c @@ -113,9 +113,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) platform_idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } Index: linux-2.6/init/main.c =================================================================== --- linux-2.6.orig/init/main.c +++ linux-2.6/init/main.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -367,9 +368,7 @@ static noinline void __init_refok rest_i * at least once to get things moving: */ init_idle_bootup_task(current); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); /* Call into cpu_idle with preempt disabled */ cpu_idle(); @@ -501,6 +500,7 @@ asmlinkage void __init start_kernel(void parse_args("Booting kernel", static_command_line, __start___param, __stop___param - __start___param, &unknown_bootoption); + softirq_early_init(); /* * These use large bootmem allocations and must precede * kmem_cache_init() Index: linux-2.6/kernel/mutex.c =================================================================== --- linux-2.6.orig/kernel/mutex.c +++ linux-2.6/kernel/mutex.c @@ -240,9 +240,7 @@ __mutex_lock_common(struct mutex *lock, /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); spin_lock_mutex(&lock->wait_lock, flags); } Index: linux-2.6/kernel/softirq.c =================================================================== --- linux-2.6.orig/kernel/softirq.c +++ linux-2.6/kernel/softirq.c @@ -24,6 +24,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -61,6 +62,67 @@ char *softirq_to_name[NR_SOFTIRQS] = { "TASKLET", "SCHED", "HRTIMER", "RCU" }; +#ifdef CONFIG_NO_HZ 
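Every cpu_idle() loop above, together with the mutex slowpath and run_ksoftirqd() further down, folds the open-coded preempt_enable_no_resched(); schedule(); preempt_disable(); triplet into the new schedule_preempt_disabled() helper declared in linux/sched.h. The helper's body is not shown in this section; assuming it is nothing more than that triplet moved behind one symbol, a sketch would be:

#include <linux/sched.h>
#include <linux/preempt.h>

/*
 * Sketch only: the real definition lives in the scheduler core and may
 * differ in detail (annotations, tracing hooks).
 */
void __sched schedule_preempt_disabled(void)
{
	/* Drop the preempt count without checking for an immediate reschedule. */
	preempt_enable_no_resched();
	schedule();
	/* Callers rely on returning with preemption disabled again. */
	preempt_disable();
}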
+# ifdef CONFIG_PREEMPT_RT_FULL +/* + * On preempt-rt a softirq might be blocked on a lock. There might be + * no other runnable task on this CPU because the lock owner runs on + * some other CPU. So we have to go into idle with the pending bit + * set. Therefor we need to check this otherwise we warn about false + * positives which confuses users and defeats the whole purpose of + * this test. + * + * This code is called with interrupts disabled. + */ +void softirq_check_pending_idle(void) +{ + static int rate_limit; + u32 warnpending = 0, pending = local_softirq_pending(); + + if (rate_limit >= 10) + return; + + if (pending) { + struct task_struct *tsk; + + tsk = __get_cpu_var(ksoftirqd); + /* + * The wakeup code in rtmutex.c wakes up the task + * _before_ it sets pi_blocked_on to NULL under + * tsk->pi_lock. So we need to check for both: state + * and pi_blocked_on. + */ + raw_spin_lock(&tsk->pi_lock); + + if (!tsk->pi_blocked_on && !(tsk->state == TASK_RUNNING)) + warnpending = 1; + + raw_spin_unlock(&tsk->pi_lock); + } + + if (warnpending) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + pending); + rate_limit++; + } +} +# else +/* + * On !PREEMPT_RT we just printk rate limited: + */ +void softirq_check_pending_idle(void) +{ + static int rate_limit; + + if (rate_limit < 10) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + local_softirq_pending()); + rate_limit++; + } +} +# endif +#endif + /* * we cannot loop indefinitely here to avoid userspace starvation, * but we also don't want to introduce a worst case 1/HZ latency @@ -76,6 +138,35 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } +static void handle_pending_softirqs(u32 pending, int cpu) +{ + struct softirq_action *h = softirq_vec; + unsigned int prev_count = preempt_count(); + + local_irq_enable(); + for ( ; pending; h++, pending >>= 1) { + unsigned int vec_nr = h - softirq_vec; + + if (!(pending & 1)) + continue; + + kstat_incr_softirqs_this_cpu(vec_nr); + trace_softirq_entry(vec_nr); + h->action(h); + trace_softirq_exit(vec_nr); + if (unlikely(prev_count != preempt_count())) { + printk(KERN_ERR + "huh, entered softirq %u %s %p with preempt_count %08x exited with %08x?\n", + vec_nr, softirq_to_name[vec_nr], h->action, + prev_count, (unsigned int) preempt_count()); + preempt_count() = prev_count; + } + rcu_bh_qs(cpu); + } + local_irq_disable(); +} + +#ifndef CONFIG_PREEMPT_RT_FULL /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -206,7 +297,6 @@ EXPORT_SYMBOL(local_bh_enable_ip); asmlinkage void __do_softirq(void) { - struct softirq_action *h; __u32 pending; int max_restart = MAX_SOFTIRQ_RESTART; int cpu; @@ -215,7 +305,7 @@ asmlinkage void __do_softirq(void) account_system_vtime(current); __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_OFFSET); + SOFTIRQ_OFFSET); lockdep_softirq_enter(); cpu = smp_processor_id(); @@ -223,36 +313,7 @@ restart: /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); - local_irq_enable(); - - h = softirq_vec; - - do { - if (pending & 1) { - unsigned int vec_nr = h - softirq_vec; - int prev_count = preempt_count(); - - kstat_incr_softirqs_this_cpu(vec_nr); - - trace_softirq_entry(vec_nr); - h->action(h); - trace_softirq_exit(vec_nr); - if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %u %s %p" - "with preempt_count %08x," - " exited with %08x?\n", vec_nr, - softirq_to_name[vec_nr], h->action, - prev_count, 
preempt_count()); - preempt_count() = prev_count; - } - - rcu_bh_qs(cpu); - } - h++; - pending >>= 1; - } while (pending); - - local_irq_disable(); + handle_pending_softirqs(pending, cpu); pending = local_softirq_pending(); if (pending && --max_restart) @@ -267,6 +328,26 @@ restart: __local_bh_enable(SOFTIRQ_OFFSET); } +/* + * Called with preemption disabled from run_ksoftirqd() + */ +static int ksoftirqd_do_softirq(int cpu) +{ + /* + * Preempt disable stops cpu going offline. + * If already offline, we'll be on wrong CPU: + * don't process. + */ + if (cpu_is_offline(cpu)) + return -1; + + local_irq_disable(); + if (local_softirq_pending()) + __do_softirq(); + local_irq_enable(); + return 0; +} + #ifndef __ARCH_HAS_DO_SOFTIRQ asmlinkage void do_softirq(void) @@ -289,6 +370,178 @@ asmlinkage void do_softirq(void) #endif +static inline void local_bh_disable_nort(void) { local_bh_disable(); } +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); } +static inline void ksoftirqd_set_sched_params(void) { } +static inline void ksoftirqd_clr_sched_params(void) { } + +#else /* !PREEMPT_RT_FULL */ + +/* + * On RT we serialize softirq execution with a cpu local lock + */ +static DEFINE_LOCAL_IRQ_LOCK(local_softirq_lock); +static DEFINE_PER_CPU(struct task_struct *, local_softirq_runner); + +static void __do_softirq(void); + +void __init softirq_early_init(void) +{ + local_irq_lock_init(local_softirq_lock); +} + +void local_bh_disable(void) +{ + migrate_disable(); + current->softirq_nestcnt++; +} +EXPORT_SYMBOL(local_bh_disable); + +void local_bh_enable(void) +{ + if (WARN_ON(current->softirq_nestcnt == 0)) + return; + + if ((current->softirq_nestcnt == 1) && + local_softirq_pending() && + local_trylock(local_softirq_lock)) { + + local_irq_disable(); + if (local_softirq_pending()) + __do_softirq(); + local_unlock(local_softirq_lock); + local_irq_enable(); + WARN_ON(current->softirq_nestcnt != 1); + } + current->softirq_nestcnt--; + migrate_enable(); +} +EXPORT_SYMBOL(local_bh_enable); + +void local_bh_enable_ip(unsigned long ip) +{ + local_bh_enable(); +} +EXPORT_SYMBOL(local_bh_enable_ip); + +/* For tracing */ +int notrace __in_softirq(void) +{ + if (__get_cpu_var(local_softirq_lock).owner == current) + return __get_cpu_var(local_softirq_lock).nestcnt; + return 0; +} + +int in_serving_softirq(void) +{ + int res; + + preempt_disable(); + res = __get_cpu_var(local_softirq_runner) == current; + preempt_enable(); + return res; +} + +/* + * Called with bh and local interrupts disabled. For full RT cpu must + * be pinned. + */ +static void __do_softirq(void) +{ + u32 pending = local_softirq_pending(); + int cpu = smp_processor_id(); + + current->softirq_nestcnt++; + + /* Reset the pending bitmask before enabling irqs */ + set_softirq_pending(0); + + __get_cpu_var(local_softirq_runner) = current; + + lockdep_softirq_enter(); + + handle_pending_softirqs(pending, cpu); + + pending = local_softirq_pending(); + if (pending) + wakeup_softirqd(); + + lockdep_softirq_exit(); + __get_cpu_var(local_softirq_runner) = NULL; + + current->softirq_nestcnt--; +} + +static int __thread_do_softirq(int cpu) +{ + /* + * Prevent the current cpu from going offline. + * pin_current_cpu() can reenable preemption and block on the + * hotplug mutex. When it returns, the current cpu is + * pinned. It might be the wrong one, but the offline check + * below catches that. + */ + pin_current_cpu(); + /* + * If called from ksoftirqd (cpu >= 0) we need to check + * whether we are on the wrong cpu due to cpu offlining. 
If + * called via thread_do_softirq() no action required. + */ + if (cpu >= 0 && cpu_is_offline(cpu)) { + unpin_current_cpu(); + return -1; + } + preempt_enable(); + local_lock(local_softirq_lock); + local_irq_disable(); + /* + * We cannot switch stacks on RT as we want to be able to + * schedule! + */ + if (local_softirq_pending()) + __do_softirq(); + local_unlock(local_softirq_lock); + unpin_current_cpu(); + preempt_disable(); + local_irq_enable(); + return 0; +} + +/* + * Called from netif_rx_ni(). Preemption enabled. + */ +void thread_do_softirq(void) +{ + if (!in_serving_softirq()) { + preempt_disable(); + __thread_do_softirq(-1); + preempt_enable(); + } +} + +static int ksoftirqd_do_softirq(int cpu) +{ + return __thread_do_softirq(cpu); +} + +static inline void local_bh_disable_nort(void) { } +static inline void _local_bh_enable_nort(void) { } + +static inline void ksoftirqd_set_sched_params(void) +{ + struct sched_param param = { .sched_priority = 1 }; + + sched_setscheduler(current, SCHED_FIFO, ¶m); +} + +static inline void ksoftirqd_clr_sched_params(void) +{ + struct sched_param param = { .sched_priority = 0 }; + + sched_setscheduler(current, SCHED_NORMAL, ¶m); +} + +#endif /* PREEMPT_RT_FULL */ /* * Enter an interrupt context. */ @@ -302,9 +555,9 @@ void irq_enter(void) * Prevent raise_softirq from needlessly waking up ksoftirqd * here, as softirq will be serviced on return from interrupt. */ - local_bh_disable(); + local_bh_disable_nort(); tick_check_idle(cpu); - _local_bh_enable(); + _local_bh_enable_nort(); } __irq_enter(); @@ -313,6 +566,7 @@ void irq_enter(void) #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED static inline void invoke_softirq(void) { +#ifndef CONFIG_PREEMPT_RT_FULL if (!force_irqthreads) __do_softirq(); else { @@ -321,10 +575,14 @@ static inline void invoke_softirq(void) wakeup_softirqd(); __local_bh_enable(SOFTIRQ_OFFSET); } +#else + wakeup_softirqd(); +#endif } #else static inline void invoke_softirq(void) { +#ifndef CONFIG_PREEMPT_RT_FULL if (!force_irqthreads) do_softirq(); else { @@ -333,6 +591,9 @@ static inline void invoke_softirq(void) wakeup_softirqd(); __local_bh_enable(SOFTIRQ_OFFSET); } +#else + wakeup_softirqd(); +#endif } #endif @@ -353,7 +614,7 @@ void irq_exit(void) if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) tick_nohz_stop_sched_tick(0); #endif - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } /* @@ -739,29 +1000,21 @@ void __init softirq_init(void) static int run_ksoftirqd(void * __bind_cpu) { + ksoftirqd_set_sched_params(); + set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { preempt_disable(); - if (!local_softirq_pending()) { - preempt_enable_no_resched(); - schedule(); - preempt_disable(); - } + if (!local_softirq_pending()) + schedule_preempt_disabled(); __set_current_state(TASK_RUNNING); while (local_softirq_pending()) { - /* Preempt disable stops cpu going offline. 
- If already offline, we'll be on wrong CPU: - don't process */ - if (cpu_is_offline((long)__bind_cpu)) + if (ksoftirqd_do_softirq((long) __bind_cpu)) goto wait_to_die; - local_irq_disable(); - if (local_softirq_pending()) - __do_softirq(); - local_irq_enable(); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); cond_resched(); preempt_disable(); rcu_note_context_switch((long)__bind_cpu); @@ -774,6 +1027,7 @@ static int run_ksoftirqd(void * __bind_c wait_to_die: preempt_enable(); + ksoftirqd_clr_sched_params(); /* Wait for kthread_stop */ set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { Index: linux-2.6/include/linux/kprobes.h =================================================================== --- linux-2.6.orig/include/linux/kprobes.h +++ linux-2.6/include/linux/kprobes.h @@ -181,7 +181,7 @@ struct kretprobe { int nmissed; size_t data_size; struct hlist_head free_instances; - spinlock_t lock; + raw_spinlock_t lock; }; struct kretprobe_instance { Index: linux-2.6/kernel/kprobes.c =================================================================== --- linux-2.6.orig/kernel/kprobes.c +++ linux-2.6/kernel/kprobes.c @@ -78,10 +78,10 @@ static bool kprobes_all_disarmed; static DEFINE_MUTEX(kprobe_mutex); static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { - spinlock_t lock ____cacheline_aligned_in_smp; + raw_spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; -static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) +static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) { return &(kretprobe_table_locks[hash].lock); } @@ -1013,9 +1013,9 @@ void __kprobes recycle_rp_inst(struct kr hlist_del(&ri->hlist); INIT_HLIST_NODE(&ri->hlist); if (likely(rp)) { - spin_lock(&rp->lock); + raw_spin_lock(&rp->lock); hlist_add_head(&ri->hlist, &rp->free_instances); - spin_unlock(&rp->lock); + raw_spin_unlock(&rp->lock); } else /* Unregistering */ hlist_add_head(&ri->hlist, head); @@ -1026,19 +1026,19 @@ void __kprobes kretprobe_hash_lock(struc __acquires(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - spinlock_t *hlist_lock; + raw_spinlock_t *hlist_lock; *head = &kretprobe_inst_table[hash]; hlist_lock = kretprobe_table_lock_ptr(hash); - spin_lock_irqsave(hlist_lock, *flags); + raw_spin_lock_irqsave(hlist_lock, *flags); } static void __kprobes kretprobe_table_lock(unsigned long hash, unsigned long *flags) __acquires(hlist_lock) { - spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - spin_lock_irqsave(hlist_lock, *flags); + raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + raw_spin_lock_irqsave(hlist_lock, *flags); } void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, @@ -1046,18 +1046,18 @@ void __kprobes kretprobe_hash_unlock(str __releases(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - spinlock_t *hlist_lock; + raw_spinlock_t *hlist_lock; hlist_lock = kretprobe_table_lock_ptr(hash); - spin_unlock_irqrestore(hlist_lock, *flags); + raw_spin_unlock_irqrestore(hlist_lock, *flags); } static void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) __releases(hlist_lock) { - spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - spin_unlock_irqrestore(hlist_lock, *flags); + raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + raw_spin_unlock_irqrestore(hlist_lock, *flags); } /* @@ -1650,12 +1650,12 @@ static int __kprobes pre_handler_kretpro /*TODO: consider to only swap the RA after the 
last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); - spin_lock_irqsave(&rp->lock, flags); + raw_spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { ri = hlist_entry(rp->free_instances.first, struct kretprobe_instance, hlist); hlist_del(&ri->hlist); - spin_unlock_irqrestore(&rp->lock, flags); + raw_spin_unlock_irqrestore(&rp->lock, flags); ri->rp = rp; ri->task = current; @@ -1672,7 +1672,7 @@ static int __kprobes pre_handler_kretpro kretprobe_table_unlock(hash, &flags); } else { rp->nmissed++; - spin_unlock_irqrestore(&rp->lock, flags); + raw_spin_unlock_irqrestore(&rp->lock, flags); } return 0; } @@ -1708,7 +1708,7 @@ int __kprobes register_kretprobe(struct rp->maxactive = num_possible_cpus(); #endif } - spin_lock_init(&rp->lock); + raw_spin_lock_init(&rp->lock); INIT_HLIST_HEAD(&rp->free_instances); for (i = 0; i < rp->maxactive; i++) { inst = kmalloc(sizeof(struct kretprobe_instance) + @@ -1946,7 +1946,7 @@ static int __init init_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { INIT_HLIST_HEAD(&kprobe_table[i]); INIT_HLIST_HEAD(&kretprobe_inst_table[i]); - spin_lock_init(&(kretprobe_table_locks[i].lock)); + raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); } /* Index: linux-2.6/include/linux/percpu_counter.h =================================================================== --- linux-2.6.orig/include/linux/percpu_counter.h +++ linux-2.6/include/linux/percpu_counter.h @@ -16,7 +16,7 @@ #ifdef CONFIG_SMP struct percpu_counter { - spinlock_t lock; + raw_spinlock_t lock; s64 count; #ifdef CONFIG_HOTPLUG_CPU struct list_head list; /* All percpu_counters are on a list */ Index: linux-2.6/lib/percpu_counter.c =================================================================== --- linux-2.6.orig/lib/percpu_counter.c +++ linux-2.6/lib/percpu_counter.c @@ -59,13 +59,13 @@ void percpu_counter_set(struct percpu_co { int cpu; - spin_lock(&fbc->lock); + raw_spin_lock(&fbc->lock); for_each_possible_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); *pcount = 0; } fbc->count = amount; - spin_unlock(&fbc->lock); + raw_spin_unlock(&fbc->lock); } EXPORT_SYMBOL(percpu_counter_set); @@ -76,10 +76,10 @@ void __percpu_counter_add(struct percpu_ preempt_disable(); count = __this_cpu_read(*fbc->counters) + amount; if (count >= batch || count <= -batch) { - spin_lock(&fbc->lock); + raw_spin_lock(&fbc->lock); fbc->count += count; __this_cpu_write(*fbc->counters, 0); - spin_unlock(&fbc->lock); + raw_spin_unlock(&fbc->lock); } else { __this_cpu_write(*fbc->counters, count); } @@ -96,13 +96,13 @@ s64 __percpu_counter_sum(struct percpu_c s64 ret; int cpu; - spin_lock(&fbc->lock); + raw_spin_lock(&fbc->lock); ret = fbc->count; for_each_online_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } - spin_unlock(&fbc->lock); + raw_spin_unlock(&fbc->lock); return ret; } EXPORT_SYMBOL(__percpu_counter_sum); @@ -110,7 +110,7 @@ EXPORT_SYMBOL(__percpu_counter_sum); int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, struct lock_class_key *key) { - spin_lock_init(&fbc->lock); + raw_spin_lock_init(&fbc->lock); lockdep_set_class(&fbc->lock, key); fbc->count = amount; fbc->counters = alloc_percpu(s32); @@ -173,11 +173,11 @@ static int __cpuinit percpu_counter_hotc s32 *pcount; unsigned long flags; - spin_lock_irqsave(&fbc->lock, flags); + raw_spin_lock_irqsave(&fbc->lock, flags); pcount = per_cpu_ptr(fbc->counters, cpu); fbc->count += *pcount; *pcount = 0; - spin_unlock_irqrestore(&fbc->lock, flags); + 
raw_spin_unlock_irqrestore(&fbc->lock, flags); } mutex_unlock(&percpu_counters_lock); #endif Index: linux-2.6/kernel/cgroup.c =================================================================== --- linux-2.6.orig/kernel/cgroup.c +++ linux-2.6/kernel/cgroup.c @@ -263,7 +263,7 @@ list_for_each_entry(_root, &roots, root_ /* the list of cgroups eligible for automatic release. Protected by * release_list_lock */ static LIST_HEAD(release_list); -static DEFINE_SPINLOCK(release_list_lock); +static DEFINE_RAW_SPINLOCK(release_list_lock); static void cgroup_release_agent(struct work_struct *work); static DECLARE_WORK(release_agent_work, cgroup_release_agent); static void check_for_release(struct cgroup *cgrp); @@ -4010,11 +4010,11 @@ again: finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ @@ -4667,13 +4667,13 @@ static void check_for_release(struct cgr * already queued for a userspace notification, queue * it now */ int need_schedule_work = 0; - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); if (!cgroup_is_removed(cgrp) && list_empty(&cgrp->release_list)) { list_add(&cgrp->release_list, &release_list); need_schedule_work = 1; } - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); if (need_schedule_work) schedule_work(&release_agent_work); } @@ -4725,7 +4725,7 @@ static void cgroup_release_agent(struct { BUG_ON(work != &release_agent_work); mutex_lock(&cgroup_mutex); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); while (!list_empty(&release_list)) { char *argv[3], *envp[3]; int i; @@ -4734,7 +4734,7 @@ static void cgroup_release_agent(struct struct cgroup, release_list); list_del_init(&cgrp->release_list); - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!pathbuf) goto continue_free; @@ -4764,9 +4764,9 @@ static void cgroup_release_agent(struct continue_free: kfree(pathbuf); kfree(agentbuf); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); } - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); mutex_unlock(&cgroup_mutex); } Index: linux-2.6/include/linux/proportions.h =================================================================== --- linux-2.6.orig/include/linux/proportions.h +++ linux-2.6/include/linux/proportions.h @@ -58,7 +58,7 @@ struct prop_local_percpu { */ int shift; unsigned long period; - spinlock_t lock; /* protect the snapshot state */ + raw_spinlock_t lock; /* protect the snapshot state */ }; int prop_local_init_percpu(struct prop_local_percpu *pl); @@ -106,11 +106,11 @@ struct prop_local_single { */ unsigned long period; int shift; - spinlock_t lock; /* protect the snapshot state */ + raw_spinlock_t lock; /* protect the snapshot state */ }; #define INIT_PROP_LOCAL_SINGLE(name) \ -{ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ +{ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ } int prop_local_init_single(struct prop_local_single *pl); Index: linux-2.6/lib/proportions.c =================================================================== --- linux-2.6.orig/lib/proportions.c +++ linux-2.6/lib/proportions.c @@ -190,7 +190,7 @@ 
prop_adjust_shift(int *pl_shift, unsigne int prop_local_init_percpu(struct prop_local_percpu *pl) { - spin_lock_init(&pl->lock); + raw_spin_lock_init(&pl->lock); pl->shift = 0; pl->period = 0; return percpu_counter_init(&pl->events, 0); @@ -226,7 +226,7 @@ void prop_norm_percpu(struct prop_global if (pl->period == global_period) return; - spin_lock_irqsave(&pl->lock, flags); + raw_spin_lock_irqsave(&pl->lock, flags); prop_adjust_shift(&pl->shift, &pl->period, pg->shift); /* @@ -247,7 +247,7 @@ void prop_norm_percpu(struct prop_global percpu_counter_set(&pl->events, 0); pl->period = global_period; - spin_unlock_irqrestore(&pl->lock, flags); + raw_spin_unlock_irqrestore(&pl->lock, flags); } /* @@ -324,7 +324,7 @@ void prop_fraction_percpu(struct prop_de int prop_local_init_single(struct prop_local_single *pl) { - spin_lock_init(&pl->lock); + raw_spin_lock_init(&pl->lock); pl->shift = 0; pl->period = 0; pl->events = 0; @@ -356,7 +356,7 @@ void prop_norm_single(struct prop_global if (pl->period == global_period) return; - spin_lock_irqsave(&pl->lock, flags); + raw_spin_lock_irqsave(&pl->lock, flags); prop_adjust_shift(&pl->shift, &pl->period, pg->shift); /* * For each missed period, we half the local counter. @@ -367,7 +367,7 @@ void prop_norm_single(struct prop_global else pl->events = 0; pl->period = global_period; - spin_unlock_irqrestore(&pl->lock, flags); + raw_spin_unlock_irqrestore(&pl->lock, flags); } /* Index: linux-2.6/kernel/trace/ring_buffer.c =================================================================== --- linux-2.6.orig/kernel/trace/ring_buffer.c +++ linux-2.6/kernel/trace/ring_buffer.c @@ -478,7 +478,7 @@ struct ring_buffer_per_cpu { int cpu; atomic_t record_disabled; struct ring_buffer *buffer; - spinlock_t reader_lock; /* serialize readers */ + raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; struct list_head *pages; @@ -1055,7 +1055,7 @@ rb_allocate_cpu_buffer(struct ring_buffe cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; - spin_lock_init(&cpu_buffer->reader_lock); + raw_spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -1252,7 +1252,7 @@ rb_remove_pages(struct ring_buffer_per_c struct list_head *p; unsigned i; - spin_lock_irq(&cpu_buffer->reader_lock); + raw_spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1270,7 +1270,7 @@ rb_remove_pages(struct ring_buffer_per_c rb_check_pages(cpu_buffer); out: - spin_unlock_irq(&cpu_buffer->reader_lock); + raw_spin_unlock_irq(&cpu_buffer->reader_lock); } static void @@ -1281,7 +1281,7 @@ rb_insert_pages(struct ring_buffer_per_c struct list_head *p; unsigned i; - spin_lock_irq(&cpu_buffer->reader_lock); + raw_spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1296,7 +1296,7 @@ rb_insert_pages(struct ring_buffer_per_c rb_check_pages(cpu_buffer); out: - spin_unlock_irq(&cpu_buffer->reader_lock); + raw_spin_unlock_irq(&cpu_buffer->reader_lock); } /** @@ -2790,9 +2790,9 @@ void ring_buffer_iter_reset(struct ring_ cpu_buffer = iter->cpu_buffer; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_iter_reset(iter); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } 
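This hunk and its neighbours (percpu_counter, the cgroup release_list, lib/proportions, the ring buffer reader_lock, tracing_start_lock) are all the same conversion: spinlock_t becomes raw_spinlock_t. On PREEMPT_RT a spinlock_t turns into a sleeping rtmutex-based lock, so any lock that is legitimately taken in hard interrupt context, with interrupts disabled, or from inside the scheduler and tracer must remain a true spinning lock; raw_spinlock_t preserves the mainline behaviour on both configurations. A minimal, self-contained illustration (device and field names are invented):

#include <linux/spinlock.h>
#include <linux/interrupt.h>

struct my_dev_stats {
	raw_spinlock_t lock;	/* taken from hard irq context: must never sleep */
	u64 overruns;
};

static struct my_dev_stats my_stats = {
	.lock = __RAW_SPIN_LOCK_UNLOCKED(my_stats.lock),
};

/* Hard interrupt handler: a spinlock_t here would sleep on PREEMPT_RT. */
static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	raw_spin_lock(&my_stats.lock);
	my_stats.overruns++;
	raw_spin_unlock(&my_stats.lock);
	return IRQ_HANDLED;
}

/* Process-context reader pairs the access with _irqsave to exclude the handler. */
static u64 my_read_overruns(void)
{
	unsigned long flags;
	u64 val;

	raw_spin_lock_irqsave(&my_stats.lock, flags);
	val = my_stats.overruns;
	raw_spin_unlock_irqrestore(&my_stats.lock, flags);
	return val;
}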
EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); @@ -3251,12 +3251,12 @@ ring_buffer_peek(struct ring_buffer *buf again: local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) @@ -3281,9 +3281,9 @@ ring_buffer_iter_peek(struct ring_buffer unsigned long flags; again: - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; @@ -3323,7 +3323,7 @@ ring_buffer_consume(struct ring_buffer * cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event) { @@ -3332,7 +3332,7 @@ ring_buffer_consume(struct ring_buffer * } if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); out: @@ -3424,11 +3424,11 @@ ring_buffer_read_start(struct ring_buffe cpu_buffer = iter->cpu_buffer; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -3463,7 +3463,7 @@ ring_buffer_read(struct ring_buffer_iter struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); again: event = rb_iter_peek(iter, ts); if (!event) @@ -3474,7 +3474,7 @@ ring_buffer_read(struct ring_buffer_iter rb_advance_iter(iter); out: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return event; } @@ -3543,7 +3543,7 @@ void ring_buffer_reset_cpu(struct ring_b atomic_inc(&cpu_buffer->record_disabled); - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) goto out; @@ -3555,7 +3555,7 @@ void ring_buffer_reset_cpu(struct ring_b arch_spin_unlock(&cpu_buffer->lock); out: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); } @@ -3593,10 +3593,10 @@ int ring_buffer_empty(struct ring_buffer cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); if (!ret) @@ -3627,10 +3627,10 @@ int ring_buffer_empty_cpu(struct ring_bu cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - 
spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); return ret; @@ -3826,7 +3826,7 @@ int ring_buffer_read_page(struct ring_bu if (!bpage) goto out; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); reader = rb_get_reader_page(cpu_buffer); if (!reader) @@ -3949,7 +3949,7 @@ int ring_buffer_read_page(struct ring_bu memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); out_unlock: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); out: return ret; Index: linux-2.6/kernel/trace/trace.c =================================================================== --- linux-2.6.orig/kernel/trace/trace.c +++ linux-2.6/kernel/trace/trace.c @@ -341,7 +341,7 @@ unsigned long trace_flags = TRACE_ITER_P TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; static int trace_stop_count; -static DEFINE_SPINLOCK(tracing_start_lock); +static DEFINE_RAW_SPINLOCK(tracing_start_lock); /** * trace_wake_up - wake up tasks waiting for trace input @@ -958,7 +958,7 @@ void tracing_start(void) if (tracing_disabled) return; - spin_lock_irqsave(&tracing_start_lock, flags); + raw_spin_lock_irqsave(&tracing_start_lock, flags); if (--trace_stop_count) { if (trace_stop_count < 0) { /* Someone screwed up their debugging */ @@ -983,7 +983,7 @@ void tracing_start(void) ftrace_start(); out: - spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&tracing_start_lock, flags); } /** @@ -998,7 +998,7 @@ void tracing_stop(void) unsigned long flags; ftrace_stop(); - spin_lock_irqsave(&tracing_start_lock, flags); + raw_spin_lock_irqsave(&tracing_start_lock, flags); if (trace_stop_count++) goto out; @@ -1016,7 +1016,7 @@ void tracing_stop(void) arch_spin_unlock(&ftrace_max_lock); out: - spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&tracing_start_lock, flags); } void trace_stop_cmdline_recording(void); @@ -1120,6 +1120,8 @@ tracing_generic_entry_update(struct trac ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); + + entry->migrate_disable = (tsk) ? 
tsk->migrate_disable & 0xFF : 0; } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); @@ -1757,9 +1759,10 @@ static void print_lat_help_header(struct seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# |||| / _--=> migrate-disable\n"); + seq_puts(m, "# ||||| / delay \n"); + seq_puts(m, "# cmd pid |||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) Index: linux-2.6/kernel/trace/trace_irqsoff.c =================================================================== --- linux-2.6.orig/kernel/trace/trace_irqsoff.c +++ linux-2.6/kernel/trace/trace_irqsoff.c @@ -17,13 +17,14 @@ #include #include "trace.h" +#include static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; static DEFINE_PER_CPU(int, tracing_cpu); -static DEFINE_SPINLOCK(max_trace_lock); +static DEFINE_RAW_SPINLOCK(max_trace_lock); enum { TRACER_IRQS_OFF = (1 << 1), @@ -319,7 +320,7 @@ check_critical_timing(struct trace_array if (!report_latency(delta)) goto out; - spin_lock_irqsave(&max_trace_lock, flags); + raw_spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ if (!report_latency(delta)) @@ -342,7 +343,7 @@ check_critical_timing(struct trace_array max_sequence++; out_unlock: - spin_unlock_irqrestore(&max_trace_lock, flags); + raw_spin_unlock_irqrestore(&max_trace_lock, flags); out: data->critical_sequence = max_sequence; @@ -424,11 +425,13 @@ void start_critical_timings(void) { if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + trace_preemptirqsoff_hist(TRACE_START, 1); } EXPORT_SYMBOL_GPL(start_critical_timings); void stop_critical_timings(void) { + trace_preemptirqsoff_hist(TRACE_STOP, 0); if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -438,6 +441,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings) #ifdef CONFIG_PROVE_LOCKING void time_hardirqs_on(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } @@ -446,6 +450,7 @@ void time_hardirqs_off(unsigned long a0, { if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } #else /* !CONFIG_PROVE_LOCKING */ @@ -471,6 +476,7 @@ inline void print_irqtrace_events(struct */ void trace_hardirqs_on(void) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -480,11 +486,13 @@ void trace_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } EXPORT_SYMBOL(trace_hardirqs_off); void trace_hardirqs_on_caller(unsigned long caller_addr) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } @@ -494,6 +502,7 @@ void trace_hardirqs_off_caller(unsigned { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -503,12 +512,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller) #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, 
unsigned long a1) { + trace_preemptirqsoff_hist(PREEMPT_ON, 0); if (preempt_trace()) stop_critical_timing(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(PREEMPT_OFF, 1); if (preempt_trace()) start_critical_timing(a0, a1); } Index: linux-2.6/include/linux/ratelimit.h =================================================================== --- linux-2.6.orig/include/linux/ratelimit.h +++ linux-2.6/include/linux/ratelimit.h @@ -8,7 +8,7 @@ #define DEFAULT_RATELIMIT_BURST 10 struct ratelimit_state { - spinlock_t lock; /* protect the state */ + raw_spinlock_t lock; /* protect the state */ int interval; int burst; @@ -20,7 +20,7 @@ struct ratelimit_state { #define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) \ \ struct ratelimit_state name = { \ - .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ .interval = interval_init, \ .burst = burst_init, \ } @@ -28,7 +28,7 @@ struct ratelimit_state { static inline void ratelimit_state_init(struct ratelimit_state *rs, int interval, int burst) { - spin_lock_init(&rs->lock); + raw_spin_lock_init(&rs->lock); rs->interval = interval; rs->burst = burst; rs->printed = 0; Index: linux-2.6/kernel/printk.c =================================================================== --- linux-2.6.orig/kernel/printk.c +++ linux-2.6/kernel/printk.c @@ -44,13 +44,6 @@ #include -/* - * Architectures can override it: - */ -void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) -{ -} - #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) /* printk's without a loglevel use this.. */ @@ -100,7 +93,7 @@ static int console_locked, console_suspe * It is also used in interesting ways to provide interlocking in * console_unlock();. 
*/ -static DEFINE_SPINLOCK(logbuf_lock); +static DEFINE_RAW_SPINLOCK(logbuf_lock); #define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) @@ -212,7 +205,7 @@ void __init setup_log_buf(int early) return; } - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; @@ -230,7 +223,7 @@ void __init setup_log_buf(int early) log_start -= offset; con_start -= offset; log_end -= offset; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); pr_info("log_buf_len: %d\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", @@ -363,18 +356,18 @@ int do_syslog(int type, char __user *buf if (error) goto out; i = 0; - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); while (!error && (log_start != log_end) && i < len) { c = LOG_BUF(log_start); log_start++; - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); error = __put_user(c,buf); buf++; i++; cond_resched(); - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (!error) error = i; break; @@ -397,7 +390,7 @@ int do_syslog(int type, char __user *buf count = len; if (count > log_buf_len) count = log_buf_len; - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); if (count > logged_chars) count = logged_chars; if (do_clear) @@ -414,12 +407,12 @@ int do_syslog(int type, char __user *buf if (j + log_buf_len < log_end) break; c = LOG_BUF(j); - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); error = __put_user(c,&buf[count-1-i]); cond_resched(); - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (error) break; error = i; @@ -509,6 +502,7 @@ static void __call_console_drivers(unsig { struct console *con; + migrate_disable(); for_each_console(con) { if (exclusive_console && con != exclusive_console) continue; @@ -517,7 +511,54 @@ static void __call_console_drivers(unsig (con->flags & CON_ANYTIME))) con->write(con, &LOG_BUF(start), end - start); } + migrate_enable(); +} + +#ifdef CONFIG_EARLY_PRINTK +struct console *early_console; + +static void early_vprintk(const char *fmt, va_list ap) +{ + char buf[512]; + int n = vscnprintf(buf, sizeof(buf), fmt, ap); + if (early_console) + early_console->write(early_console, buf, n); +} + +asmlinkage void early_printk(const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + early_vprintk(fmt, ap); + va_end(ap); +} + +/* + * This is independent of any log levels - a global + * kill switch that turns off all of printk. + * + * Used by the NMI watchdog if early-printk is enabled. 
+ */ +static int __read_mostly printk_killswitch; + +void printk_kill(void) +{ + printk_killswitch = 1; +} + +static int forced_early_printk(const char *fmt, va_list ap) +{ + if (!printk_killswitch) + return 0; + early_vprintk(fmt, ap); + return 1; } +#else +static inline int forced_early_printk(const char *fmt, va_list ap) +{ + return 0; +} +#endif static int __read_mostly ignore_loglevel; @@ -687,7 +728,7 @@ static void zap_locks(void) oops_timestamp = jiffies; /* If a crash is occurring, make sure we can't deadlock */ - spin_lock_init(&logbuf_lock); + raw_spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ sema_init(&console_sem, 1); } @@ -779,12 +820,18 @@ static inline int can_use_console(unsign * interrupts disabled. It should return with 'lockbuf_lock' * released but interrupts still disabled. */ -static int console_trylock_for_printk(unsigned int cpu) +static int console_trylock_for_printk(unsigned int cpu, unsigned long flags) __releases(&logbuf_lock) { +#ifdef CONFIG_PREEMPT_RT_FULL + int lock = !early_boot_irqs_disabled && !irqs_disabled_flags(flags) && + !preempt_count(); +#else + int lock = 1; +#endif int retval = 0; - if (console_trylock()) { + if (lock && console_trylock()) { retval = 1; /* @@ -800,7 +847,7 @@ static int console_trylock_for_printk(un } } printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); + raw_spin_unlock(&logbuf_lock); return retval; } static const char recursion_bug_msg [] = @@ -833,6 +880,13 @@ asmlinkage int vprintk(const char *fmt, size_t plen; char special; + /* + * Fall back to early_printk if a debugging subsystem has + * killed printk output + */ + if (unlikely(forced_early_printk(fmt, args))) + return 1; + boot_delay_msec(); printk_delay(); @@ -860,7 +914,7 @@ asmlinkage int vprintk(const char *fmt, } lockdep_off(); - spin_lock(&logbuf_lock); + raw_spin_lock(&logbuf_lock); printk_cpu = this_cpu; if (recursion_bug) { @@ -953,8 +1007,15 @@ asmlinkage int vprintk(const char *fmt, * will release 'logbuf_lock' regardless of whether it * actually gets the semaphore or not. */ - if (console_trylock_for_printk(this_cpu)) + if (console_trylock_for_printk(this_cpu, flags)) { +#ifndef CONFIG_PREEMPT_RT_FULL + console_unlock(); +#else + raw_local_irq_restore(flags); console_unlock(); + raw_local_irq_save(flags); +#endif + } lockdep_on(); out_restore_irqs: @@ -1252,18 +1313,23 @@ void console_unlock(void) console_may_schedule = 0; for ( ; ; ) { - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); wake_klogd |= log_start - log_end; if (con_start == log_end) break; /* Nothing to print */ _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ - spin_unlock(&logbuf_lock); +#ifndef CONFIG_PREEMPT_RT_FULL + raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ call_console_drivers(_con_start, _log_end); start_critical_timings(); local_irq_restore(flags); +#else + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + call_console_drivers(_con_start, _log_end); +#endif } console_locked = 0; @@ -1272,7 +1338,7 @@ void console_unlock(void) exclusive_console = NULL; up(&console_sem); - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); if (wake_klogd) wake_up_klogd(); } @@ -1502,9 +1568,9 @@ void register_console(struct console *ne * console_unlock(); will print out the buffered messages * for us. 
*/ - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); con_start = log_start; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); /* * We're about to replay the log buffer. Only do this to the * just-registered console to avoid excessive message spam to @@ -1711,10 +1777,10 @@ void kmsg_dump(enum kmsg_dump_reason rea /* Theoretically, the log could move on after we do this, but there's not a lot we can do about that. The new messages will overwrite the start of what we dump. */ - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); end = log_end & LOG_BUF_MASK; chars = logged_chars; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); if (chars > end) { s1 = log_buf + log_buf_len - chars + end; Index: linux-2.6/lib/ratelimit.c =================================================================== --- linux-2.6.orig/lib/ratelimit.c +++ linux-2.6/lib/ratelimit.c @@ -39,7 +39,7 @@ int ___ratelimit(struct ratelimit_state * in addition to the one that will be printed by * the entity that is holding the lock already: */ - if (!spin_trylock_irqsave(&rs->lock, flags)) + if (!raw_spin_trylock_irqsave(&rs->lock, flags)) return 0; if (!rs->begin) @@ -60,7 +60,7 @@ int ___ratelimit(struct ratelimit_state rs->missed++; ret = 0; } - spin_unlock_irqrestore(&rs->lock, flags); + raw_spin_unlock_irqrestore(&rs->lock, flags); return ret; } Index: linux-2.6/include/linux/init_task.h =================================================================== --- linux-2.6.orig/include/linux/init_task.h +++ linux-2.6/include/linux/init_task.h @@ -42,7 +42,7 @@ extern struct fs_struct init_fs; .cputimer = { \ .cputime = INIT_CPUTIME, \ .running = 0, \ - .lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ }, \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ @@ -179,6 +179,7 @@ extern struct cred init_cred; .fs_excl = ATOMIC_INIT(0), \ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .timer_slack_ns = 50000, /* 50 usec default slack */ \ + .posix_timer_list = NULL, \ .pids = { \ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ Index: linux-2.6/kernel/posix-cpu-timers.c =================================================================== --- linux-2.6.orig/kernel/posix-cpu-timers.c +++ linux-2.6/kernel/posix-cpu-timers.c @@ -274,7 +274,7 @@ void thread_group_cputimer(struct task_s struct task_cputime sum; unsigned long flags; - spin_lock_irqsave(&cputimer->lock, flags); + raw_spin_lock_irqsave(&cputimer->lock, flags); if (!cputimer->running) { cputimer->running = 1; /* @@ -287,7 +287,7 @@ void thread_group_cputimer(struct task_s update_gt_cputime(&cputimer->cputime, &sum); } *times = cputimer->cputime; - spin_unlock_irqrestore(&cputimer->lock, flags); + raw_spin_unlock_irqrestore(&cputimer->lock, flags); } /* @@ -699,7 +699,7 @@ static int posix_cpu_timer_set(struct k_ /* * Disarm any old timer after extracting its expiry time. 
 	 */
-	BUG_ON(!irqs_disabled());
+	BUG_ON_NONRT(!irqs_disabled());
 
 	ret = 0;
 	old_incr = timer->it.cpu.incr;
@@ -997,9 +997,9 @@ static void stop_process_timers(struct s
 	struct thread_group_cputimer *cputimer = &sig->cputimer;
 	unsigned long flags;
 
-	spin_lock_irqsave(&cputimer->lock, flags);
+	raw_spin_lock_irqsave(&cputimer->lock, flags);
 	cputimer->running = 0;
-	spin_unlock_irqrestore(&cputimer->lock, flags);
+	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
 }
 
 static u32 onecputick;
@@ -1221,7 +1221,7 @@ void posix_cpu_timer_schedule(struct k_i
 	/*
 	 * Now re-arm for the new expiry time.
 	 */
-	BUG_ON(!irqs_disabled());
+	BUG_ON_NONRT(!irqs_disabled());
 	arm_timer(timer);
 	spin_unlock(&p->sighand->siglock);
 
@@ -1289,9 +1289,9 @@ static inline int fastpath_timer_check(s
 	if (sig->cputimer.running) {
 		struct task_cputime group_sample;
 
-		spin_lock(&sig->cputimer.lock);
+		raw_spin_lock(&sig->cputimer.lock);
 		group_sample = sig->cputimer.cputime;
-		spin_unlock(&sig->cputimer.lock);
+		raw_spin_unlock(&sig->cputimer.lock);
 
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
@@ -1305,13 +1305,13 @@ static inline int fastpath_timer_check(s
  * already updated our counts. We need to check if any timers fire now.
  * Interrupts are disabled.
  */
-void run_posix_cpu_timers(struct task_struct *tsk)
+void __run_posix_cpu_timers(struct task_struct *tsk)
 {
 	LIST_HEAD(firing);
 	struct k_itimer *timer, *next;
 	unsigned long flags;
 
-	BUG_ON(!irqs_disabled());
+	BUG_ON_NONRT(!irqs_disabled());
 
 	/*
 	 * The fast path checks that there are no expired thread or thread
@@ -1369,6 +1369,177 @@ void run_posix_cpu_timers(struct task_st
 	}
 }
 
+#include
+#include
+DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
+DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
+
+static int posix_cpu_timers_thread(void *data)
+{
+	int cpu = (long)data;
+
+	BUG_ON(per_cpu(posix_timer_task,cpu) != current);
+
+	while (!kthread_should_stop()) {
+		struct task_struct *tsk = NULL;
+		struct task_struct *next = NULL;
+
+		if (cpu_is_offline(cpu))
+			goto wait_to_die;
+
+		/* grab task list */
+		raw_local_irq_disable();
+		tsk = per_cpu(posix_timer_tasklist, cpu);
+		per_cpu(posix_timer_tasklist, cpu) = NULL;
+		raw_local_irq_enable();
+
+		/* it's possible the list is empty, just return */
+		if (!tsk) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule();
+			__set_current_state(TASK_RUNNING);
+			continue;
+		}
+
+		/* Process task list */
+		while (1) {
+			/* save next */
+			next = tsk->posix_timer_list;
+
+			/* run the task timers, clear its ptr and
+			 * unreference it
+			 */
+			__run_posix_cpu_timers(tsk);
+			tsk->posix_timer_list = NULL;
+			put_task_struct(tsk);
+
+			/* check if this is the last on the list */
+			if (next == tsk)
+				break;
+			tsk = next;
+		}
+	}
+	return 0;
+
+wait_to_die:
+	/* Wait for kthread_stop */
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static inline int __fastpath_timer_check(struct task_struct *tsk)
+{
+	/* tsk == current, ensure it is safe to use ->signal/sighand */
+	if (unlikely(tsk->exit_state))
+		return 0;
+
+	if (!task_cputime_zero(&tsk->cputime_expires))
+		return 1;
+
+	if (!task_cputime_zero(&tsk->signal->cputime_expires))
+		return 1;
+
+	return 0;
+}
+
+void run_posix_cpu_timers(struct task_struct *tsk)
+{
+	unsigned long cpu = smp_processor_id();
+	struct task_struct *tasklist;
+
+	BUG_ON(!irqs_disabled());
+	if (!per_cpu(posix_timer_task, cpu))
+		return;
+	/* get per-cpu references */
+	tasklist = per_cpu(posix_timer_tasklist, cpu);
+
+	/* check to see if we're already queued */
+	if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
+		get_task_struct(tsk);
+		if (tasklist) {
+			tsk->posix_timer_list = tasklist;
+		} else {
+			/*
+			 * The list is terminated by a self-pointing
+			 * task_struct
+			 */
+			tsk->posix_timer_list = tsk;
+		}
+		per_cpu(posix_timer_tasklist, cpu) = tsk;
+
+		wake_up_process(per_cpu(posix_timer_task, cpu));
+	}
+}
+
+/*
+ * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
+ * Here we can start up the necessary migration thread for the new CPU.
+ */
+static int posix_cpu_thread_call(struct notifier_block *nfb,
+				 unsigned long action, void *hcpu)
+{
+	int cpu = (long)hcpu;
+	struct task_struct *p;
+	struct sched_param param;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		p = kthread_create(posix_cpu_timers_thread, hcpu,
+				   "posixcputmr/%d",cpu);
+		if (IS_ERR(p))
+			return NOTIFY_BAD;
+		p->flags |= PF_NOFREEZE;
+		kthread_bind(p, cpu);
+		/* Must be high prio to avoid getting starved */
+		param.sched_priority = MAX_RT_PRIO-1;
+		sched_setscheduler(p, SCHED_FIFO, &param);
+		per_cpu(posix_timer_task,cpu) = p;
+		break;
+	case CPU_ONLINE:
+		/* Strictly unnecessary, as first user will wake it. */
+		wake_up_process(per_cpu(posix_timer_task,cpu));
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+		/* Unbind it from offline cpu so it can run. Fall thru. */
+		kthread_bind(per_cpu(posix_timer_task,cpu),
+			     any_online_cpu(cpu_online_map));
+		kthread_stop(per_cpu(posix_timer_task,cpu));
+		per_cpu(posix_timer_task,cpu) = NULL;
+		break;
+	case CPU_DEAD:
+		kthread_stop(per_cpu(posix_timer_task,cpu));
+		per_cpu(posix_timer_task,cpu) = NULL;
+		break;
+#endif
+	}
+	return NOTIFY_OK;
+}
+
+/* Register at highest priority so that task migration (migrate_all_tasks)
+ * happens before everything else.
+ */
+static struct notifier_block __devinitdata posix_cpu_thread_notifier = {
+	.notifier_call = posix_cpu_thread_call,
+	.priority = 10
+};
+
+static int __init posix_cpu_thread_init(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+	/* Start one for boot CPU. */
+	posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, cpu);
+	posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, cpu);
+	register_cpu_notifier(&posix_cpu_thread_notifier);
+	return 0;
+}
+early_initcall(posix_cpu_thread_init);
+
 /*
  * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
  * The tsk->sighand->siglock must be held by the caller.
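The run_posix_cpu_timers()/posix_cpu_timers_thread() pair above hands work from hard interrupt context to a per-CPU kthread through an intrusive singly linked list whose last element points to itself: a NULL ->posix_timer_list means "not queued", so the self-pointing terminator lets the walker find the end of the list without making the last queued task look unqueued. What follows is only a minimal stand-alone sketch of that list convention, not part of the patch; it uses a hypothetical struct work_item (plus queue_item(), drain() and a global "pending" pointer) in place of task_struct and the per-CPU variables, and omits locking and reference counting.

#include <stdio.h>

/* Hypothetical stand-in for task_struct; only the list linkage matters. */
struct work_item {
	const char *name;
	struct work_item *next;	/* like ->posix_timer_list: NULL = not queued */
};

static struct work_item *pending;	/* like per_cpu(posix_timer_tasklist, cpu) */

/* Queue an item unless it is already queued, as run_posix_cpu_timers() does. */
static void queue_item(struct work_item *w)
{
	if (w->next)
		return;
	/* On an empty list the new element points to itself. */
	w->next = pending ? pending : w;
	pending = w;
}

/* Drain the list, mirroring the inner loop of posix_cpu_timers_thread(). */
static void drain(void)
{
	struct work_item *w = pending;

	pending = NULL;
	while (w) {
		struct work_item *next = w->next;

		printf("processing %s\n", w->name);
		w->next = NULL;		/* mark as no longer queued */
		if (next == w)		/* self-pointer: end of the list */
			break;
		w = next;
	}
}

int main(void)
{
	struct work_item a = { "a", NULL }, b = { "b", NULL };

	queue_item(&a);
	queue_item(&b);
	queue_item(&a);	/* already queued, ignored */
	drain();	/* prints b, then a: LIFO, as in the patch */
	return 0;
}

The self-pointing terminator is what allows a non-NULL next pointer to double as the "already queued" test; with a NULL-terminated list, the last queued element would be indistinguishable from one that was never queued.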
@@ -1617,6 +1788,11 @@ static __init int init_posix_cpu_timers( .timer_create = thread_cpu_timer_create, }; struct timespec ts; + unsigned long cpu; + + /* init the per-cpu posix_timer_tasklets */ + for_each_cpu_mask(cpu, cpu_possible_map) + per_cpu(posix_timer_tasklist, cpu) = NULL; posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); Index: linux-2.6/kernel/sched_stats.h =================================================================== --- linux-2.6.orig/kernel/sched_stats.h +++ linux-2.6/kernel/sched_stats.h @@ -282,10 +282,10 @@ static inline void account_group_user_ti if (!cputimer->running) return; - spin_lock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); cputimer->cputime.utime = cputime_add(cputimer->cputime.utime, cputime); - spin_unlock(&cputimer->lock); + raw_spin_unlock(&cputimer->lock); } /** @@ -306,10 +306,10 @@ static inline void account_group_system_ if (!cputimer->running) return; - spin_lock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); cputimer->cputime.stime = cputime_add(cputimer->cputime.stime, cputime); - spin_unlock(&cputimer->lock); + raw_spin_unlock(&cputimer->lock); } /** @@ -330,7 +330,7 @@ static inline void account_group_exec_ru if (!cputimer->running) return; - spin_lock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); cputimer->cputime.sum_exec_runtime += ns; - spin_unlock(&cputimer->lock); + raw_spin_unlock(&cputimer->lock); } Index: linux-2.6/include/linux/semaphore.h =================================================================== --- linux-2.6.orig/include/linux/semaphore.h +++ linux-2.6/include/linux/semaphore.h @@ -14,14 +14,14 @@ /* Please don't access any members of this structure directly */ struct semaphore { - spinlock_t lock; + raw_spinlock_t lock; unsigned int count; struct list_head wait_list; }; #define __SEMAPHORE_INITIALIZER(name, n) \ { \ - .lock = __SPIN_LOCK_UNLOCKED((name).lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED((name).lock), \ .count = n, \ .wait_list = LIST_HEAD_INIT((name).wait_list), \ } Index: linux-2.6/kernel/semaphore.c =================================================================== --- linux-2.6.orig/kernel/semaphore.c +++ linux-2.6/kernel/semaphore.c @@ -54,12 +54,12 @@ void down(struct semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else __down(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); } EXPORT_SYMBOL(down); @@ -77,12 +77,12 @@ int down_interruptible(struct semaphore unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_interruptible(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem) unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_killable(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem) unsigned long flags; int count; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); count = sem->count - 1; if 
(likely(count >= 0)) sem->count = count; - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return (count < 0); } @@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_timeout(sem, jiffies); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -179,12 +179,12 @@ void up(struct semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(list_empty(&sem->wait_list))) sem->count++; else __up(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); } EXPORT_SYMBOL(up); @@ -217,9 +217,9 @@ static inline int __sched __down_common( if (timeout <= 0) goto timed_out; __set_task_state(task, state); - spin_unlock_irq(&sem->lock); + raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); - spin_lock_irq(&sem->lock); + raw_spin_lock_irq(&sem->lock); if (waiter.up) return 0; } Index: linux-2.6/include/linux/rwsem-spinlock.h =================================================================== --- linux-2.6.orig/include/linux/rwsem-spinlock.h +++ linux-2.6/include/linux/rwsem-spinlock.h @@ -20,26 +20,42 @@ * - if activity is -1 then there is one active writer * - if wait_list is not empty, then there are processes waiting for the semaphore */ +struct rw_anon_semaphore { + __s32 activity; + raw_spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#ifndef CONFIG_PREEMPT_RT_FULL +/* + * Non preempt-rt implementation of rw_semaphore. Same as above, but + * restricted vs. ownership. i.e. ownerless locked state and non owner + * release not allowed. 
+ */ struct rw_semaphore { __s32 activity; - spinlock_t wait_lock; + raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; +#endif /* PREEMPT_RT_FULL */ #define RWSEM_UNLOCKED_VALUE 0x00000000 -extern void __down_read(struct rw_semaphore *sem); -extern int __down_read_trylock(struct rw_semaphore *sem); -extern void __down_write(struct rw_semaphore *sem); -extern void __down_write_nested(struct rw_semaphore *sem, int subclass); -extern int __down_write_trylock(struct rw_semaphore *sem); -extern void __up_read(struct rw_semaphore *sem); -extern void __up_write(struct rw_semaphore *sem); -extern void __downgrade_write(struct rw_semaphore *sem); -extern int rwsem_is_locked(struct rw_semaphore *sem); +extern void __down_read(struct rw_anon_semaphore *sem); +extern int __down_read_trylock(struct rw_anon_semaphore *sem); +extern void __down_write(struct rw_anon_semaphore *sem); +extern void __down_write_nested(struct rw_anon_semaphore *sem, int subclass); +extern int __down_write_trylock(struct rw_anon_semaphore *sem); +extern void __up_read(struct rw_anon_semaphore *sem); +extern void __up_write(struct rw_anon_semaphore *sem); +extern void __downgrade_write(struct rw_anon_semaphore *sem); +extern int anon_rwsem_is_locked(struct rw_anon_semaphore *sem); #endif /* __KERNEL__ */ #endif /* _LINUX_RWSEM_SPINLOCK_H */ Index: linux-2.6/include/linux/rwsem.h =================================================================== --- linux-2.6.orig/include/linux/rwsem.h +++ linux-2.6/include/linux/rwsem.h @@ -17,37 +17,50 @@ #include #include +struct rw_anon_semaphore; struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK #include /* use a generic implementation */ -#else +#else /* RWSEM_GENERIC_SPINLOCK */ + /* All arch specific implementations share the same struct */ -struct rw_semaphore { +struct rw_anon_semaphore { long count; - spinlock_t wait_lock; + raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); +extern struct rw_anon_semaphore *rwsem_down_read_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore *rwsem_down_write_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore *rwsem_wake(struct rw_anon_semaphore *); +extern struct rw_anon_semaphore *rwsem_downgrade_wake(struct rw_anon_semaphore *sem); /* Include the arch specific part */ #include /* In all implementations count != 0 means locked */ -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) { return sem->count != 0; } +#ifndef CONFIG_PREEMPT_RT_FULL +struct rw_semaphore { + long count; + raw_spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; #endif +#endif /* !RWSEM_GENERIC_SPINLOCK */ + /* Common initializer macros and functions */ #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -56,57 +69,59 @@ static inline int rwsem_is_locked(struct # define __RWSEM_DEP_MAP_INIT(lockname) #endif -#define __RWSEM_INITIALIZER(name) \ - { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED(name.wait_lock), \ - LIST_HEAD_INIT((name).wait_list) 
__RWSEM_DEP_MAP_INIT(name) } +#define __RWSEM_ANON_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, \ + __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + LIST_HEAD_INIT((name).wait_list) \ + __RWSEM_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_INITIALIZER(name) -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); +extern void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key); -#define init_rwsem(sem) \ +#define init_anon_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __init_anon_rwsem((sem), #sem, &__key); \ } while (0) /* * lock for reading */ -extern void down_read(struct rw_semaphore *sem); +extern void anon_down_read(struct rw_anon_semaphore *sem); /* * trylock for reading -- returns 1 if successful, 0 if contention */ -extern int down_read_trylock(struct rw_semaphore *sem); +extern int anon_down_read_trylock(struct rw_anon_semaphore *sem); /* * lock for writing */ -extern void down_write(struct rw_semaphore *sem); +extern void anon_down_write(struct rw_anon_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention */ -extern int down_write_trylock(struct rw_semaphore *sem); +extern int anon_down_write_trylock(struct rw_anon_semaphore *sem); /* * release a read lock */ -extern void up_read(struct rw_semaphore *sem); +extern void anon_up_read(struct rw_anon_semaphore *sem); /* * release a write lock */ -extern void up_write(struct rw_semaphore *sem); +extern void anon_up_write(struct rw_anon_semaphore *sem); /* * downgrade write lock to read lock */ -extern void downgrade_write(struct rw_semaphore *sem); +extern void anon_downgrade_write(struct rw_anon_semaphore *sem); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -122,21 +137,101 @@ extern void downgrade_write(struct rw_se * lockdep_set_class() at lock initialization time. * See Documentation/lockdep-design.txt for more details.) */ -extern void down_read_nested(struct rw_semaphore *sem, int subclass); -extern void down_write_nested(struct rw_semaphore *sem, int subclass); +extern void anon_down_read_nested(struct rw_anon_semaphore *sem, int subclass); +extern void anon_down_write_nested(struct rw_anon_semaphore *sem, int subclass); /* * Take/release a lock when not the owner will release it. * * [ This API should be avoided as much as possible - the * proper abstraction for this case is completions. 
] */ -extern void down_read_non_owner(struct rw_semaphore *sem); -extern void up_read_non_owner(struct rw_semaphore *sem); +extern void anon_down_read_non_owner(struct rw_anon_semaphore *sem); +extern void anon_up_read_non_owner(struct rw_anon_semaphore *sem); #else -# define down_read_nested(sem, subclass) down_read(sem) -# define down_write_nested(sem, subclass) down_write(sem) -# define down_read_non_owner(sem) down_read(sem) -# define up_read_non_owner(sem) up_read(sem) +# define anon_down_read_nested(sem, subclass) anon_down_read(sem) +# define anon_down_write_nested(sem, subclass) anon_down_write(sem) +# define anon_down_read_non_owner(sem) anon_down_read(sem) +# define anon_up_read_non_owner(sem) anon_up_read(sem) #endif +#ifdef CONFIG_PREEMPT_RT_FULL +#include +#else /* PREEMPT_RT_FULL */ +/* + * Non preempt-rt implementations + */ +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, \ + __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + LIST_HEAD_INIT((name).wait_list) \ + __RWSEM_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +static inline void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + __init_anon_rwsem((struct rw_anon_semaphore *)sem, name, key); +} + +#define init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ +} while (0) + +static inline void down_read(struct rw_semaphore *sem) +{ + anon_down_read((struct rw_anon_semaphore *)sem); +} + +static inline int down_read_trylock(struct rw_semaphore *sem) +{ + return anon_down_read_trylock((struct rw_anon_semaphore *)sem); +} + +static inline void down_write(struct rw_semaphore *sem) +{ + anon_down_write((struct rw_anon_semaphore *)sem); +} + +static inline int down_write_trylock(struct rw_semaphore *sem) +{ + return anon_down_write_trylock((struct rw_anon_semaphore *)sem); +} + +static inline void up_read(struct rw_semaphore *sem) +{ + anon_up_read((struct rw_anon_semaphore *)sem); +} + +static inline void up_write(struct rw_semaphore *sem) +{ + anon_up_write((struct rw_anon_semaphore *)sem); +} + +static inline void downgrade_write(struct rw_semaphore *sem) +{ + anon_downgrade_write((struct rw_anon_semaphore *)sem); +} + +static inline void down_read_nested(struct rw_semaphore *sem, int subclass) +{ + return anon_down_read_nested((struct rw_anon_semaphore *)sem, subclass); +} + +static inline void down_write_nested(struct rw_semaphore *sem, int subclass) +{ + anon_down_write_nested((struct rw_anon_semaphore *)sem, subclass); +} + +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return anon_rwsem_is_locked((struct rw_anon_semaphore *)sem); +} +#endif /* !PREEMPT_RT_FULL */ + #endif /* _LINUX_RWSEM_H */ + Index: linux-2.6/lib/rwsem-spinlock.c =================================================================== --- linux-2.6.orig/lib/rwsem-spinlock.c +++ linux-2.6/lib/rwsem-spinlock.c @@ -17,24 +17,24 @@ struct rwsem_waiter { #define RWSEM_WAITING_FOR_WRITE 0x00000002 }; -int rwsem_is_locked(struct rw_semaphore *sem) +int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) { int ret = 1; unsigned long flags; - if (spin_trylock_irqsave(&sem->wait_lock, flags)) { + if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { ret = (sem->activity != 0); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } return ret; } -EXPORT_SYMBOL(rwsem_is_locked); +EXPORT_SYMBOL(anon_rwsem_is_locked); /* * 
initialise the semaphore */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) +void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -44,10 +44,10 @@ void __init_rwsem(struct rw_semaphore *s lockdep_init_map(&sem->dep_map, name, key, 0); #endif sem->activity = 0; - spin_lock_init(&sem->wait_lock); + raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } -EXPORT_SYMBOL(__init_rwsem); +EXPORT_SYMBOL(__init_anon_rwsem); /* * handle the lock release when processes blocked on it that can now run @@ -58,8 +58,8 @@ EXPORT_SYMBOL(__init_rwsem); * - woken process blocks are discarded from the list after having task zeroed * - writers are only woken if wakewrite is non-zero */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) +static inline struct rw_anon_semaphore * +__rwsem_do_wake(struct rw_anon_semaphore *sem, int wakewrite) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -117,8 +117,8 @@ __rwsem_do_wake(struct rw_semaphore *sem /* * wake a single writer */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) +static inline struct rw_anon_semaphore * +__rwsem_wake_one_writer(struct rw_anon_semaphore *sem) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -139,18 +139,18 @@ __rwsem_wake_one_writer(struct rw_semaph /* * get a read lock on the semaphore */ -void __sched __down_read(struct rw_semaphore *sem) +void __sched __down_read(struct rw_anon_semaphore *sem) { struct rwsem_waiter waiter; struct task_struct *tsk; unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); if (sem->activity >= 0 && list_empty(&sem->wait_list)) { /* granted */ sem->activity++; - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); goto out; } @@ -165,7 +165,7 @@ void __sched __down_read(struct rw_semap list_add_tail(&waiter.list, &sem->wait_list); /* we don't need to touch the semaphore struct anymore */ - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); /* wait to be given the lock */ for (;;) { @@ -183,13 +183,13 @@ void __sched __down_read(struct rw_semap /* * trylock for reading -- returns 1 if successful, 0 if contention */ -int __down_read_trylock(struct rw_semaphore *sem) +int __down_read_trylock(struct rw_anon_semaphore *sem) { unsigned long flags; int ret = 0; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); if (sem->activity >= 0 && list_empty(&sem->wait_list)) { /* granted */ @@ -197,7 +197,7 @@ int __down_read_trylock(struct rw_semaph ret = 1; } - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return ret; } @@ -206,18 +206,18 @@ int __down_read_trylock(struct rw_semaph * get a write lock on the semaphore * - we increment the waiting count anyway to indicate an exclusive lock */ -void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +void __sched __down_write_nested(struct rw_anon_semaphore *sem, int subclass) { struct rwsem_waiter waiter; struct task_struct *tsk; unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); if (sem->activity == 0 && list_empty(&sem->wait_list)) { /* granted */ sem->activity = -1; - 
spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); goto out; } @@ -232,7 +232,7 @@ void __sched __down_write_nested(struct list_add_tail(&waiter.list, &sem->wait_list); /* we don't need to touch the semaphore struct anymore */ - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); /* wait to be given the lock */ for (;;) { @@ -247,7 +247,7 @@ void __sched __down_write_nested(struct ; } -void __sched __down_write(struct rw_semaphore *sem) +void __sched __down_write(struct rw_anon_semaphore *sem) { __down_write_nested(sem, 0); } @@ -255,12 +255,12 @@ void __sched __down_write(struct rw_sema /* * trylock for writing -- returns 1 if successful, 0 if contention */ -int __down_write_trylock(struct rw_semaphore *sem) +int __down_write_trylock(struct rw_anon_semaphore *sem) { unsigned long flags; int ret = 0; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); if (sem->activity == 0 && list_empty(&sem->wait_list)) { /* granted */ @@ -268,7 +268,7 @@ int __down_write_trylock(struct rw_semap ret = 1; } - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return ret; } @@ -276,48 +276,48 @@ int __down_write_trylock(struct rw_semap /* * release a read lock on the semaphore */ -void __up_read(struct rw_semaphore *sem) +void __up_read(struct rw_anon_semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); if (--sem->activity == 0 && !list_empty(&sem->wait_list)) sem = __rwsem_wake_one_writer(sem); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } /* * release a write lock on the semaphore */ -void __up_write(struct rw_semaphore *sem) +void __up_write(struct rw_anon_semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); sem->activity = 0; if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, 1); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } /* * downgrade a write lock into a read lock * - just wake up any readers at the front of the queue */ -void __downgrade_write(struct rw_semaphore *sem) +void __downgrade_write(struct rw_anon_semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); sem->activity = 1; if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, 0); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } Index: linux-2.6/lib/rwsem.c =================================================================== --- linux-2.6.orig/lib/rwsem.c +++ linux-2.6/lib/rwsem.c @@ -11,8 +11,8 @@ /* * Initialize an rwsem: */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) +void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -22,11 +22,11 @@ void __init_rwsem(struct rw_semaphore *s lockdep_init_map(&sem->dep_map, name, key, 0); #endif sem->count = RWSEM_UNLOCKED_VALUE; - spin_lock_init(&sem->wait_lock); + raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } -EXPORT_SYMBOL(__init_rwsem); +EXPORT_SYMBOL(__init_anon_rwsem); struct rwsem_waiter { struct list_head list; @@ -54,8 
+54,8 @@ struct rwsem_waiter { * - woken process blocks are discarded from the list after having task zeroed * - writers are only woken if downgrading is false */ -static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wake_type) +static struct rw_anon_semaphore * +__rwsem_do_wake(struct rw_anon_semaphore *sem, int wake_type) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -169,8 +169,8 @@ __rwsem_do_wake(struct rw_semaphore *sem /* * wait for a lock to be granted */ -static struct rw_semaphore __sched * -rwsem_down_failed_common(struct rw_semaphore *sem, +static struct rw_anon_semaphore __sched * +rwsem_down_failed_common(struct rw_anon_semaphore *sem, unsigned int flags, signed long adjustment) { struct rwsem_waiter waiter; @@ -180,7 +180,7 @@ rwsem_down_failed_common(struct rw_semap set_task_state(tsk, TASK_UNINTERRUPTIBLE); /* set up my own style of waitqueue */ - spin_lock_irq(&sem->wait_lock); + raw_spin_lock_irq(&sem->wait_lock); waiter.task = tsk; waiter.flags = flags; get_task_struct(tsk); @@ -204,7 +204,7 @@ rwsem_down_failed_common(struct rw_semap adjustment == -RWSEM_ACTIVE_WRITE_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); - spin_unlock_irq(&sem->wait_lock); + raw_spin_unlock_irq(&sem->wait_lock); /* wait to be given the lock */ for (;;) { @@ -222,7 +222,8 @@ rwsem_down_failed_common(struct rw_semap /* * wait for the read lock to be granted */ -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) +struct rw_anon_semaphore __sched * +rwsem_down_read_failed(struct rw_anon_semaphore *sem) { return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ, -RWSEM_ACTIVE_READ_BIAS); @@ -231,7 +232,8 @@ struct rw_semaphore __sched *rwsem_down_ /* * wait for the write lock to be granted */ -struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) +struct rw_anon_semaphore __sched * +rwsem_down_write_failed(struct rw_anon_semaphore *sem) { return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE, -RWSEM_ACTIVE_WRITE_BIAS); @@ -241,17 +243,17 @@ struct rw_semaphore __sched *rwsem_down_ * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ -struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +struct rw_anon_semaphore *rwsem_wake(struct rw_anon_semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return sem; } @@ -261,17 +263,17 @@ struct rw_semaphore *rwsem_wake(struct r * - caller incremented waiting part of count and discovered it still negative * - just wake up any readers at the front of the queue */ -struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +struct rw_anon_semaphore *rwsem_downgrade_wake(struct rw_anon_semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return sem; } Index: linux-2.6/kernel/time/timer_stats.c =================================================================== --- 
linux-2.6.orig/kernel/time/timer_stats.c +++ linux-2.6/kernel/time/timer_stats.c @@ -81,7 +81,7 @@ struct entry { /* * Spinlock protecting the tables - not taken during lookup: */ -static DEFINE_SPINLOCK(table_lock); +static DEFINE_RAW_SPINLOCK(table_lock); /* * Per-CPU lookup locks for fast hash lookup: @@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct prev = NULL; curr = *head; - spin_lock(&table_lock); + raw_spin_lock(&table_lock); /* * Make sure we have not raced with another CPU: */ @@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct *head = curr; } out_unlock: - spin_unlock(&table_lock); + raw_spin_unlock(&table_lock); return curr; } Index: linux-2.6/kernel/latencytop.c =================================================================== --- linux-2.6.orig/kernel/latencytop.c +++ linux-2.6/kernel/latencytop.c @@ -58,7 +58,7 @@ #include #include -static DEFINE_SPINLOCK(latency_lock); +static DEFINE_RAW_SPINLOCK(latency_lock); #define MAXLR 128 static struct latency_record latency_record[MAXLR]; @@ -72,19 +72,19 @@ void clear_all_latency_tracing(struct ta if (!latencytop_enabled) return; - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); memset(&p->latency_record, 0, sizeof(p->latency_record)); p->latency_record_count = 0; - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static void clear_global_latency_tracing(void) { unsigned long flags; - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); memset(&latency_record, 0, sizeof(latency_record)); - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static void __sched @@ -190,7 +190,7 @@ __account_scheduler_latency(struct task_ lat.max = usecs; store_stacktrace(tsk, &lat); - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); account_global_scheduler_latency(tsk, &lat); @@ -231,7 +231,7 @@ __account_scheduler_latency(struct task_ memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); out_unlock: - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static int lstats_show(struct seq_file *m, void *v) Index: linux-2.6/drivers/video/console/vgacon.c =================================================================== --- linux-2.6.orig/drivers/video/console/vgacon.c +++ linux-2.6/drivers/video/console/vgacon.c @@ -50,7 +50,7 @@ #include