diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 752fe56..1e649c4 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -94,7 +94,11 @@ static __always_inline bool __preempt_count_dec_and_test(void)
 {
 	if (____preempt_count_dec_and_test())
 		return true;
+#ifdef CONFIG_PREEMPT_LAZY
 	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
+	return false;
+#endif
 }
 
 /*
@@ -102,8 +106,12 @@ static __always_inline bool __preempt_count_dec_and_test(void)
  */
 static __always_inline bool should_resched(void)
 {
+#ifdef CONFIG_PREEMPT_LAZY
 	return unlikely(!__this_cpu_read_4(__preempt_count) || \
 		test_thread_flag(TIF_NEED_RESCHED_LAZY));
+#else
+	return unlikely(!__this_cpu_read_4(__preempt_count));
+#endif
 }
 
 #ifdef CONFIG_PREEMPT
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 7c8b356..5701b50 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -72,4 +72,5 @@ void common(void) {
 
 	BLANK();
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+	DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
 }
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index fd2d976..6157ed6 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -365,19 +365,22 @@ ENTRY(resume_kernel)
 need_resched:
 	# preempt count == 0 + NEED_RS set?
 	cmpl $0,PER_CPU_VAR(__preempt_count)
+#ifndef CONFIG_PREEMPT_LAZY
+	jnz restore_all
+#else
 	jz test_int_off
 
 	# atleast preempt count == 0 ?
-	cmpl $_TIF_NEED_RESCHED,PER_CPU_VAR(__preempt_count)
+	cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
 	jne restore_all
 
 	cmpl $0,TI_preempt_lazy_count(%ebp)	# non-zero preempt_lazy_count ?
 	jnz restore_all
 
-	testl $_TIF_NEED_RESCHED_LAZY, %ecx
+	testl $_TIF_NEED_RESCHED_LAZY, TI_flags(%ebp)
 	jz restore_all
-
 test_int_off:
+#endif
 	testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)	# interrupts off (exception path) ?
 	jz restore_all
 	call preempt_schedule_irq
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b650b43..d893814 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -658,8 +658,8 @@ GLOBAL(system_call_after_swapgs)
 	/* Handle reschedules */
 	/* edx:	work, edi: workmask */
 sysret_careful:
-	bt $TIF_NEED_RESCHED,%edx
-	jnc sysret_signal
+	testl $_TIF_NEED_RESCHED_MASK,%edx
+	jz sysret_signal
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq_cfi %rdi
@@ -771,8 +771,8 @@ GLOBAL(int_with_check)
 	/* First do a reschedule test. */
 	/* edx:	work, edi: workmask */
 int_careful:
-	bt $TIF_NEED_RESCHED,%edx
-	jnc int_very_careful
+	testl $_TIF_NEED_RESCHED_MASK,%edx
+	jz int_very_careful
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq_cfi %rdi
@@ -1071,8 +1071,8 @@ ENTRY(native_iret)
 	/* edi: workmask, edx: work */
 retint_careful:
 	CFI_RESTORE_STATE
-	bt $TIF_NEED_RESCHED,%edx
-	jnc retint_signal
+	testl $_TIF_NEED_RESCHED_MASK,%edx
+	jz retint_signal
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq_cfi %rdi
@@ -1104,7 +1104,22 @@ ENTRY(native_iret)
 	/* rcx: threadinfo. interrupts off. */
 ENTRY(retint_kernel)
 	cmpl $0,PER_CPU_VAR(__preempt_count)
+#ifndef CONFIG_PREEMPT_LAZY
 	jnz retint_restore_args
+#else
+	jz check_int_off
+
+	# atleast preempt count == 0 ?
+	cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
+	jnz retint_restore_args
+
+	cmpl $0, TI_preempt_lazy_count(%rcx)
+	jnz retint_restore_args
+
+	bt $TIF_NEED_RESCHED_LAZY,TI_flags(%rcx)
+	jnc retint_restore_args
+check_int_off:
+#endif
 	bt $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
 	jnc retint_restore_args
 	call preempt_schedule_irq
@@ -1540,7 +1555,7 @@ ENTRY(paranoid_exit)
 	movq %rsp,%rdi			/* &pt_regs */
 	call sync_regs
 	movq %rax,%rsp			/* switch stack for scheduling */
-	testl $_TIF_NEED_RESCHED,%ebx
+	testl $_TIF_NEED_RESCHED_MASK,%ebx
 	jnz paranoid_schedule
 	movl %ebx,%edx			/* arg3: thread flags */
 	TRACE_IRQS_ON
diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
index 136ef86..37acc3a 100644
--- a/block/blk-mq-cpu.c
+++ b/block/blk-mq-cpu.c
@@ -11,7 +11,7 @@
 #include "blk-mq.h"
 
 static LIST_HEAD(blk_mq_cpu_notify_list);
-static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock);
+static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
 
 static int blk_mq_main_cpu_notify(struct notifier_block *self,
 				  unsigned long action, void *hcpu)
@@ -19,12 +19,15 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
 	unsigned int cpu = (unsigned long) hcpu;
 	struct blk_mq_cpu_notifier *notify;
 
-	raw_spin_lock(&blk_mq_cpu_notify_lock);
+	if (action != CPU_POST_DEAD && action != CPU_POST_DEAD)
+		return NOTIFY_OK;
+
+	spin_lock(&blk_mq_cpu_notify_lock);
 
 	list_for_each_entry(notify, &blk_mq_cpu_notify_list, list)
 		notify->notify(notify->data, action, cpu);
 
-	raw_spin_unlock(&blk_mq_cpu_notify_lock);
+	spin_unlock(&blk_mq_cpu_notify_lock);
 	return NOTIFY_OK;
 }
 
@@ -32,16 +35,16 @@ void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
 {
 	BUG_ON(!notifier->notify);
 
-	raw_spin_lock(&blk_mq_cpu_notify_lock);
+	spin_lock(&blk_mq_cpu_notify_lock);
 	list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
-	raw_spin_unlock(&blk_mq_cpu_notify_lock);
+	spin_unlock(&blk_mq_cpu_notify_lock);
 }
 
 void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
 {
-	raw_spin_lock(&blk_mq_cpu_notify_lock);
+	spin_lock(&blk_mq_cpu_notify_lock);
 	list_del(&notifier->list);
-	raw_spin_unlock(&blk_mq_cpu_notify_lock);
+	spin_unlock(&blk_mq_cpu_notify_lock);
 }
 
 void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a5f25f9..5fb26f7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -48,9 +48,14 @@ static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
 	return __blk_mq_get_ctx(q, get_cpu_light());
 }
 
-static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
+static void __blk_mq_put_ctx(struct blk_mq_ctx *ctx)
 {
 	spin_unlock(&ctx->cpu_lock);
+}
+
+static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
+{
+	__blk_mq_put_ctx(ctx);
 	put_cpu_light();
 }
 
@@ -966,7 +971,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 	struct blk_mq_ctx *ctx;
 	LIST_HEAD(tmp);
 
-	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
+	if (action != CPU_POST_DEAD && action != CPU_POST_DEAD)
 		return;
 
 	/*
@@ -980,6 +985,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 		clear_bit(ctx->index_hw, hctx->ctx_map);
 	}
 	spin_unlock(&ctx->lock);
+	__blk_mq_put_ctx(ctx);
 
 	if (list_empty(&tmp))
 		return;
diff --git a/include/linux/lglock.h b/include/linux/lglock.h
index 2b2204e..534b16e 100644
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -74,4 +74,10 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu);
 void lg_global_lock(struct lglock *lg);
 void lg_global_unlock(struct lglock *lg);
 
+#ifndef CONFIG_PREEMPT_RT_FULL
+#define lg_global_trylock_relax(name)	lg_global_lock(name)
+#else
+void lg_global_trylock_relax(struct lglock *lg);
+#endif
+
 #endif
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 116af6a..5b2cdf4 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -126,8 +126,7 @@ do { \
 #define preempt_enable_notrace() \
 do { \
 	barrier(); \
-	if (unlikely(__preempt_count_dec_and_test() || \
-				test_thread_flag(TIF_NEED_RESCHED_LAZY))) \
+	if (unlikely(__preempt_count_dec_and_test())) \
 		__preempt_schedule_context(); \
 } while (0)
 #else
diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
index 924c2d2..0065b08 100644
--- a/include/linux/rwsem_rt.h
+++ b/include/linux/rwsem_rt.h
@@ -20,7 +20,6 @@
 
 struct rw_semaphore {
 	struct rt_mutex		lock;
-	int			read_depth;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
 #endif
diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
index ac6f08b..c0d1367 100644
--- a/include/linux/spinlock_rt.h
+++ b/include/linux/spinlock_rt.h
@@ -35,6 +35,7 @@ extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
  */
 extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
 extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
+extern int __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
 
 #define spin_lock(lock)				\
 	do {					\
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 041fada..ce00329 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -649,7 +649,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		smpboot_unpark_threads(cpu);
 		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
-		goto out_cancel;
+		goto out_release;
 	}
 	BUG_ON(cpu_online(cpu));
 
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
index f2356df..9397974 100644
--- a/kernel/locking/lglock.c
+++ b/kernel/locking/lglock.c
@@ -105,3 +105,28 @@ void lg_global_unlock(struct lglock *lg)
 	preempt_enable_nort();
 }
 EXPORT_SYMBOL(lg_global_unlock);
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+/*
+ * HACK: If you use this, you get to keep the pieces.
+ * Used in queue_stop_cpus_work() when stop machinery
+ * is called from inactive CPU, so we can't schedule.
+ */
+# define lg_do_trylock_relax(l)			\
+	do {					\
+		while (!__rt_spin_trylock(l))	\
+			cpu_relax();		\
+	} while (0)
+
+void lg_global_trylock_relax(struct lglock *lg)
+{
+	int i;
+
+	lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+	for_each_possible_cpu(i) {
+		lg_lock_ptr *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		lg_do_trylock_relax(lock);
+	}
+}
+#endif
diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c
index 5d17727..055a3df 100644
--- a/kernel/locking/rt.c
+++ b/kernel/locking/rt.c
@@ -180,12 +180,14 @@ EXPORT_SYMBOL(_mutex_unlock);
  */
 int __lockfunc rt_write_trylock(rwlock_t *rwlock)
 {
-	int ret = rt_mutex_trylock(&rwlock->lock);
+	int ret;
 
-	if (ret) {
+	migrate_disable();
+	ret = rt_mutex_trylock(&rwlock->lock);
+	if (ret)
 		rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
-		migrate_disable();
-	}
+	else
+		migrate_enable();
 
 	return ret;
 }
@@ -212,11 +214,13 @@ int __lockfunc rt_read_trylock(rwlock_t *rwlock)
 	 * write locked.
 	 */
 	if (rt_mutex_owner(lock) != current) {
+		migrate_disable();
 		ret = rt_mutex_trylock(lock);
-		if (ret) {
+		if (ret)
 			rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
-			migrate_disable();
-		}
+		else
+			migrate_enable();
+
 	} else if (!rwlock->read_depth) {
 		ret = 0;
 	}
@@ -240,13 +244,14 @@ void __lockfunc rt_read_lock(rwlock_t *rwlock)
 {
 	struct rt_mutex *lock = &rwlock->lock;
 
+
 	/*
 	 * recursive read locks succeed when current owns the lock
 	 */
 	if (rt_mutex_owner(lock) != current) {
-		rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
-		__rt_spin_lock(lock);
 		migrate_disable();
+		rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
+		__rt_spin_lock(lock);
 	}
 	rwlock->read_depth++;
 }
@@ -316,10 +321,8 @@ EXPORT_SYMBOL(rt_up_write);
 
 void  rt_up_read(struct rw_semaphore *rwsem)
 {
-	if (--rwsem->read_depth == 0) {
-		rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
-		rt_mutex_unlock(&rwsem->lock);
-	}
+	rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
+	rt_mutex_unlock(&rwsem->lock);
 }
 EXPORT_SYMBOL(rt_up_read);
 
@@ -330,7 +333,6 @@ EXPORT_SYMBOL(rt_up_read);
 void  rt_downgrade_write(struct rw_semaphore *rwsem)
 {
 	BUG_ON(rt_mutex_owner(&rwsem->lock) != current);
-	rwsem->read_depth = 1;
 }
 EXPORT_SYMBOL(rt_downgrade_write);
 
@@ -367,37 +369,20 @@ void  rt_down_write_nested_lock(struct rw_semaphore *rwsem,
 
 int  rt_down_read_trylock(struct rw_semaphore *rwsem)
 {
-	struct rt_mutex *lock = &rwsem->lock;
-	int ret = 1;
-
-	/*
-	 * recursive read locks succeed when current owns the rwsem,
-	 * but not when read_depth == 0 which means that the rwsem is
-	 * write locked.
-	 */
-	if (rt_mutex_owner(lock) != current) {
-		ret = rt_mutex_trylock(&rwsem->lock);
-		if (ret)
-			rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
-	} else if (!rwsem->read_depth) {
-		ret = 0;
-	}
+	int ret;
 
+	ret = rt_mutex_trylock(&rwsem->lock);
 	if (ret)
-		rwsem->read_depth++;
+		rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
+
 	return ret;
 }
 EXPORT_SYMBOL(rt_down_read_trylock);
 
 static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
 {
-	struct rt_mutex *lock = &rwsem->lock;
-
-	if (rt_mutex_owner(lock) != current) {
-		rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
-		rt_mutex_lock(&rwsem->lock);
-	}
-	rwsem->read_depth++;
+	rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
+	rt_mutex_lock(&rwsem->lock);
 }
 
 void rt_down_read(struct rw_semaphore *rwsem)
@@ -422,7 +407,6 @@ void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
 	debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
 	lockdep_init_map(&rwsem->dep_map, name, key, 0);
 #endif
-	rwsem->read_depth = 0;
 	rwsem->lock.save_state = 0;
 }
 EXPORT_SYMBOL(__rt_rwsem_init);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 42f4f28..5c5cc76 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1001,6 +1001,11 @@ void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
 }
 EXPORT_SYMBOL(rt_spin_unlock_wait);
 
+int __lockfunc __rt_spin_trylock(struct rt_mutex *lock)
+{
+	return rt_mutex_trylock(lock);
+}
+
 int __lockfunc rt_spin_trylock(spinlock_t *lock)
 {
 	int ret = rt_mutex_trylock(&lock->lock);
@@ -1045,12 +1050,12 @@ int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
 	/* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
 	if (atomic_add_unless(atomic, -1, 1))
 		return 0;
+	migrate_disable();
 	rt_spin_lock(lock);
-	if (atomic_dec_and_test(atomic)){
-		migrate_disable();
+	if (atomic_dec_and_test(atomic))
 		return 1;
-	}
 	rt_spin_unlock(lock);
+	migrate_enable();
 	return 0;
 }
 EXPORT_SYMBOL(atomic_dec_and_spin_lock);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index aaae9f1..bcbae9c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -266,7 +266,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 	struct irq_cpu_stop_queue_work_info call_args;
 	struct multi_stop_data msdata;
 
-	preempt_disable();
+	preempt_disable_nort();
 	msdata = (struct multi_stop_data){
 		.fn = fn,
 		.data = arg,
@@ -299,7 +299,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 	 * This relies on the stopper workqueues to be FIFO.
 	 */
 	if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
-		preempt_enable();
+		preempt_enable_nort();
 		return -ENOENT;
 	}
 
@@ -313,7 +313,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 				  &irq_cpu_stop_queue_work,
 				  &call_args, 1);
 	lg_local_unlock(&stop_cpus_lock);
-	preempt_enable();
+	preempt_enable_nort();
 
 	wait_for_stop_done(&done);
 
@@ -346,7 +346,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
 
 static void queue_stop_cpus_work(const struct cpumask *cpumask,
 				 cpu_stop_fn_t fn, void *arg,
-				 struct cpu_stop_done *done)
+				 struct cpu_stop_done *done, bool inactive)
 {
 	struct cpu_stop_work *work;
 	unsigned int cpu;
@@ -360,11 +360,13 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
 	}
 
 	/*
-	 * Disable preemption while queueing to avoid getting
-	 * preempted by a stopper which might wait for other stoppers
-	 * to enter @fn which can lead to deadlock.
+	 * Make sure that all work is queued on all cpus before
+	 * any of the cpus can execute it.
 	 */
-	lg_global_lock(&stop_cpus_lock);
+	if (!inactive)
+		lg_global_lock(&stop_cpus_lock);
+	else
+		lg_global_trylock_relax(&stop_cpus_lock);
 	for_each_cpu(cpu, cpumask)
 		cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
 	lg_global_unlock(&stop_cpus_lock);
@@ -376,7 +378,7 @@ static int __stop_cpus(const struct cpumask *cpumask,
 	struct cpu_stop_done done;
 
 	cpu_stop_init_done(&done, cpumask_weight(cpumask));
-	queue_stop_cpus_work(cpumask, fn, arg, &done);
+	queue_stop_cpus_work(cpumask, fn, arg, &done, false);
 	wait_for_stop_done(&done);
 	return done.executed ? done.ret : -ENOENT;
 }
@@ -572,6 +574,8 @@ static int __init cpu_stop_init(void)
 		INIT_LIST_HEAD(&stopper->works);
 	}
 
+	lg_lock_init(&stop_cpus_lock, "stop_cpus_lock");
+
 	BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
 	stop_machine_initialized = true;
 	return 0;
@@ -667,11 +671,11 @@ int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
 	set_state(&msdata, MULTI_STOP_PREPARE);
 	cpu_stop_init_done(&done, num_active_cpus());
 	queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
			     &done, true);
 	ret = multi_cpu_stop(&msdata);
 
 	/* Busy wait for completion. */
-	while (!atomic_read(&done.nr_todo))
+	while (atomic_read(&done.nr_todo))
 		cpu_relax();
 
 	mutex_unlock(&stop_cpus_mutex);
diff --git a/kernel/timer.c b/kernel/timer.c
index 54596b5..8750875 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1461,6 +1461,19 @@ void run_local_timers(void)
 	 * the timer softirq.
 	 */
 #ifdef CONFIG_PREEMPT_RT_FULL
+
+#ifndef CONFIG_SMP
+	/*
+	 * The spin_do_trylock() later may fail as the lock may be hold before
+	 * the interrupt arrived. The spin-lock debugging code will raise a
+	 * warning if the try_lock fails on UP. Since this is only an
+	 * optimization for the FULL_NO_HZ case (not to run the timer softirq on
+	 * an nohz_full CPU) we don't really care and shedule the softirq.
+	 */
+	raise_softirq(TIMER_SOFTIRQ);
+	return;
+#endif
+
 	/* On RT, irq work runs from softirq */
 	if (irq_work_needs_cpu()) {
 		raise_softirq(TIMER_SOFTIRQ);
diff --git a/localversion-rt b/localversion-rt
index c3054d0..1445cd6 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt2
+-rt3