Index: linux-3.0/mm/memory.c =================================================================== --- linux-3.0.orig/mm/memory.c +++ linux-3.0/mm/memory.c @@ -1290,13 +1290,6 @@ static unsigned long unmap_page_range(st return addr; } -#ifdef CONFIG_PREEMPT -# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) -#else -/* No preempt: go for improved straight-line efficiency */ -# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) -#endif - /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather @@ -3435,6 +3428,32 @@ unlock: return 0; } +#ifdef CONFIG_PREEMPT_RT_FULL +void pagefault_disable(void) +{ + migrate_disable(); + current->pagefault_disabled++; + /* + * make sure to have issued the store before a pagefault + * can hit. + */ + barrier(); +} +EXPORT_SYMBOL_GPL(pagefault_disable); + +void pagefault_enable(void) +{ + /* + * make sure to issue those last loads/stores before enabling + * the pagefault handler again. + */ + barrier(); + current->pagefault_disabled--; + migrate_enable(); +} +EXPORT_SYMBOL_GPL(pagefault_enable); +#endif + /* * By the time we get here, we already hold the mm semaphore */ @@ -3983,3 +4002,35 @@ void copy_user_huge_page(struct page *ds } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + +#if defined(CONFIG_PREEMPT_RT_FULL) && (USE_SPLIT_PTLOCKS > 0) +/* + * Heinous hack, relies on the caller doing something like: + * + * pte = alloc_pages(PGALLOC_GFP, 0); + * if (pte) + * pgtable_page_ctor(pte); + * return pte; + * + * This ensures we release the page and return NULL when the + * lock allocation fails. + */ +struct page *pte_lock_init(struct page *page) +{ + page->ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + if (page->ptl) { + spin_lock_init(__pte_lockptr(page)); + } else { + __free_page(page); + page = NULL; + } + return page; +} + +void pte_lock_deinit(struct page *page) +{ + kfree(page->ptl); + page->mapping = NULL; +} + +#endif Index: linux-3.0/kernel/sched_cpupri.c =================================================================== --- linux-3.0.orig/kernel/sched_cpupri.c +++ linux-3.0/kernel/sched_cpupri.c @@ -47,9 +47,6 @@ static int convert_prio(int prio) return cpupri; } -#define for_each_cpupri_active(array, idx) \ - for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) - /** * cpupri_find - find the best (lowest-pri) CPU in the system * @cp: The cpupri context @@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struc int idx = 0; int task_pri = convert_prio(p->prio); - for_each_cpupri_active(cp->pri_active, idx) { + if (task_pri >= MAX_RT_PRIO) + return 0; + + for (idx = 0; idx < task_pri; idx++) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + int skip = 0; - if (idx >= task_pri) - break; + if (!atomic_read(&(vec)->count)) + skip = 1; + /* + * When looking at the vector, we need to read the counter, + * do a memory barrier, then read the mask. + * + * Note: This is still all racey, but we can deal with it. + * Ideally, we only want to look at masks that are set. + * + * If a mask is not set, then the only thing wrong is that we + * did a little more work than necessary. + * + * If we read a zero count but the mask is set, because of the + * memory barriers, that can only happen when the highest prio + * task for a run queue has left the run queue, in which case, + * it will be followed by a pull. If the task we are processing + * fails to find a proper place to go, that pull request will + * pull this task if the run queue is running at a lower + * priority. 
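The pagefault_disable()/pagefault_enable() pair added to mm/memory.c above is, apart from the migrate_disable()/migrate_enable() calls, just a per-task nesting counter bracketed by compiler barriers. A minimal, compilable userspace sketch of that counter-plus-barrier pattern; the thread-local variable and the *_sketch names are illustrative stand-ins, not kernel API:

/* Userspace sketch of the nesting counter behind pagefault_disable()
 * on PREEMPT_RT_FULL; migrate_disable()/migrate_enable() are omitted. */
#include <stdio.h>

#define barrier() __asm__ __volatile__("" ::: "memory")

static __thread int pagefault_disabled;	/* stands in for current->pagefault_disabled */

static void pagefault_disable_sketch(void)
{
	pagefault_disabled++;
	/* make sure the store is visible before a fault can be taken */
	barrier();
}

static void pagefault_enable_sketch(void)
{
	/* finish the preceding loads/stores before re-enabling the handler */
	barrier();
	pagefault_disabled--;
}

int main(void)
{
	pagefault_disable_sketch();
	printf("nesting level: %d\n", pagefault_disabled);	/* 1 */
	pagefault_enable_sketch();
	printf("nesting level: %d\n", pagefault_disabled);	/* 0 */
	return 0;
}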
+ */ + smp_rmb(); + + /* Need to do the rmb for every iteration */ + if (skip) + continue; if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) continue; @@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int c { int *currpri = &cp->cpu_to_pri[cpu]; int oldpri = *currpri; - unsigned long flags; + int do_mb = 0; newpri = convert_prio(newpri); @@ -134,26 +158,41 @@ void cpupri_set(struct cpupri *cp, int c if (likely(newpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; - raw_spin_lock_irqsave(&vec->lock, flags); - cpumask_set_cpu(cpu, vec->mask); - vec->count++; - if (vec->count == 1) - set_bit(newpri, cp->pri_active); - - raw_spin_unlock_irqrestore(&vec->lock, flags); + /* + * When adding a new vector, we update the mask first, + * do a write memory barrier, and then update the count, to + * make sure the vector is visible when count is set. + */ + smp_mb__before_atomic_inc(); + atomic_inc(&(vec)->count); + do_mb = 1; } if (likely(oldpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; - raw_spin_lock_irqsave(&vec->lock, flags); - - vec->count--; - if (!vec->count) - clear_bit(oldpri, cp->pri_active); + /* + * Because the order of modification of the vec->count + * is important, we must make sure that the update + * of the new prio is seen before we decrement the + * old prio. This makes sure that the loop sees + * one or the other when we raise the priority of + * the run queue. We don't care about when we lower the + * priority, as that will trigger an rt pull anyway. + * + * We only need to do a memory barrier if we updated + * the new priority vec. + */ + if (do_mb) + smp_mb__after_atomic_inc(); + + /* + * When removing from the vector, we decrement the counter first + * do a memory barrier and then clear the mask. 
+ */ + atomic_dec(&(vec)->count); + smp_mb__after_atomic_inc(); cpumask_clear_cpu(cpu, vec->mask); - - raw_spin_unlock_irqrestore(&vec->lock, flags); } *currpri = newpri; @@ -175,8 +214,7 @@ int cpupri_init(struct cpupri *cp) for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { struct cpupri_vec *vec = &cp->pri_to_cpu[i]; - raw_spin_lock_init(&vec->lock); - vec->count = 0; + atomic_set(&vec->count, 0); if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) goto cleanup; } Index: linux-3.0/kernel/sched_cpupri.h =================================================================== --- linux-3.0.orig/kernel/sched_cpupri.h +++ linux-3.0/kernel/sched_cpupri.h @@ -12,9 +12,8 @@ /* values 2-101 are RT priorities 0-99 */ struct cpupri_vec { - raw_spinlock_t lock; - int count; - cpumask_var_t mask; + atomic_t count; + cpumask_var_t mask; }; struct cpupri { Index: linux-3.0/mm/slab.c =================================================================== --- linux-3.0.orig/mm/slab.c +++ linux-3.0/mm/slab.c @@ -116,6 +116,7 @@ #include #include #include +#include #include #include @@ -620,6 +621,51 @@ int slab_is_available(void) static struct lock_class_key on_slab_l3_key; static struct lock_class_key on_slab_alc_key; +static struct lock_class_key debugobj_l3_key; +static struct lock_class_key debugobj_alc_key; + +static void slab_set_lock_classes(struct kmem_cache *cachep, + struct lock_class_key *l3_key, struct lock_class_key *alc_key, + int q) +{ + struct array_cache **alc; + struct kmem_list3 *l3; + int r; + + l3 = cachep->nodelists[q]; + if (!l3) + return; + + lockdep_set_class(&l3->list_lock, l3_key); + alc = l3->alien; + /* + * FIXME: This check for BAD_ALIEN_MAGIC + * should go away when common slab code is taught to + * work even without alien caches. + * Currently, non NUMA code returns BAD_ALIEN_MAGIC + * for alloc_alien_cache, + */ + if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) + return; + for_each_node(r) { + if (alc[r]) + lockdep_set_class(&alc[r]->lock, alc_key); + } +} + +static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) +{ + slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node); +} + +static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) +{ + int node; + + for_each_online_node(node) + slab_set_debugobj_lock_classes_node(cachep, node); +} + static void init_node_lock_keys(int q) { struct cache_sizes *s = malloc_sizes; @@ -628,29 +674,14 @@ static void init_node_lock_keys(int q) return; for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { - struct array_cache **alc; struct kmem_list3 *l3; - int r; l3 = s->cs_cachep->nodelists[q]; if (!l3 || OFF_SLAB(s->cs_cachep)) continue; - lockdep_set_class(&l3->list_lock, &on_slab_l3_key); - alc = l3->alien; - /* - * FIXME: This check for BAD_ALIEN_MAGIC - * should go away when common slab code is taught to - * work even without alien caches. 
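The cpupri changes above replace the per-vector raw spinlock with an atomic counter plus the ordering spelled out in the comments: the writer makes the mask visible before the count says "non-empty" (and drops the count before clearing the mask), while the reader checks the count, issues a read barrier, then looks at the mask. A compilable userspace analogue of that ordering, with a uint64_t standing in for cpumask_var_t and C11 fences standing in for the kernel barriers (all names here are illustrative):

/* Userspace sketch of the lock-free count/mask ordering described in the
 * cpupri comments above.  Illustrative only, not the kernel code. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct vec_sketch {
	atomic_int count;
	_Atomic uint64_t mask;
};

/* writer: publish the CPU in the mask before the count says "non-empty" */
static void vec_add_cpu(struct vec_sketch *v, int cpu)
{
	atomic_fetch_or(&v->mask, UINT64_C(1) << cpu);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__before_atomic_inc() */
	atomic_fetch_add(&v->count, 1);
}

/* writer: drop the count first, then clear the mask */
static void vec_del_cpu(struct vec_sketch *v, int cpu)
{
	atomic_fetch_sub(&v->count, 1);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__after_atomic_inc() */
	atomic_fetch_and(&v->mask, ~(UINT64_C(1) << cpu));
}

/* reader: skip empty vectors, read barrier, then look at the mask */
static uint64_t vec_snapshot(struct vec_sketch *v)
{
	if (!atomic_load(&v->count))
		return 0;
	atomic_thread_fence(memory_order_acquire);	/* smp_rmb() */
	return atomic_load(&v->mask);
}

int main(void)
{
	struct vec_sketch v = { 0 };

	vec_add_cpu(&v, 3);
	printf("mask after add: %#llx\n", (unsigned long long)vec_snapshot(&v));
	vec_del_cpu(&v, 3);
	printf("mask after del: %#llx\n", (unsigned long long)vec_snapshot(&v));
	return 0;
}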
- * Currently, non NUMA code returns BAD_ALIEN_MAGIC - * for alloc_alien_cache, - */ - if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) - continue; - for_each_node(r) { - if (alc[r]) - lockdep_set_class(&alc[r]->lock, - &on_slab_alc_key); - } + + slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key, + &on_slab_alc_key, q); } } @@ -669,6 +700,14 @@ static void init_node_lock_keys(int q) static inline void init_lock_keys(void) { } + +static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) +{ +} + +static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) +{ +} #endif /* @@ -678,12 +717,66 @@ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); +static DEFINE_PER_CPU(struct list_head, slab_free_list); +static DEFINE_LOCAL_IRQ_LOCK(slab_lock); + +#ifndef CONFIG_PREEMPT_RT_BASE +# define slab_on_each_cpu(func, cp) on_each_cpu(func, cp, 1) +#else +/* + * execute func() for all CPUs. On PREEMPT_RT we dont actually have + * to run on the remote CPUs - we only have to take their CPU-locks. + * (This is a rare operation, so cacheline bouncing is not an issue.) + */ +static void +slab_on_each_cpu(void (*func)(void *arg, int this_cpu), void *arg) +{ + unsigned int i; + + for_each_online_cpu(i) + func(arg, i); +} +#endif + +static void free_delayed(struct list_head *h) +{ + while(!list_empty(h)) { + struct page *page = list_first_entry(h, struct page, lru); + + list_del(&page->lru); + __free_pages(page, page->index); + } +} + +static void unlock_l3_and_free_delayed(spinlock_t *list_lock) +{ + LIST_HEAD(tmp); + + list_splice_init(&__get_cpu_var(slab_free_list), &tmp); + local_spin_unlock_irq(slab_lock, list_lock); + free_delayed(&tmp); +} + +static void unlock_slab_and_free_delayed(unsigned long flags) +{ + LIST_HEAD(tmp); + + list_splice_init(&__get_cpu_var(slab_free_list), &tmp); + local_unlock_irqrestore(slab_lock, flags); + free_delayed(&tmp); +} static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) { return cachep->array[smp_processor_id()]; } +static inline struct array_cache *cpu_cache_get_on_cpu(struct kmem_cache *cachep, + int cpu) +{ + return cachep->array[cpu]; +} + static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) { @@ -1021,9 +1114,10 @@ static void reap_alien(struct kmem_cache if (l3->alien) { struct array_cache *ac = l3->alien[node]; - if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { + if (ac && ac->avail && + local_spin_trylock_irq(slab_lock, &ac->lock)) { __drain_alien_cache(cachep, ac, node); - spin_unlock_irq(&ac->lock); + local_spin_unlock_irq(slab_lock, &ac->lock); } } } @@ -1038,9 +1132,9 @@ static void drain_alien_cache(struct kme for_each_online_node(i) { ac = alien[i]; if (ac) { - spin_lock_irqsave(&ac->lock, flags); + local_spin_lock_irqsave(slab_lock, &ac->lock, flags); __drain_alien_cache(cachep, ac, i); - spin_unlock_irqrestore(&ac->lock, flags); + local_spin_unlock_irqrestore(slab_lock, &ac->lock, flags); } } } @@ -1119,11 +1213,11 @@ static int init_cache_nodelists_node(int cachep->nodelists[node] = l3; } - spin_lock_irq(&cachep->nodelists[node]->list_lock); + local_spin_lock_irq(slab_lock, &cachep->nodelists[node]->list_lock); cachep->nodelists[node]->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->nodelists[node]->list_lock); + local_spin_unlock_irq(slab_lock, &cachep->nodelists[node]->list_lock); } return 0; } @@ -1148,7 +1242,7 @@ 
static void __cpuinit cpuup_canceled(lon if (!l3) goto free_array_cache; - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); /* Free limit for this kmem_list3 */ l3->free_limit -= cachep->batchcount; @@ -1156,7 +1250,7 @@ static void __cpuinit cpuup_canceled(lon free_block(cachep, nc->entry, nc->avail, node); if (!cpumask_empty(mask)) { - spin_unlock_irq(&l3->list_lock); + unlock_l3_and_free_delayed(&l3->list_lock); goto free_array_cache; } @@ -1170,7 +1264,7 @@ static void __cpuinit cpuup_canceled(lon alien = l3->alien; l3->alien = NULL; - spin_unlock_irq(&l3->list_lock); + unlock_l3_and_free_delayed(&l3->list_lock); kfree(shared); if (alien) { @@ -1244,7 +1338,7 @@ static int __cpuinit cpuup_prepare(long l3 = cachep->nodelists[node]; BUG_ON(!l3); - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); if (!l3->shared) { /* * We are serialised from CPU_DEAD or @@ -1259,9 +1353,11 @@ static int __cpuinit cpuup_prepare(long alien = NULL; } #endif - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); kfree(shared); free_alien_cache(alien); + if (cachep->flags & SLAB_DEBUG_OBJECTS) + slab_set_debugobj_lock_classes_node(cachep, node); } init_node_lock_keys(node); @@ -1448,6 +1544,10 @@ void __init kmem_cache_init(void) if (num_possible_nodes() == 1) use_alien_caches = 0; + local_irq_lock_init(slab_lock); + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(slab_free_list, i)); + for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); if (i < MAX_NUMNODES) @@ -1625,6 +1725,9 @@ void __init kmem_cache_init_late(void) { struct kmem_cache *cachep; + /* Annotate slab for lockdep -- annotate the malloc caches */ + init_lock_keys(); + /* 6) resize the head arrays to their final sizes */ mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) @@ -1635,9 +1738,6 @@ void __init kmem_cache_init_late(void) /* Done! */ g_cpucache_up = FULL; - /* Annotate slab for lockdep -- annotate the malloc caches */ - init_lock_keys(); - /* * Register a cpu startup notifier callback that initializes * cpu_cache_get for all new cpus @@ -1725,12 +1825,14 @@ static void *kmem_getpages(struct kmem_c /* * Interface to system's page release. 
*/ -static void kmem_freepages(struct kmem_cache *cachep, void *addr) +static void kmem_freepages(struct kmem_cache *cachep, void *addr, bool delayed) { unsigned long i = (1 << cachep->gfporder); - struct page *page = virt_to_page(addr); + struct page *page, *basepage = virt_to_page(addr); const unsigned long nr_freed = i; + page = basepage; + kmemcheck_free_shadow(page, cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) @@ -1746,7 +1848,13 @@ static void kmem_freepages(struct kmem_c } if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - free_pages((unsigned long)addr, cachep->gfporder); + + if (!delayed) { + free_pages((unsigned long)addr, cachep->gfporder); + } else { + basepage->index = cachep->gfporder; + list_add(&basepage->lru, &__get_cpu_var(slab_free_list)); + } } static void kmem_rcu_free(struct rcu_head *head) @@ -1754,7 +1862,7 @@ static void kmem_rcu_free(struct rcu_hea struct slab_rcu *slab_rcu = (struct slab_rcu *)head; struct kmem_cache *cachep = slab_rcu->cachep; - kmem_freepages(cachep, slab_rcu->addr); + kmem_freepages(cachep, slab_rcu->addr, false); if (OFF_SLAB(cachep)) kmem_cache_free(cachep->slabp_cache, slab_rcu); } @@ -1973,7 +2081,8 @@ static void slab_destroy_debugcheck(stru * Before calling the slab must have been unlinked from the cache. The * cache-lock is not held/needed. */ -static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp, + bool delayed) { void *addr = slabp->s_mem - slabp->colouroff; @@ -1986,7 +2095,7 @@ static void slab_destroy(struct kmem_cac slab_rcu->addr = addr; call_rcu(&slab_rcu->head, kmem_rcu_free); } else { - kmem_freepages(cachep, addr); + kmem_freepages(cachep, addr, delayed); if (OFF_SLAB(cachep)) kmem_cache_free(cachep->slabp_cache, slabp); } @@ -2424,6 +2533,16 @@ kmem_cache_create (const char *name, siz goto oops; } + if (flags & SLAB_DEBUG_OBJECTS) { + /* + * Would deadlock through slab_destroy()->call_rcu()-> + * debug_object_activate()->kmem_cache_alloc(). 
+ */ + WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); + + slab_set_debugobj_lock_classes(cachep); + } + /* cache setup completed, link it into the list */ list_add(&cachep->next, &cache_chain); oops: @@ -2441,7 +2560,7 @@ EXPORT_SYMBOL(kmem_cache_create); #if DEBUG static void check_irq_off(void) { - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); } static void check_irq_on(void) @@ -2476,26 +2595,43 @@ static void drain_array(struct kmem_cach struct array_cache *ac, int force, int node); -static void do_drain(void *arg) +static void __do_drain(void *arg, unsigned int cpu) { struct kmem_cache *cachep = arg; struct array_cache *ac; - int node = numa_mem_id(); + int node = cpu_to_mem(cpu); - check_irq_off(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get_on_cpu(cachep, cpu); spin_lock(&cachep->nodelists[node]->list_lock); free_block(cachep, ac->entry, ac->avail, node); spin_unlock(&cachep->nodelists[node]->list_lock); ac->avail = 0; } +#ifndef CONFIG_PREEMPT_RT_BASE +static void do_drain(void *arg) +{ + __do_drain(arg, smp_processor_id()); +} +#else +static void do_drain(void *arg, int cpu) +{ + LIST_HEAD(tmp); + + spin_lock_irq(&per_cpu(slab_lock, cpu).lock); + __do_drain(arg, cpu); + list_splice_init(&per_cpu(slab_free_list, cpu), &tmp); + spin_unlock_irq(&per_cpu(slab_lock, cpu).lock); + free_delayed(&tmp); +} +#endif + static void drain_cpu_caches(struct kmem_cache *cachep) { struct kmem_list3 *l3; int node; - on_each_cpu(do_drain, cachep, 1); + slab_on_each_cpu(do_drain, cachep); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; @@ -2526,10 +2662,10 @@ static int drain_freelist(struct kmem_ca nr_freed = 0; while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); p = l3->slabs_free.prev; if (p == &l3->slabs_free) { - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); goto out; } @@ -2543,8 +2679,8 @@ static int drain_freelist(struct kmem_ca * to the cache. */ l3->free_objects -= cache->num; - spin_unlock_irq(&l3->list_lock); - slab_destroy(cache, slabp); + local_spin_unlock_irq(slab_lock, &l3->list_lock); + slab_destroy(cache, slabp, false); nr_freed++; } out: @@ -2838,7 +2974,7 @@ static int cache_grow(struct kmem_cache offset *= cachep->colour_off; if (local_flags & __GFP_WAIT) - local_irq_enable(); + local_unlock_irq(slab_lock); /* * The test for missing atomic flag is performed here, rather than @@ -2868,7 +3004,7 @@ static int cache_grow(struct kmem_cache cache_init_objs(cachep, slabp); if (local_flags & __GFP_WAIT) - local_irq_disable(); + local_lock_irq(slab_lock); check_irq_off(); spin_lock(&l3->list_lock); @@ -2879,10 +3015,10 @@ static int cache_grow(struct kmem_cache spin_unlock(&l3->list_lock); return 1; opps1: - kmem_freepages(cachep, objp); + kmem_freepages(cachep, objp, false); failed: if (local_flags & __GFP_WAIT) - local_irq_disable(); + local_lock_irq(slab_lock); return 0; } @@ -3280,11 +3416,11 @@ retry: * set and go into memory reserves if necessary. 
*/ if (local_flags & __GFP_WAIT) - local_irq_enable(); + local_unlock_irq(slab_lock); kmem_flagcheck(cache, flags); obj = kmem_getpages(cache, local_flags, numa_mem_id()); if (local_flags & __GFP_WAIT) - local_irq_disable(); + local_lock_irq(slab_lock); if (obj) { /* * Insert into the appropriate per node queues @@ -3400,7 +3536,7 @@ __cache_alloc_node(struct kmem_cache *ca return NULL; cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); + local_lock_irqsave(slab_lock, save_flags); if (nodeid == -1) nodeid = slab_node; @@ -3425,7 +3561,7 @@ __cache_alloc_node(struct kmem_cache *ca /* ___cache_alloc_node can fall back to other nodes */ ptr = ____cache_alloc_node(cachep, flags, nodeid); out: - local_irq_restore(save_flags); + local_unlock_irqrestore(slab_lock, save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, flags); @@ -3485,9 +3621,9 @@ __cache_alloc(struct kmem_cache *cachep, return NULL; cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); + local_lock_irqsave(slab_lock, save_flags); objp = __do_cache_alloc(cachep, flags); - local_irq_restore(save_flags); + local_unlock_irqrestore(slab_lock, save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, flags); @@ -3535,7 +3671,7 @@ static void free_block(struct kmem_cache * a different cache, refer to comments before * alloc_slabmgmt. */ - slab_destroy(cachep, slabp); + slab_destroy(cachep, slabp, true); } else { list_add(&slabp->list, &l3->slabs_free); } @@ -3798,12 +3934,12 @@ void kmem_cache_free(struct kmem_cache * { unsigned long flags; - local_irq_save(flags); debug_check_no_locks_freed(objp, obj_size(cachep)); if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(objp, obj_size(cachep)); + local_lock_irqsave(slab_lock, flags); __cache_free(cachep, objp, __builtin_return_address(0)); - local_irq_restore(flags); + unlock_slab_and_free_delayed(flags); trace_kmem_cache_free(_RET_IP_, objp); } @@ -3827,13 +3963,13 @@ void kfree(const void *objp) if (unlikely(ZERO_OR_NULL_PTR(objp))) return; - local_irq_save(flags); kfree_debugcheck(objp); c = virt_to_cache(objp); debug_check_no_locks_freed(objp, obj_size(c)); debug_check_no_obj_freed(objp, obj_size(c)); + local_lock_irqsave(slab_lock, flags); __cache_free(c, (void *)objp, __builtin_return_address(0)); - local_irq_restore(flags); + unlock_slab_and_free_delayed(flags); } EXPORT_SYMBOL(kfree); @@ -3876,7 +4012,7 @@ static int alloc_kmemlist(struct kmem_ca if (l3) { struct array_cache *shared = l3->shared; - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); if (shared) free_block(cachep, shared->entry, @@ -3889,7 +4025,8 @@ static int alloc_kmemlist(struct kmem_ca } l3->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&l3->list_lock); + unlock_l3_and_free_delayed(&l3->list_lock); + kfree(shared); free_alien_cache(new_alien); continue; @@ -3936,17 +4073,30 @@ struct ccupdate_struct { struct array_cache *new[NR_CPUS]; }; -static void do_ccupdate_local(void *info) +static void __do_ccupdate_local(void *info, int cpu) { struct ccupdate_struct *new = info; struct array_cache *old; - check_irq_off(); - old = cpu_cache_get(new->cachep); + old = cpu_cache_get_on_cpu(new->cachep, cpu); - new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; - new->new[smp_processor_id()] = 
old; + new->cachep->array[cpu] = new->new[cpu]; + new->new[cpu] = old; +} + +#ifndef CONFIG_PREEMPT_RT_BASE +static void do_ccupdate_local(void *info) +{ + __do_ccupdate_local(info, smp_processor_id()); +} +#else +static void do_ccupdate_local(void *info, int cpu) +{ + spin_lock_irq(&per_cpu(slab_lock, cpu).lock); + __do_ccupdate_local(info, cpu); + spin_unlock_irq(&per_cpu(slab_lock, cpu).lock); } +#endif /* Always called with the cache_chain_mutex held */ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, @@ -3971,7 +4121,7 @@ static int do_tune_cpucache(struct kmem_ } new->cachep = cachep; - on_each_cpu(do_ccupdate_local, (void *)new, 1); + slab_on_each_cpu(do_ccupdate_local, (void *)new); check_irq_on(); cachep->batchcount = batchcount; @@ -3982,9 +4132,11 @@ static int do_tune_cpucache(struct kmem_ struct array_cache *ccold = new->new[i]; if (!ccold) continue; - spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); + local_spin_lock_irq(slab_lock, + &cachep->nodelists[cpu_to_mem(i)]->list_lock); free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); - spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); + + unlock_l3_and_free_delayed(&cachep->nodelists[cpu_to_mem(i)]->list_lock); kfree(ccold); } kfree(new); @@ -4060,7 +4212,7 @@ static void drain_array(struct kmem_cach if (ac->touched && !force) { ac->touched = 0; } else { - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); if (ac->avail) { tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) @@ -4070,7 +4222,7 @@ static void drain_array(struct kmem_cach memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); } } @@ -4209,7 +4361,7 @@ static int s_show(struct seq_file *m, vo continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); list_for_each_entry(slabp, &l3->slabs_full, list) { if (slabp->inuse != cachep->num && !error) @@ -4234,7 +4386,7 @@ static int s_show(struct seq_file *m, vo if (l3->shared) shared_avail += l3->shared->avail; - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); } num_slabs += active_slabs; num_objs = num_slabs * cachep->num; @@ -4463,13 +4615,13 @@ static int leaks_show(struct seq_file *m continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + local_spin_lock_irq(slab_lock, &l3->list_lock); list_for_each_entry(slabp, &l3->slabs_full, list) handle_slab(n, cachep, slabp); list_for_each_entry(slabp, &l3->slabs_partial, list) handle_slab(n, cachep, slabp); - spin_unlock_irq(&l3->list_lock); + local_spin_unlock_irq(slab_lock, &l3->list_lock); } name = cachep->name; if (n[0] == n[1]) { Index: linux-3.0/kernel/lockdep.c =================================================================== --- linux-3.0.orig/kernel/lockdep.c +++ linux-3.0/kernel/lockdep.c @@ -2859,10 +2859,7 @@ static int mark_lock(struct task_struct void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - int i; - - for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) - lock->class_cache[i] = NULL; + memset(lock, 0, sizeof(*lock)); #ifdef CONFIG_LOCK_STAT lock->cpu = raw_smp_processor_id(); @@ -3341,6 +3338,7 @@ static void check_flags(unsigned long fl } } +#ifndef CONFIG_PREEMPT_RT_FULL /* * We dont accurately track softirq state in e.g. 
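The mm/slab.c changes above never call the page allocator while the new slab_lock local lock is held: kmem_freepages() with delayed=true only parks the pages on the per-CPU slab_free_list (reusing page->lru and page->index), and the unlock_*_and_free_delayed() helpers splice that list onto the stack, drop the lock, and free afterwards. A compilable userspace sketch of that defer-then-free-after-unlock pattern; the pthread mutex, the thread-local list and the *_sketch names are stand-ins, not the kernel code:

/* Userspace sketch of the deferred-free pattern used by free_delayed()
 * and unlock_slab_and_free_delayed() above. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct deferred_page {
	struct deferred_page *next;
	void *addr;
	unsigned int order;
};

static pthread_mutex_t slab_lock_sketch = PTHREAD_MUTEX_INITIALIZER;
static __thread struct deferred_page *slab_free_list_sketch;

/* called with slab_lock_sketch held: only queue, never free here */
static void defer_free(void *addr, unsigned int order)
{
	struct deferred_page *p = malloc(sizeof(*p));

	p->addr = addr;
	p->order = order;
	p->next = slab_free_list_sketch;
	slab_free_list_sketch = p;
}

/* splice the per-thread list, drop the lock, then do the real freeing */
static void unlock_and_free_delayed(void)
{
	struct deferred_page *list = slab_free_list_sketch;

	slab_free_list_sketch = NULL;
	pthread_mutex_unlock(&slab_lock_sketch);

	while (list) {
		struct deferred_page *p = list;

		list = p->next;
		printf("freeing %p (order %u) outside the lock\n", p->addr, p->order);
		free(p->addr);
		free(p);
	}
}

int main(void)
{
	pthread_mutex_lock(&slab_lock_sketch);
	defer_free(malloc(64), 0);
	defer_free(malloc(128), 1);
	unlock_and_free_delayed();
	return 0;
}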
* hardirq contexts (such as on 4KSTACKS), so only @@ -3352,6 +3350,7 @@ static void check_flags(unsigned long fl else DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); } +#endif if (!debug_locks) print_irqtrace_events(current); Index: linux-3.0/arch/x86/kernel/apic/apic.c =================================================================== --- linux-3.0.orig/arch/x86/kernel/apic/apic.c +++ linux-3.0/arch/x86/kernel/apic/apic.c @@ -856,8 +856,8 @@ void __irq_entry smp_apic_timer_interrup * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ - exit_idle(); irq_enter(); + exit_idle(); local_apic_timer_interrupt(); irq_exit(); @@ -1790,8 +1790,8 @@ void smp_spurious_interrupt(struct pt_re { u32 v; - exit_idle(); irq_enter(); + exit_idle(); /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... @@ -1827,8 +1827,8 @@ void smp_error_interrupt(struct pt_regs "Illegal register address", /* APIC Error Bit 7 */ }; - exit_idle(); irq_enter(); + exit_idle(); /* First tickle the hardware, only then report what went on. -- REW */ v0 = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); Index: linux-3.0/arch/x86/kernel/apic/io_apic.c =================================================================== --- linux-3.0.orig/arch/x86/kernel/apic/io_apic.c +++ linux-3.0/arch/x86/kernel/apic/io_apic.c @@ -2275,8 +2275,8 @@ asmlinkage void smp_irq_move_cleanup_int unsigned vector, me; ack_APIC_irq(); - exit_idle(); irq_enter(); + exit_idle(); me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { @@ -2417,7 +2417,8 @@ static void ack_apic_level(struct irq_da irq_complete_move(cfg); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(irqd_is_setaffinity_pending(data))) { + if (unlikely(irqd_is_setaffinity_pending(data) && + !irqd_irq_inprogress(data))) { do_unmask_irq = 1; mask_ioapic(cfg); } Index: linux-3.0/arch/x86/kernel/cpu/mcheck/mce.c =================================================================== --- linux-3.0.orig/arch/x86/kernel/cpu/mcheck/mce.c +++ linux-3.0/arch/x86/kernel/cpu/mcheck/mce.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -470,8 +471,8 @@ static inline void mce_get_rip(struct mc asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) { ack_APIC_irq(); - exit_idle(); irq_enter(); + exit_idle(); mce_notify_irq(); mce_schedule_work(); irq_exit(); @@ -1139,17 +1140,14 @@ void mce_log_therm_throt_event(__u64 sta * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). 
*/ -static int check_interval = 5 * 60; /* 5 minutes */ +static unsigned long check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ -static DEFINE_PER_CPU(struct timer_list, mce_timer); +static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ +static DEFINE_PER_CPU(struct hrtimer, mce_timer); -static void mce_start_timer(unsigned long data) +static enum hrtimer_restart mce_start_timer(struct hrtimer *timer) { - struct timer_list *t = &per_cpu(mce_timer, data); - int *n; - - WARN_ON(smp_processor_id() != data); + unsigned long *n; if (mce_available(__this_cpu_ptr(&cpu_info))) { machine_check_poll(MCP_TIMESTAMP, @@ -1162,12 +1160,13 @@ static void mce_start_timer(unsigned lon */ n = &__get_cpu_var(mce_next_interval); if (mce_notify_irq()) - *n = max(*n/2, HZ/100); + *n = max(*n/2, HZ/100UL); else - *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); + *n = min(*n*2, round_jiffies_relative(check_interval*HZ)); - t->expires = jiffies + *n; - add_timer_on(t, smp_processor_id()); + hrtimer_forward(timer, timer->base->get_time(), + ns_to_ktime(jiffies_to_usecs(*n) * 1000)); + return HRTIMER_RESTART; } static void mce_do_trigger(struct work_struct *work) @@ -1393,10 +1392,11 @@ static void __mcheck_cpu_init_vendor(str static void __mcheck_cpu_init_timer(void) { - struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(mce_next_interval); + struct hrtimer *t = &__get_cpu_var(mce_timer); + unsigned long *n = &__get_cpu_var(mce_next_interval); - setup_timer(t, mce_start_timer, smp_processor_id()); + hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + t->function = mce_start_timer; if (mce_ignore_ce) return; @@ -1404,8 +1404,9 @@ static void __mcheck_cpu_init_timer(void *n = check_interval * HZ; if (!*n) return; - t->expires = round_jiffies(jiffies + *n); - add_timer_on(t, smp_processor_id()); + + hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(*n) * 1000), + 0 , HRTIMER_MODE_REL_PINNED); } /* Handle unconfigured int18 (should never happen) */ @@ -1768,7 +1769,7 @@ static struct syscore_ops mce_syscore_op static void mce_cpu_restart(void *data) { - del_timer_sync(&__get_cpu_var(mce_timer)); + hrtimer_cancel(&__get_cpu_var(mce_timer)); if (!mce_available(__this_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); @@ -1787,7 +1788,7 @@ static void mce_disable_ce(void *all) if (!mce_available(__this_cpu_ptr(&cpu_info))) return; if (all) - del_timer_sync(&__get_cpu_var(mce_timer)); + hrtimer_cancel(&__get_cpu_var(mce_timer)); cmci_clear(); } @@ -2016,6 +2017,8 @@ static void __cpuinit mce_disable_cpu(vo if (!mce_available(__this_cpu_ptr(&cpu_info))) return; + hrtimer_cancel(&__get_cpu_var(mce_timer)); + if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < banks; i++) { @@ -2042,6 +2045,7 @@ static void __cpuinit mce_reenable_cpu(v if (b->init) wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); } + __mcheck_cpu_init_timer(); } /* Get notified when a cpu comes on/off. Be hotplug friendly. 
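The MCE polling rework above keeps the old adaptive interval logic, just computed for an hrtimer: when the poll (or mce_notify_irq()) finds something the period is halved, bounded below by HZ/100, and on a clean poll it is doubled, bounded above by check_interval seconds. A small, compilable sketch of only that arithmetic; the HZ and check_interval values are illustrative and round_jiffies_relative() is omitted:

/* Sketch of the adaptive poll interval used by mce_start_timer() above. */
#include <stdio.h>

#define HZ 1000UL
static const unsigned long check_interval = 5 * 60;	/* seconds */

static unsigned long next_interval(unsigned long cur, int found_event)
{
	if (found_event)	/* poll 2x faster, floor of 10ms */
		return cur / 2 > HZ / 100 ? cur / 2 : HZ / 100;
	/* clean poll: back off 2x, capped at check_interval seconds */
	return cur * 2 < check_interval * HZ ? cur * 2 : check_interval * HZ;
}

int main(void)
{
	unsigned long n = check_interval * HZ;

	n = next_interval(n, 1);
	printf("after event:      %lu jiffies\n", n);
	n = next_interval(n, 0);
	printf("after clean poll: %lu jiffies\n", n);
	return 0;
}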
*/ @@ -2049,7 +2053,6 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct timer_list *t = &per_cpu(mce_timer, cpu); switch (action) { case CPU_ONLINE: @@ -2066,16 +2069,10 @@ mce_cpu_callback(struct notifier_block * break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - del_timer_sync(t); smp_call_function_single(cpu, mce_disable_cpu, &action, 1); break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - if (!mce_ignore_ce && check_interval) { - t->expires = round_jiffies(jiffies + - __get_cpu_var(mce_next_interval)); - add_timer_on(t, cpu); - } smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; case CPU_POST_DEAD: Index: linux-3.0/arch/x86/kernel/cpu/mcheck/therm_throt.c =================================================================== --- linux-3.0.orig/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ linux-3.0/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -396,8 +396,8 @@ static void (*smp_thermal_vector)(void) asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) { - exit_idle(); irq_enter(); + exit_idle(); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); irq_exit(); Index: linux-3.0/arch/x86/kernel/cpu/mcheck/threshold.c =================================================================== --- linux-3.0.orig/arch/x86/kernel/cpu/mcheck/threshold.c +++ linux-3.0/arch/x86/kernel/cpu/mcheck/threshold.c @@ -19,8 +19,8 @@ void (*mce_threshold_vector)(void) = def asmlinkage void smp_threshold_interrupt(void) { - exit_idle(); irq_enter(); + exit_idle(); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); irq_exit(); Index: linux-3.0/arch/x86/kernel/irq.c =================================================================== --- linux-3.0.orig/arch/x86/kernel/irq.c +++ linux-3.0/arch/x86/kernel/irq.c @@ -180,8 +180,8 @@ unsigned int __irq_entry do_IRQ(struct p unsigned vector = ~regs->orig_ax; unsigned irq; - exit_idle(); irq_enter(); + exit_idle(); irq = __this_cpu_read(vector_irq[vector]); @@ -208,10 +208,10 @@ void smp_x86_platform_ipi(struct pt_regs ack_APIC_irq(); - exit_idle(); - irq_enter(); + exit_idle(); + inc_irq_stat(x86_platform_ipis); if (x86_platform_ipi_callback) Index: linux-3.0/kernel/taskstats.c =================================================================== --- linux-3.0.orig/kernel/taskstats.c +++ linux-3.0/kernel/taskstats.c @@ -657,6 +657,7 @@ static struct genl_ops taskstats_ops = { .cmd = TASKSTATS_CMD_GET, .doit = taskstats_user_cmd, .policy = taskstats_cmd_get_policy, + .flags = GENL_ADMIN_PERM, }; static struct genl_ops cgroupstats_ops = { Index: linux-3.0/kernel/trace/ftrace.c =================================================================== --- linux-3.0.orig/kernel/trace/ftrace.c +++ linux-3.0/kernel/trace/ftrace.c @@ -1182,8 +1182,14 @@ alloc_and_copy_ftrace_hash(int size_bits return NULL; } +static void +ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); +static void +ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); + static int -ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) +ftrace_hash_move(struct ftrace_ops *ops, int enable, + struct ftrace_hash **dst, struct ftrace_hash *src) { struct ftrace_func_entry *entry; struct hlist_node *tp, *tn; @@ -1193,9 +1199,16 @@ ftrace_hash_move(struct ftrace_hash **ds unsigned long key; int size = src->count; int bits = 0; + int ret; int i; /* + * Remove the current set, update the hash and add + * them back. 
+ */ + ftrace_hash_rec_disable(ops, enable); + + /* * If the new source is empty, just free dst and assign it * the empty_hash. */ @@ -1215,9 +1228,10 @@ ftrace_hash_move(struct ftrace_hash **ds if (bits > FTRACE_HASH_MAX_BITS) bits = FTRACE_HASH_MAX_BITS; + ret = -ENOMEM; new_hash = alloc_ftrace_hash(bits); if (!new_hash) - return -ENOMEM; + goto out; size = 1 << src->size_bits; for (i = 0; i < size; i++) { @@ -1236,7 +1250,16 @@ ftrace_hash_move(struct ftrace_hash **ds rcu_assign_pointer(*dst, new_hash); free_ftrace_hash_rcu(old_hash); - return 0; + ret = 0; + out: + /* + * Enable regardless of ret: + * On success, we enable the new hash. + * On failure, we re-enable the original hash. + */ + ftrace_hash_rec_enable(ops, enable); + + return ret; } /* @@ -2877,7 +2900,7 @@ ftrace_set_regex(struct ftrace_ops *ops, ftrace_match_records(hash, buf, len); mutex_lock(&ftrace_lock); - ret = ftrace_hash_move(orig_hash, hash); + ret = ftrace_hash_move(ops, enable, orig_hash, hash); mutex_unlock(&ftrace_lock); mutex_unlock(&ftrace_regex_lock); @@ -3060,18 +3083,12 @@ ftrace_regex_release(struct inode *inode orig_hash = &iter->ops->notrace_hash; mutex_lock(&ftrace_lock); - /* - * Remove the current set, update the hash and add - * them back. - */ - ftrace_hash_rec_disable(iter->ops, filter_hash); - ret = ftrace_hash_move(orig_hash, iter->hash); - if (!ret) { - ftrace_hash_rec_enable(iter->ops, filter_hash); - if (iter->ops->flags & FTRACE_OPS_FL_ENABLED - && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); - } + ret = ftrace_hash_move(iter->ops, filter_hash, + orig_hash, iter->hash); + if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) + && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftrace_lock); } free_ftrace_hash(iter->hash); Index: linux-3.0/drivers/gpu/drm/drm_irq.c =================================================================== --- linux-3.0.orig/drivers/gpu/drm/drm_irq.c +++ linux-3.0/drivers/gpu/drm/drm_irq.c @@ -109,10 +109,7 @@ static void vblank_disable_and_save(stru /* Prevent vblank irq processing while disabling vblank irqs, * so no updates of timestamps or count can happen after we've * disabled. Needed to prevent races in case of delayed irq's. - * Disable preemption, so vblank_time_lock is held as short as - * possible, even under a kernel with PREEMPT_RT patches. */ - preempt_disable(); spin_lock_irqsave(&dev->vblank_time_lock, irqflags); dev->driver->disable_vblank(dev, crtc); @@ -163,7 +160,6 @@ static void vblank_disable_and_save(stru clear_vblank_timestamps(dev, crtc); spin_unlock_irqrestore(&dev->vblank_time_lock, irqflags); - preempt_enable(); } static void vblank_disable_fn(unsigned long arg) @@ -875,10 +871,6 @@ int drm_vblank_get(struct drm_device *de spin_lock_irqsave(&dev->vbl_lock, irqflags); /* Going from 0->1 means we have to enable interrupts again */ if (atomic_add_return(1, &dev->vblank_refcount[crtc]) == 1) { - /* Disable preemption while holding vblank_time_lock. Do - * it explicitely to guard against PREEMPT_RT kernel. - */ - preempt_disable(); spin_lock_irqsave(&dev->vblank_time_lock, irqflags2); if (!dev->vblank_enabled[crtc]) { /* Enable vblank irqs under vblank_time_lock protection. 
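The ftrace_hash_move() change above folds the record accounting into the move itself: recording for the old hash is disabled up front, the new hash is built and swapped in, and recording is re-enabled on both the success and the failure path, so a failed allocation simply leaves the previous hash active. A compilable sketch of that disable/swap/re-enable-regardless pattern, using a strdup()'d string as a stand-in for the hash (every name here is illustrative):

/* Sketch of the "quiesce, swap, resume regardless of result" pattern. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct hash_sketch { char *data; };

static void rec_disable(void) { puts("recording disabled"); }
static void rec_enable(void)  { puts("recording enabled"); }

static int hash_move_sketch(struct hash_sketch *dst, const char *src)
{
	char *copy;
	int ret = -ENOMEM;

	rec_disable();			/* drop the current set of records */

	copy = strdup(src);		/* build the replacement hash */
	if (!copy)
		goto out;

	free(dst->data);		/* publish the new hash */
	dst->data = copy;
	ret = 0;
out:
	/* enable regardless of ret: new hash on success, old hash on failure */
	rec_enable();
	return ret;
}

int main(void)
{
	struct hash_sketch h = { .data = strdup("old") };

	if (!hash_move_sketch(&h, "new"))
		printf("active hash: %s\n", h.data);
	free(h.data);
	return 0;
}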
@@ -898,7 +890,6 @@ int drm_vblank_get(struct drm_device *de } } spin_unlock_irqrestore(&dev->vblank_time_lock, irqflags2); - preempt_enable(); } else { if (!dev->vblank_enabled[crtc]) { atomic_dec(&dev->vblank_refcount[crtc]); Index: linux-3.0/arch/x86/kernel/kprobes.c =================================================================== --- linux-3.0.orig/arch/x86/kernel/kprobes.c +++ linux-3.0/arch/x86/kernel/kprobes.c @@ -477,7 +477,6 @@ static void __kprobes setup_singlestep(s * stepping. */ regs->ip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); return; } #endif Index: linux-3.0/drivers/ide/ide_platform.c =================================================================== --- linux-3.0.orig/drivers/ide/ide_platform.c +++ linux-3.0/drivers/ide/ide_platform.c @@ -95,7 +95,7 @@ static int __devinit plat_ide_probe(stru plat_ide_setup_ports(&hw, base, alt_base, pdata, res_irq->start); hw.dev = &pdev->dev; - d.irq_flags = res_irq->flags; + d.irq_flags = 0; if (mmio) d.host_flags |= IDE_HFLAG_MMIO; Index: linux-3.0/arch/x86/kernel/hpet.c =================================================================== --- linux-3.0.orig/arch/x86/kernel/hpet.c +++ linux-3.0/arch/x86/kernel/hpet.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -566,6 +567,30 @@ static void init_one_hpet_msi_clockevent #define RESERVE_TIMERS 0 #endif +static int __init dmi_disable_hpet_msi(const struct dmi_system_id *d) +{ + hpet_msi_disable = 1; + return 0; +} + +static struct dmi_system_id __initdata dmi_hpet_table[] = { + /* + * MSI based per cpu timers lose interrupts when intel_idle() + * is enabled - independent of the c-state. With idle=poll the + * problem cannot be observed. We have no idea yet, whether + * this is a W510 specific issue or a general chipset oddity. + */ + { + .callback = dmi_disable_hpet_msi, + .ident = "Lenovo W510", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad W510"), + }, + }, + {} +}; + static void hpet_msi_capability_lookup(unsigned int start_timer) { unsigned int id; @@ -573,6 +598,8 @@ static void hpet_msi_capability_lookup(u unsigned int num_timers_used = 0; int i; + dmi_check_system(dmi_hpet_table); + if (hpet_msi_disable) return; Index: linux-3.0/block/blk-core.c =================================================================== --- linux-3.0.orig/block/blk-core.c +++ linux-3.0/block/blk-core.c @@ -236,7 +236,7 @@ EXPORT_SYMBOL(blk_delay_queue); **/ void blk_start_queue(struct request_queue *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); queue_flag_clear(QUEUE_FLAG_STOPPED, q); __blk_run_queue(q); @@ -301,7 +301,11 @@ void __blk_run_queue(struct request_queu { if (unlikely(blk_queue_stopped(q))) return; - + /* + * q->request_fn() can drop q->queue_lock and reenable + * interrupts, but must return with q->queue_lock held and + * interrupts disabled. + */ q->request_fn(q); } EXPORT_SYMBOL(__blk_run_queue); @@ -2669,11 +2673,11 @@ static void queue_unplugged(struct reque * this lock). 
*/ if (from_schedule) { - spin_unlock(q->queue_lock); + spin_unlock_irq(q->queue_lock); blk_run_queue_async(q); } else { __blk_run_queue(q); - spin_unlock(q->queue_lock); + spin_unlock_irq(q->queue_lock); } } @@ -2699,7 +2703,6 @@ static void flush_plug_callbacks(struct void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request_queue *q; - unsigned long flags; struct request *rq; LIST_HEAD(list); unsigned int depth; @@ -2720,11 +2723,6 @@ void blk_flush_plug_list(struct blk_plug q = NULL; depth = 0; - /* - * Save and disable interrupts here, to avoid doing it for every - * queue lock we have to take. - */ - local_irq_save(flags); while (!list_empty(&list)) { rq = list_entry_rq(list.next); list_del_init(&rq->queuelist); @@ -2737,7 +2735,7 @@ void blk_flush_plug_list(struct blk_plug queue_unplugged(q, depth, from_schedule); q = rq->q; depth = 0; - spin_lock(q->queue_lock); + spin_lock_irq(q->queue_lock); } /* * rq is already accounted, so use raw insert @@ -2755,8 +2753,6 @@ void blk_flush_plug_list(struct blk_plug */ if (q) queue_unplugged(q, depth, from_schedule); - - local_irq_restore(flags); } void blk_finish_plug(struct blk_plug *plug) Index: linux-3.0/kernel/sched.c =================================================================== --- linux-3.0.orig/kernel/sched.c +++ linux-3.0/kernel/sched.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include @@ -185,6 +186,7 @@ void init_rt_bandwidth(struct rt_bandwid hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.irqsafe = 1; rt_b->rt_period_timer.function = sched_rt_period_timer; } @@ -800,7 +802,11 @@ late_initcall(sched_init_debug); * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ +#ifndef CONFIG_PREEMPT_RT_FULL const_debug unsigned int sysctl_sched_nr_migrate = 32; +#else +const_debug unsigned int sysctl_sched_nr_migrate = 8; +#endif /* * period over which we average the RT time consumption, measured @@ -1136,6 +1142,7 @@ static void init_rq_hrtick(struct rq *rq hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.irqsafe = 1; } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@ -2378,11 +2385,11 @@ static int select_fallback_rq(int cpu, s /* Look for allowed, online CPU in same node. */ for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); + dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); if (dest_cpu < nr_cpu_ids) return dest_cpu; @@ -2397,7 +2404,12 @@ static int select_fallback_rq(int cpu, s printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", task_pid_nr(p), p->comm, cpu); } - + /* + * Clear PF_THREAD_BOUND, otherwise we wreckage + * migrate_disable/enable. See optimization for + * PF_THREAD_BOUND tasks there. 
+ */ + p->flags &= ~PF_THREAD_BOUND; return dest_cpu; } @@ -2419,7 +2431,7 @@ int select_task_rq(struct task_struct *p * [ this allows ->select_task() to simply return task_cpu(p) and * not worry about this generic constraint ] */ - if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || + if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || !cpu_online(cpu))) cpu = select_fallback_rq(task_cpu(p), p); @@ -2477,10 +2489,6 @@ static void ttwu_activate(struct rq *rq, { activate_task(rq, p, en_flags); p->on_rq = 1; - - /* if a worker is waking up, notify workqueue */ - if (p->flags & PF_WQ_WORKER) - wq_worker_waking_up(p, cpu_of(rq)); } /* @@ -2678,8 +2686,25 @@ try_to_wake_up(struct task_struct *p, un smp_wmb(); raw_spin_lock_irqsave(&p->pi_lock, flags); - if (!(p->state & state)) + if (!(p->state & state)) { + /* + * The task might be running due to a spinlock sleeper + * wakeup. Check the saved state and set it to running + * if the wakeup condition is true. + */ + if (!(wake_flags & WF_LOCK_SLEEPER)) { + if (p->saved_state & state) + p->saved_state = TASK_RUNNING; + } goto out; + } + + /* + * If this is a regular wakeup, then we can unconditionally + * clear the saved state of a "lock sleeper". + */ + if (!(wake_flags & WF_LOCK_SLEEPER)) + p->saved_state = TASK_RUNNING; success = 1; /* we're going to change ->state */ cpu = task_cpu(p); @@ -2735,40 +2760,6 @@ out: } /** - * try_to_wake_up_local - try to wake up a local task with rq lock held - * @p: the thread to be awakened - * - * Put @p on the run-queue if it's not already there. The caller must - * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. - */ -static void try_to_wake_up_local(struct task_struct *p) -{ - struct rq *rq = task_rq(p); - - BUG_ON(rq != this_rq()); - BUG_ON(p == current); - lockdep_assert_held(&rq->lock); - - if (!raw_spin_trylock(&p->pi_lock)) { - raw_spin_unlock(&rq->lock); - raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); - } - - if (!(p->state & TASK_NORMAL)) - goto out; - - if (!p->on_rq) - ttwu_activate(rq, p, ENQUEUE_WAKEUP); - - ttwu_do_wakeup(rq, p, 0); - ttwu_stat(p, smp_processor_id(), 0); -out: - raw_spin_unlock(&p->pi_lock); -} - -/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -2785,6 +2776,18 @@ int wake_up_process(struct task_struct * } EXPORT_SYMBOL(wake_up_process); +/** + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock" + * @p: The process to be woken up. + * + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate + * the nature of the wakeup. + */ +int wake_up_lock_sleeper(struct task_struct *p) +{ + return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER); +} + int wake_up_state(struct task_struct *p, unsigned int state) { return try_to_wake_up(p, state, 0); @@ -3060,8 +3063,12 @@ static void finish_task_switch(struct rq finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); + /* + * We use mmdrop_delayed() here so we don't have to do the + * full __mmdrop() when we are the last user. 
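The try_to_wake_up() change above relies on the RT "sleeping spinlock" slowpath stashing the task's real sleep state in p->saved_state: an ordinary wakeup that arrives while the task is blocked on such a lock only marks saved_state as TASK_RUNNING, while a WF_LOCK_SLEEPER wakeup leaves saved_state untouched so the lock code can restore it. A compilable userspace sketch of just that state bookkeeping; the state values and *_sketch names are illustrative simplifications:

/* Sketch of the saved_state handling added to try_to_wake_up() above. */
#include <stdio.h>

#define TASK_RUNNING		0
#define TASK_INTERRUPTIBLE	1
#define TASK_UNINTERRUPTIBLE	2
#define TASK_ALL		(TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

struct task_sketch {
	int state;		/* what try_to_wake_up() sees */
	int saved_state;	/* stashed by the sleeping-lock slowpath */
};

static int try_to_wake_up_sketch(struct task_sketch *p, int state, int lock_sleeper)
{
	if (!(p->state & state)) {
		/* regular wakeup racing with a lock sleep: record it for later */
		if (!lock_sleeper && (p->saved_state & state))
			p->saved_state = TASK_RUNNING;
		return 0;
	}
	/* a regular wakeup unconditionally clears the saved state */
	if (!lock_sleeper)
		p->saved_state = TASK_RUNNING;
	p->state = TASK_RUNNING;
	return 1;
}

int main(void)
{
	/* slept interruptibly, then blocked on a sleeping spinlock */
	struct task_sketch t = {
		.state = TASK_UNINTERRUPTIBLE,
		.saved_state = TASK_INTERRUPTIBLE,
	};

	/* signal-style wakeup arrives first: only saved_state is cleared */
	printf("signal wakeup woke task: %d\n",
	       try_to_wake_up_sketch(&t, TASK_INTERRUPTIBLE, 0));
	/* the lock release then wakes the sleeper for real */
	printf("lock wakeup woke task:   %d\n",
	       try_to_wake_up_sketch(&t, TASK_ALL, 1));
	printf("final state=%d saved_state=%d\n", t.state, t.saved_state);
	return 0;
}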
+ */ if (mm) - mmdrop(mm); + mmdrop_delayed(mm); if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this @@ -4182,6 +4189,126 @@ static inline void schedule_debug(struct schedstat_inc(this_rq(), sched_count); } +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP) +#define MIGRATE_DISABLE_SET_AFFIN (1<<30) /* Can't make a negative */ +#define migrate_disabled_updated(p) ((p)->migrate_disable & MIGRATE_DISABLE_SET_AFFIN) +#define migrate_disable_count(p) ((p)->migrate_disable & ~MIGRATE_DISABLE_SET_AFFIN) + +static inline void update_migrate_disable(struct task_struct *p) +{ + const struct cpumask *mask; + + if (likely(!p->migrate_disable)) + return; + + /* Did we already update affinity? */ + if (unlikely(migrate_disabled_updated(p))) + return; + + /* + * Since this is always current we can get away with only locking + * rq->lock, the ->cpus_allowed value can normally only be changed + * while holding both p->pi_lock and rq->lock, but seeing that this + * is current, we cannot actually be waking up, so all code that + * relies on serialization against p->pi_lock is out of scope. + * + * Having rq->lock serializes us against things like + * set_cpus_allowed_ptr() that can still happen concurrently. + */ + mask = tsk_cpus_allowed(p); + + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->rt.nr_cpus_allowed = cpumask_weight(mask); + + /* Let migrate_enable know to fix things back up */ + p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN; +} + +void migrate_disable(void) +{ + struct task_struct *p = current; + + if (in_atomic() || p->flags & PF_THREAD_BOUND) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic++; +#endif + return; + } + +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(p->migrate_disable_atomic); +#endif + + preempt_disable(); + if (p->migrate_disable) { + p->migrate_disable++; + preempt_enable(); + return; + } + + pin_current_cpu(); + p->migrate_disable = 1; + preempt_enable(); +} +EXPORT_SYMBOL_GPL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + const struct cpumask *mask; + unsigned long flags; + struct rq *rq; + + if (in_atomic() || p->flags & PF_THREAD_BOUND) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic--; +#endif + return; + } + +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(p->migrate_disable_atomic); +#endif + WARN_ON_ONCE(p->migrate_disable <= 0); + + preempt_disable(); + if (migrate_disable_count(p) > 1) { + p->migrate_disable--; + preempt_enable(); + return; + } + + if (unlikely(migrate_disabled_updated(p))) { + /* + * Undo whatever update_migrate_disable() did, also see there + * about locking. + */ + rq = this_rq(); + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * Clearing migrate_disable causes tsk_cpus_allowed to + * show the tasks original cpu affinity. 
+ */ + p->migrate_disable = 0; + mask = tsk_cpus_allowed(p); + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->rt.nr_cpus_allowed = cpumask_weight(mask); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } else + p->migrate_disable = 0; + + unpin_current_cpu(); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(migrate_enable); +#else +static inline void update_migrate_disable(struct task_struct *p) { } +#define migrate_disabled_updated(p) 0 +#endif + static void put_prev_task(struct rq *rq, struct task_struct *prev) { if (prev->on_rq || rq->skip_clock_update < 0) @@ -4241,6 +4368,8 @@ need_resched: raw_spin_lock_irq(&rq->lock); + update_migrate_disable(prev); + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) { @@ -4248,19 +4377,6 @@ need_resched: } else { deactivate_task(rq, prev, DEQUEUE_SLEEP); prev->on_rq = 0; - - /* - * If a worker went to sleep, notify and ask workqueue - * whether it wants to wake up a task to maintain - * concurrency. - */ - if (prev->flags & PF_WQ_WORKER) { - struct task_struct *to_wakeup; - - to_wakeup = wq_worker_sleeping(prev, cpu); - if (to_wakeup) - try_to_wake_up_local(to_wakeup); - } } switch_count = &prev->nvcsw; } @@ -4294,15 +4410,23 @@ need_resched: post_schedule(rq); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); if (need_resched()) goto need_resched; } static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state) + if (!tsk->state || tsk_is_pi_blocked(tsk)) return; + + /* + * If a worker went to sleep, notify and ask workqueue whether + * it wants to wake up a task to maintain concurrency. + */ + if (tsk->flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. @@ -4311,15 +4435,37 @@ static inline void sched_submit_work(str blk_schedule_flush_plug(tsk); } +static inline void sched_update_worker(struct task_struct *tsk) +{ + if (tsk_is_pi_blocked(tsk)) + return; + + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); +} + asmlinkage void __sched schedule(void) { struct task_struct *tsk = current; sched_submit_work(tsk); __schedule(); + sched_update_worker(tsk); } EXPORT_SYMBOL(schedule); +/** + * schedule_preempt_disabled - called with preemption disabled + * + * Returns with preemption disabled. Note: preempt_count must be 1 + */ +void __sched schedule_preempt_disabled(void) +{ + __preempt_enable_no_resched(); + schedule(); + preempt_disable(); +} + #ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline bool owner_running(struct mutex *lock, struct task_struct *owner) @@ -4391,7 +4537,16 @@ asmlinkage void __sched notrace preempt_ do { add_preempt_count_notrace(PREEMPT_ACTIVE); + /* + * The add/subtract must not be traced by the function + * tracer. But we still want to account for the + * preempt off latency tracer. Since the _notrace versions + * of add/subtract skip the accounting for latency tracer + * we must force it manually. 
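migrate_disable()/migrate_enable() added above are reference counted per task: only the outermost disable pins the current CPU, and only the matching outermost enable unpins it (possibly after undoing the affinity narrowing done lazily by update_migrate_disable()). A compilable userspace sketch of the nesting behaviour alone; pin/unpin are illustrative stand-ins for pin_current_cpu()/unpin_current_cpu() and the affinity handling is omitted:

/* Sketch of the migrate_disable()/migrate_enable() nesting semantics. */
#include <stdio.h>

static __thread int migrate_disable_count;

static void pin_current_cpu_sketch(void)   { puts("cpu pinned"); }
static void unpin_current_cpu_sketch(void) { puts("cpu unpinned"); }

static void migrate_disable_sketch(void)
{
	if (migrate_disable_count++)
		return;			/* already pinned by an outer caller */
	pin_current_cpu_sketch();
}

static void migrate_enable_sketch(void)
{
	if (--migrate_disable_count)
		return;			/* still inside an outer section */
	unpin_current_cpu_sketch();
}

int main(void)
{
	migrate_disable_sketch();	/* pins */
	migrate_disable_sketch();	/* nested: no-op */
	migrate_enable_sketch();	/* still nested: no-op */
	migrate_enable_sketch();	/* unpins */
	return 0;
}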
+ */ + start_critical_timings(); __schedule(); + stop_critical_timings(); sub_preempt_count_notrace(PREEMPT_ACTIVE); /* @@ -4814,9 +4969,8 @@ long __sched sleep_on_timeout(wait_queue EXPORT_SYMBOL(sleep_on_timeout); #ifdef CONFIG_RT_MUTEXES - /* - * rt_mutex_setprio - set the current priority of a task + * task_setprio - set the current priority of a task * @p: task * @prio: prio value (kernel-internal form) * @@ -4825,7 +4979,7 @@ EXPORT_SYMBOL(sleep_on_timeout); * * Used by the rt_mutex code to implement priority inheritance logic. */ -void rt_mutex_setprio(struct task_struct *p, int prio) +void task_setprio(struct task_struct *p, int prio) { int oldprio, on_rq, running; struct rq *rq; @@ -4835,6 +4989,24 @@ void rt_mutex_setprio(struct task_struct rq = __task_rq_lock(p); + /* + * Idle task boosting is a nono in general. There is one + * exception, when PREEMPT_RT and NOHZ is active: + * + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. + */ + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; @@ -4858,9 +5030,9 @@ void rt_mutex_setprio(struct task_struct enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); check_class_changed(rq, p, prev_class, oldprio); +out_unlock: __task_rq_unlock(rq); } - #endif void set_user_nice(struct task_struct *p, long nice) @@ -4995,7 +5167,13 @@ EXPORT_SYMBOL(task_nice); */ int idle_cpu(int cpu) { - return cpu_curr(cpu) == cpu_rq(cpu)->idle; + struct rq *rq = cpu_rq(cpu); + +#ifdef CONFIG_SMP + return rq->curr == rq->idle && !rq->nr_running && !rq->wake_list; +#else + return rq->curr == rq->idle && !rq->nr_running; +#endif } /** @@ -5529,7 +5707,7 @@ SYSCALL_DEFINE0(sched_yield) __release(rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); do_raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); schedule(); @@ -5543,9 +5721,17 @@ static inline int should_resched(void) static void __cond_resched(void) { - add_preempt_count(PREEMPT_ACTIVE); - __schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + do { + add_preempt_count(PREEMPT_ACTIVE); + __schedule(); + sub_preempt_count(PREEMPT_ACTIVE); + /* + * Check again in case we missed a preemption + * opportunity between schedule and now. + */ + barrier(); + + } while (need_resched()); } int __sched _cond_resched(void) @@ -5586,6 +5772,7 @@ int __cond_resched_lock(spinlock_t *lock } EXPORT_SYMBOL(__cond_resched_lock); +#ifndef CONFIG_PREEMPT_RT_FULL int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); @@ -5599,6 +5786,7 @@ int __sched __cond_resched_softirq(void) return 0; } EXPORT_SYMBOL(__cond_resched_softirq); +#endif /** * yield - yield the current processor to other threads. 
@@ -5845,7 +6033,7 @@ void show_state_filter(unsigned long sta printk(KERN_INFO " task PC stack pid father\n"); #endif - read_lock(&tasklist_lock); + rcu_read_lock(); do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow @@ -5861,7 +6049,7 @@ void show_state_filter(unsigned long sta #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); #endif - read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * Only show locks if all tasks are dumped: */ @@ -5922,6 +6110,9 @@ void __cpuinit init_idle(struct task_str */ idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); +#if defined(CONFIG_SMP) + sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); +#endif } /* @@ -5983,12 +6174,12 @@ static inline void sched_init_granularit #ifdef CONFIG_SMP void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (p->sched_class && p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - else { - cpumask_copy(&p->cpus_allowed, new_mask); + if (!migrate_disabled_updated(p)) { + if (p->sched_class && p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, new_mask); p->rt.nr_cpus_allowed = cpumask_weight(new_mask); } + cpumask_copy(&p->cpus_allowed, new_mask); } /* @@ -6039,7 +6230,7 @@ int set_cpus_allowed_ptr(struct task_str do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) + if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p)) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); @@ -6086,7 +6277,7 @@ static int __migrate_task(struct task_st if (task_cpu(p) != src_cpu) goto done; /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) goto fail; /* @@ -6128,6 +6319,8 @@ static int migration_cpu_stop(void *data #ifdef CONFIG_HOTPLUG_CPU +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm); + /* * Ensures that the idle task is using init_mm right before its cpu goes * offline. @@ -6140,7 +6333,12 @@ void idle_task_exit(void) if (mm != &init_mm) switch_mm(mm, &init_mm, current); - mmdrop(mm); + + /* + * Defer the cleanup to an alive cpu. On RT we can neither + * call mmdrop() nor mmdrop_delayed() from here. 
+ */ + per_cpu(idle_last_mm, smp_processor_id()) = mm; } /* @@ -6458,6 +6656,12 @@ migration_call(struct notifier_block *nf migrate_nr_uninterruptible(rq); calc_global_load_remove(rq); break; + case CPU_DEAD: + if (per_cpu(idle_last_mm, cpu)) { + mmdrop(per_cpu(idle_last_mm, cpu)); + per_cpu(idle_last_mm, cpu) = NULL; + } + break; #endif } @@ -8175,7 +8379,8 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + int nested = (preempt_count() & ~PREEMPT_ACTIVE) + + sched_rcu_preempt_depth(); return (nested == preempt_offset); } Index: linux-3.0/kernel/workqueue.c =================================================================== --- linux-3.0.orig/kernel/workqueue.c +++ linux-3.0/kernel/workqueue.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "workqueue_sched.h" @@ -57,20 +58,10 @@ enum { WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ - WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ - WORKER_REBIND = 1 << 5, /* mom is home, come back */ - WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ - WORKER_UNBOUND = 1 << 7, /* worker is unbound */ - - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | - WORKER_CPU_INTENSIVE | WORKER_UNBOUND, - - /* gcwq->trustee_state */ - TRUSTEE_START = 0, /* start */ - TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ - TRUSTEE_BUTCHER = 2, /* butcher workers */ - TRUSTEE_RELEASE = 3, /* release workers */ - TRUSTEE_DONE = 4, /* trustee is done */ + WORKER_CPU_INTENSIVE = 1 << 4, /* cpu intensive */ + WORKER_UNBOUND = 1 << 5, /* worker is unbound */ + + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE | WORKER_UNBOUND, BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, @@ -84,7 +75,6 @@ enum { (min two ticks) */ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ - TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ /* * Rescue workers are used only on emergencies and shared by @@ -136,7 +126,7 @@ struct worker { unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ - struct work_struct rebind_work; /* L: rebind worker to cpu */ + int sleeping; /* None */ }; /* @@ -163,10 +153,8 @@ struct global_cwq { struct ida worker_ida; /* L: for worker IDs */ - struct task_struct *trustee; /* L: for gcwq shutdown */ - unsigned int trustee_state; /* L: trustee state */ - wait_queue_head_t trustee_wait; /* trustee wait */ struct worker *first_idle; /* L: first idle worker */ + wait_queue_head_t idle_wait; } ____cacheline_aligned_in_smp; /* @@ -657,66 +645,58 @@ static void wake_up_worker(struct global } /** - * wq_worker_waking_up - a worker is waking up - * @task: task waking up - * @cpu: CPU @task is waking up to - * - * This function is called during try_to_wake_up() when a worker is - * being awoken. 
+ * wq_worker_running - a worker is running again + * @task: task returning from sleep * - * CONTEXT: - * spin_lock_irq(rq->lock) + * This function is called when a worker returns from schedule() */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +void wq_worker_running(struct task_struct *task) { struct worker *worker = kthread_data(task); + if (!worker->sleeping) + return; if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(get_gcwq_nr_running(cpu)); + atomic_inc(get_gcwq_nr_running(smp_processor_id())); + worker->sleeping = 0; } /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep - * @cpu: CPU in question, must be the current CPU number - * - * This function is called during schedule() when a busy worker is - * going to sleep. Worker on the same cpu can be woken up by - * returning pointer to its task. * - * CONTEXT: - * spin_lock_irq(rq->lock) - * - * RETURNS: - * Worker task on @cpu to wake up, %NULL if none. + * This function is called from schedule() when a busy worker is + * going to sleep. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu) +void wq_worker_sleeping(struct task_struct *task) { - struct worker *worker = kthread_data(task), *to_wakeup = NULL; - struct global_cwq *gcwq = get_gcwq(cpu); - atomic_t *nr_running = get_gcwq_nr_running(cpu); + struct worker *worker = kthread_data(task); + struct global_cwq *gcwq; + int cpu; if (worker->flags & WORKER_NOT_RUNNING) - return NULL; + return; - /* this can only happen on the local cpu */ - BUG_ON(cpu != raw_smp_processor_id()); + if (WARN_ON_ONCE(worker->sleeping)) + return; + + worker->sleeping = 1; + cpu = smp_processor_id(); + gcwq = get_gcwq(cpu); + spin_lock_irq(&gcwq->lock); /* * The counterpart of the following dec_and_test, implied mb, * worklist not empty test sequence is in insert_work(). * Please read comment there. - * - * NOT_RUNNING is clear. This means that trustee is not in - * charge and we're running on the local cpu w/ rq lock held - * and preemption disabled, which in turn means that none else - * could be manipulating idle_list, so dereferencing idle_list - * without gcwq lock is safe. - */ - if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) - to_wakeup = first_worker(gcwq); - return to_wakeup ? 
to_wakeup->task : NULL; + */ + if (atomic_dec_and_test(get_gcwq_nr_running(cpu)) && + !list_empty(&gcwq->worklist)) { + worker = first_worker(gcwq); + if (worker) + wake_up_process(worker->task); + } + spin_unlock_irq(&gcwq->lock); } /** @@ -978,13 +958,38 @@ static bool is_chained_work(struct workq return false; } -static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, - struct work_struct *work) +static void ___queue_work(struct workqueue_struct *wq, struct global_cwq *gcwq, + struct work_struct *work) { - struct global_cwq *gcwq; struct cpu_workqueue_struct *cwq; struct list_head *worklist; unsigned int work_flags; + + /* gcwq determined, get cwq and queue */ + cwq = get_cwq(gcwq->cpu, wq); + trace_workqueue_queue_work(gcwq->cpu, cwq, work); + + BUG_ON(!list_empty(&work->entry)); + + cwq->nr_in_flight[cwq->work_color]++; + work_flags = work_color_to_flags(cwq->work_color); + + if (likely(cwq->nr_active < cwq->max_active)) { + trace_workqueue_activate_work(work); + cwq->nr_active++; + worklist = gcwq_determine_ins_pos(gcwq, cwq); + } else { + work_flags |= WORK_STRUCT_DELAYED; + worklist = &cwq->delayed_works; + } + + insert_work(cwq, work, worklist, work_flags); +} + +static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, + struct work_struct *work) +{ + struct global_cwq *gcwq; unsigned long flags; debug_work_activate(work); @@ -1030,27 +1035,32 @@ static void __queue_work(unsigned int cp spin_lock_irqsave(&gcwq->lock, flags); } - /* gcwq determined, get cwq and queue */ - cwq = get_cwq(gcwq->cpu, wq); - trace_workqueue_queue_work(cpu, cwq, work); + ___queue_work(wq, gcwq, work); - BUG_ON(!list_empty(&work->entry)); + spin_unlock_irqrestore(&gcwq->lock, flags); +} - cwq->nr_in_flight[cwq->work_color]++; - work_flags = work_color_to_flags(cwq->work_color); +/** + * queue_work_on - queue work on specific cpu + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @work: work to queue + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + * + * We queue the work to a specific CPU, the caller must ensure it + * can't go away. + */ +static int +__queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) +{ + int ret = 0; - if (likely(cwq->nr_active < cwq->max_active)) { - trace_workqueue_activate_work(work); - cwq->nr_active++; - worklist = gcwq_determine_ins_pos(gcwq, cwq); - } else { - work_flags |= WORK_STRUCT_DELAYED; - worklist = &cwq->delayed_works; + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + __queue_work(cpu, wq, work); + ret = 1; } - - insert_work(cwq, work, worklist, work_flags); - - spin_unlock_irqrestore(&gcwq->lock, flags); + return ret; } /** @@ -1067,34 +1077,19 @@ int queue_work(struct workqueue_struct * { int ret; - ret = queue_work_on(get_cpu(), wq, work); - put_cpu(); + ret = __queue_work_on(get_cpu_light(), wq, work); + put_cpu_light(); return ret; } EXPORT_SYMBOL_GPL(queue_work); -/** - * queue_work_on - queue work on specific cpu - * @cpu: CPU number to execute work on - * @wq: workqueue to use - * @work: work to queue - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - * - * We queue the work to a specific CPU, the caller must ensure it - * can't go away. 
- */ int queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { - int ret = 0; + WARN_ON(wq->flags & WQ_NON_AFFINE); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { - __queue_work(cpu, wq, work); - ret = 1; - } - return ret; + return __queue_work_on(cpu, wq, work); } EXPORT_SYMBOL_GPL(queue_work_on); @@ -1140,6 +1135,8 @@ int queue_delayed_work_on(int cpu, struc struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; + WARN_ON((wq->flags & WQ_NON_AFFINE) && cpu != -1); + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { unsigned int lcpu; @@ -1205,12 +1202,13 @@ static void worker_enter_idle(struct wor /* idle_list is LIFO */ list_add(&worker->entry, &gcwq->idle_list); - if (likely(!(worker->flags & WORKER_ROGUE))) { - if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) - mod_timer(&gcwq->idle_timer, - jiffies + IDLE_WORKER_TIMEOUT); - } else - wake_up_all(&gcwq->trustee_wait); + if (gcwq->nr_idle == gcwq->nr_workers) + wake_up_all(&gcwq->idle_wait); + + if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) { + mod_timer(&gcwq->idle_timer, + jiffies + IDLE_WORKER_TIMEOUT); + } /* sanity check nr_running */ WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && @@ -1287,8 +1285,14 @@ __acquires(&gcwq->lock) return false; if (task_cpu(task) == gcwq->cpu && cpumask_equal(&current->cpus_allowed, - get_cpu_mask(gcwq->cpu))) + get_cpu_mask(gcwq->cpu))) { + /* + * Since we're binding to a particular cpu and need to + * stay there for correctness, mark us PF_THREAD_BOUND. + */ + task->flags |= PF_THREAD_BOUND; return true; + } spin_unlock_irq(&gcwq->lock); /* @@ -1302,20 +1306,15 @@ __acquires(&gcwq->lock) } } -/* - * Function for worker->rebind_work used to rebind rogue busy workers - * to the associated cpu which is coming back online. This is - * scheduled by cpu up but can race with other cpu hotplug operations - * and may be executed twice without intervening cpu down. - */ -static void worker_rebind_fn(struct work_struct *work) +static void worker_unbind_and_unlock(struct worker *worker) { - struct worker *worker = container_of(work, struct worker, rebind_work); struct global_cwq *gcwq = worker->gcwq; + struct task_struct *task = worker->task; - if (worker_maybe_bind_and_lock(worker)) - worker_clr_flags(worker, WORKER_REBIND); - + /* + * It's no longer required we're PF_THREAD_BOUND, the work is done. + */ + task->flags &= ~PF_THREAD_BOUND; spin_unlock_irq(&gcwq->lock); } @@ -1327,7 +1326,6 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); - INIT_WORK(&worker->rebind_work, worker_rebind_fn); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } @@ -1382,15 +1380,9 @@ static struct worker *create_worker(stru if (IS_ERR(worker->task)) goto fail; - /* - * A rogue worker will become a regular one if CPU comes - * online later on. Make sure every worker has - * PF_THREAD_BOUND set. - */ if (bind && !on_unbound_cpu) kthread_bind(worker->task, gcwq->cpu); else { - worker->task->flags |= PF_THREAD_BOUND; if (on_unbound_cpu) worker->flags |= WORKER_UNBOUND; } @@ -1667,13 +1659,6 @@ static bool manage_workers(struct worker gcwq->flags &= ~GCWQ_MANAGING_WORKERS; - /* - * The trustee might be waiting to take over the manager - * position, tell it we're done. 
- */ - if (unlikely(gcwq->trustee)) - wake_up_all(&gcwq->trustee_wait); - return ret; } @@ -2074,7 +2059,7 @@ repeat: if (keep_working(gcwq)) wake_up_worker(gcwq); - spin_unlock_irq(&gcwq->lock); + worker_unbind_and_unlock(rescuer); } schedule(); @@ -2970,7 +2955,6 @@ struct workqueue_struct *__alloc_workque if (IS_ERR(rescuer->task)) goto err; - rescuer->task->flags |= PF_THREAD_BOUND; wake_up_process(rescuer->task); } @@ -3189,171 +3173,76 @@ EXPORT_SYMBOL_GPL(work_busy); * gcwqs serve mix of short, long and very long running works making * blocked draining impractical. * - * This is solved by allowing a gcwq to be detached from CPU, running - * it with unbound (rogue) workers and allowing it to be reattached - * later if the cpu comes back online. A separate thread is created - * to govern a gcwq in such state and is called the trustee of the - * gcwq. - * - * Trustee states and their descriptions. - * - * START Command state used on startup. On CPU_DOWN_PREPARE, a - * new trustee is started with this state. - * - * IN_CHARGE Once started, trustee will enter this state after - * assuming the manager role and making all existing - * workers rogue. DOWN_PREPARE waits for trustee to - * enter this state. After reaching IN_CHARGE, trustee - * tries to execute the pending worklist until it's empty - * and the state is set to BUTCHER, or the state is set - * to RELEASE. - * - * BUTCHER Command state which is set by the cpu callback after - * the cpu has went down. Once this state is set trustee - * knows that there will be no new works on the worklist - * and once the worklist is empty it can proceed to - * killing idle workers. - * - * RELEASE Command state which is set by the cpu callback if the - * cpu down has been canceled or it has come online - * again. After recognizing this state, trustee stops - * trying to drain or butcher and clears ROGUE, rebinds - * all remaining workers back to the cpu and releases - * manager role. - * - * DONE Trustee will enter this state after BUTCHER or RELEASE - * is complete. - * - * trustee CPU draining - * took over down complete - * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE - * | | ^ - * | CPU is back online v return workers | - * ----------------> RELEASE -------------- */ -/** - * trustee_wait_event_timeout - timed event wait for trustee - * @cond: condition to wait for - * @timeout: timeout in jiffies - * - * wait_event_timeout() for trustee to use. Handles locking and - * checks for RELEASE request. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by trustee. - * - * RETURNS: - * Positive indicating left time if @cond is satisfied, 0 if timed - * out, -1 if canceled. - */ -#define trustee_wait_event_timeout(cond, timeout) ({ \ - long __ret = (timeout); \ - while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ - __ret) { \ - spin_unlock_irq(&gcwq->lock); \ - __wait_event_timeout(gcwq->trustee_wait, (cond) || \ - (gcwq->trustee_state == TRUSTEE_RELEASE), \ - __ret); \ - spin_lock_irq(&gcwq->lock); \ - } \ - gcwq->trustee_state == TRUSTEE_RELEASE ? 
-1 : (__ret); \ -}) +static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct global_cwq *gcwq = get_gcwq(cpu); + struct worker *uninitialized_var(new_worker); + unsigned long flags; -/** - * trustee_wait_event - event wait for trustee - * @cond: condition to wait for - * - * wait_event() for trustee to use. Automatically handles locking and - * checks for CANCEL request. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by trustee. - * - * RETURNS: - * 0 if @cond is satisfied, -1 if canceled. - */ -#define trustee_wait_event(cond) ({ \ - long __ret1; \ - __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ - __ret1 < 0 ? -1 : 0; \ -}) + action &= ~CPU_TASKS_FROZEN; -static int __cpuinit trustee_thread(void *__gcwq) -{ - struct global_cwq *gcwq = __gcwq; - struct worker *worker; - struct work_struct *work; - struct hlist_node *pos; - long rc; - int i; + switch (action) { + case CPU_UP_PREPARE: + BUG_ON(gcwq->first_idle); + new_worker = create_worker(gcwq, false); + if (!new_worker) + return NOTIFY_BAD; + case CPU_UP_CANCELED: + case CPU_ONLINE: + break; + default: + return notifier_from_errno(0); + } - BUG_ON(gcwq->cpu != smp_processor_id()); + /* some are called w/ irq disabled, don't disturb irq status */ + spin_lock_irqsave(&gcwq->lock, flags); - spin_lock_irq(&gcwq->lock); - /* - * Claim the manager position and make all workers rogue. - * Trustee must be bound to the target cpu and can't be - * cancelled. - */ - BUG_ON(gcwq->cpu != smp_processor_id()); - rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); - BUG_ON(rc < 0); + switch (action) { + case CPU_UP_PREPARE: + BUG_ON(gcwq->first_idle); + gcwq->first_idle = new_worker; + break; - gcwq->flags |= GCWQ_MANAGING_WORKERS; + case CPU_UP_CANCELED: + destroy_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; - list_for_each_entry(worker, &gcwq->idle_list, entry) - worker->flags |= WORKER_ROGUE; + case CPU_ONLINE: + spin_unlock_irq(&gcwq->lock); + kthread_bind(gcwq->first_idle->task, cpu); + spin_lock_irq(&gcwq->lock); + gcwq->flags |= GCWQ_MANAGE_WORKERS; + start_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; + } - for_each_busy_worker(worker, i, pos, gcwq) - worker->flags |= WORKER_ROGUE; + spin_unlock_irqrestore(&gcwq->lock, flags); - /* - * Call schedule() so that we cross rq->lock and thus can - * guarantee sched callbacks see the rogue flag. This is - * necessary as scheduler callbacks may be invoked from other - * cpus. - */ - spin_unlock_irq(&gcwq->lock); - schedule(); - spin_lock_irq(&gcwq->lock); + return notifier_from_errno(0); +} - /* - * Sched callbacks are disabled now. Zap nr_running. After - * this, nr_running stays zero and need_more_worker() and - * keep_working() are always true as long as the worklist is - * not empty. - */ - atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); +static void flush_gcwq(struct global_cwq *gcwq) +{ + struct work_struct *work, *nw; + struct worker *worker, *n; + LIST_HEAD(non_affine_works); - spin_unlock_irq(&gcwq->lock); - del_timer_sync(&gcwq->idle_timer); spin_lock_irq(&gcwq->lock); + list_for_each_entry_safe(work, nw, &gcwq->worklist, entry) { + struct workqueue_struct *wq = get_work_cwq(work)->wq; - /* - * We're now in charge. Notify and proceed to drain. 
We need - * to keep the gcwq running during the whole CPU down - * procedure as other cpu hotunplug callbacks may need to - * flush currently running tasks. - */ - gcwq->trustee_state = TRUSTEE_IN_CHARGE; - wake_up_all(&gcwq->trustee_wait); - - /* - * The original cpu is in the process of dying and may go away - * anytime now. When that happens, we and all workers would - * be migrated to other cpus. Try draining any left work. We - * want to get it over with ASAP - spam rescuers, wake up as - * many idlers as necessary and create new ones till the - * worklist is empty. Note that if the gcwq is frozen, there - * may be frozen works in freezable cwqs. Don't declare - * completion while frozen. - */ - while (gcwq->nr_workers != gcwq->nr_idle || - gcwq->flags & GCWQ_FREEZING || - gcwq->trustee_state == TRUSTEE_IN_CHARGE) { + if (wq->flags & WQ_NON_AFFINE) + list_move(&work->entry, &non_affine_works); + } + + while (!list_empty(&gcwq->worklist)) { int nr_works = 0; list_for_each_entry(work, &gcwq->worklist, entry) { @@ -3367,189 +3256,54 @@ static int __cpuinit trustee_thread(void wake_up_process(worker->task); } + spin_unlock_irq(&gcwq->lock); + if (need_to_create_worker(gcwq)) { - spin_unlock_irq(&gcwq->lock); - worker = create_worker(gcwq, false); - spin_lock_irq(&gcwq->lock); - if (worker) { - worker->flags |= WORKER_ROGUE; + worker = create_worker(gcwq, true); + if (worker) start_worker(worker); - } } - /* give a breather */ - if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) - break; - } - - /* - * Either all works have been scheduled and cpu is down, or - * cpu down has already been canceled. Wait for and butcher - * all workers till we're canceled. - */ - do { - rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); - while (!list_empty(&gcwq->idle_list)) - destroy_worker(list_first_entry(&gcwq->idle_list, - struct worker, entry)); - } while (gcwq->nr_workers && rc >= 0); - - /* - * At this point, either draining has completed and no worker - * is left, or cpu down has been canceled or the cpu is being - * brought back up. There shouldn't be any idle one left. - * Tell the remaining busy ones to rebind once it finishes the - * currently scheduled works by scheduling the rebind_work. - */ - WARN_ON(!list_empty(&gcwq->idle_list)); - - for_each_busy_worker(worker, i, pos, gcwq) { - struct work_struct *rebind_work = &worker->rebind_work; + wait_event_timeout(gcwq->idle_wait, + gcwq->nr_idle == gcwq->nr_workers, HZ/10); - /* - * Rebind_work may race with future cpu hotplug - * operations. Use a separate flag to mark that - * rebinding is scheduled. 
- */ - worker->flags |= WORKER_REBIND; - worker->flags &= ~WORKER_ROGUE; + spin_lock_irq(&gcwq->lock); + } - /* queue rebind_work, wq doesn't matter, use the default one */ - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(rebind_work))) - continue; + WARN_ON(gcwq->nr_workers != gcwq->nr_idle); - debug_work_activate(rebind_work); - insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); - } + list_for_each_entry_safe(worker, n, &gcwq->idle_list, entry) + destroy_worker(worker); - /* relinquish manager role */ - gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + WARN_ON(gcwq->nr_workers || gcwq->nr_idle); - /* notify completion */ - gcwq->trustee = NULL; - gcwq->trustee_state = TRUSTEE_DONE; - wake_up_all(&gcwq->trustee_wait); spin_unlock_irq(&gcwq->lock); - return 0; -} -/** - * wait_trustee_state - wait for trustee to enter the specified state - * @gcwq: gcwq the trustee of interest belongs to - * @state: target state to wait for - * - * Wait for the trustee to reach @state. DONE is already matched. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by cpu_callback. - */ -static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) -__releases(&gcwq->lock) -__acquires(&gcwq->lock) -{ - if (!(gcwq->trustee_state == state || - gcwq->trustee_state == TRUSTEE_DONE)) { - spin_unlock_irq(&gcwq->lock); - __wait_event(gcwq->trustee_wait, - gcwq->trustee_state == state || - gcwq->trustee_state == TRUSTEE_DONE); - spin_lock_irq(&gcwq->lock); + gcwq = get_gcwq(get_cpu_light()); + spin_lock_irq(&gcwq->lock); + list_for_each_entry_safe(work, nw, &non_affine_works, entry) { + list_del_init(&work->entry); + ___queue_work(get_work_cwq(work)->wq, gcwq, work); } + spin_unlock_irq(&gcwq->lock); + put_cpu_light(); } -static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, +static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct global_cwq *gcwq = get_gcwq(cpu); - struct task_struct *new_trustee = NULL; - struct worker *uninitialized_var(new_worker); - unsigned long flags; action &= ~CPU_TASKS_FROZEN; - switch (action) { - case CPU_DOWN_PREPARE: - new_trustee = kthread_create(trustee_thread, gcwq, - "workqueue_trustee/%d\n", cpu); - if (IS_ERR(new_trustee)) - return notifier_from_errno(PTR_ERR(new_trustee)); - kthread_bind(new_trustee, cpu); - /* fall through */ - case CPU_UP_PREPARE: - BUG_ON(gcwq->first_idle); - new_worker = create_worker(gcwq, false); - if (!new_worker) { - if (new_trustee) - kthread_stop(new_trustee); - return NOTIFY_BAD; - } - } + switch (action) { + case CPU_DOWN_PREPARE: + flush_gcwq(gcwq); + break; + } - /* some are called w/ irq disabled, don't disturb irq status */ - spin_lock_irqsave(&gcwq->lock, flags); - - switch (action) { - case CPU_DOWN_PREPARE: - /* initialize trustee and tell it to acquire the gcwq */ - BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); - gcwq->trustee = new_trustee; - gcwq->trustee_state = TRUSTEE_START; - wake_up_process(gcwq->trustee); - wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); - /* fall through */ - case CPU_UP_PREPARE: - BUG_ON(gcwq->first_idle); - gcwq->first_idle = new_worker; - break; - - case CPU_DYING: - /* - * Before this, the trustee and all workers except for - * the ones which are still executing works from - * before the last CPU down must be on the cpu. 
After - * this, they'll all be diasporas. - */ - gcwq->flags |= GCWQ_DISASSOCIATED; - break; - - case CPU_POST_DEAD: - gcwq->trustee_state = TRUSTEE_BUTCHER; - /* fall through */ - case CPU_UP_CANCELED: - destroy_worker(gcwq->first_idle); - gcwq->first_idle = NULL; - break; - - case CPU_DOWN_FAILED: - case CPU_ONLINE: - gcwq->flags &= ~GCWQ_DISASSOCIATED; - if (gcwq->trustee_state != TRUSTEE_DONE) { - gcwq->trustee_state = TRUSTEE_RELEASE; - wake_up_process(gcwq->trustee); - wait_trustee_state(gcwq, TRUSTEE_DONE); - } - - /* - * Trustee is done and there might be no worker left. - * Put the first_idle in and request a real manager to - * take a look. - */ - spin_unlock_irq(&gcwq->lock); - kthread_bind(gcwq->first_idle->task, cpu); - spin_lock_irq(&gcwq->lock); - gcwq->flags |= GCWQ_MANAGE_WORKERS; - start_worker(gcwq->first_idle); - gcwq->first_idle = NULL; - break; - } - - spin_unlock_irqrestore(&gcwq->lock, flags); return notifier_from_errno(0); } @@ -3747,7 +3501,8 @@ static int __init init_workqueues(void) unsigned int cpu; int i; - cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_ACTIVE); + hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_INACTIVE); /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { @@ -3770,9 +3525,7 @@ static int __init init_workqueues(void) (unsigned long)gcwq); ida_init(&gcwq->worker_ida); - - gcwq->trustee_state = TRUSTEE_DONE; - init_waitqueue_head(&gcwq->trustee_wait); + init_waitqueue_head(&gcwq->idle_wait); } /* create the initial worker */ Index: linux-3.0/kernel/workqueue_sched.h =================================================================== --- linux-3.0.orig/kernel/workqueue_sched.h +++ linux-3.0/kernel/workqueue_sched.h @@ -4,6 +4,5 @@ * Scheduler hooks for concurrency managed workqueue. Only to be * included from sched.c and workqueue.c. */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu); +void wq_worker_running(struct task_struct *task); +void wq_worker_sleeping(struct task_struct *task); Index: linux-3.0/arch/mips/sibyte/sb1250/irq.c =================================================================== --- linux-3.0.orig/arch/mips/sibyte/sb1250/irq.c +++ linux-3.0/arch/mips/sibyte/sb1250/irq.c @@ -178,7 +178,7 @@ static void ack_sb1250_irq(struct irq_da static struct irq_chip sb1250_irq_type = { .name = "SB1250-IMR", - .irq_mask_ack = ack_sb1250_irq, + .irq_mask = ack_sb1250_irq, .irq_unmask = enable_sb1250_irq, #ifdef CONFIG_SMP .irq_set_affinity = sb1250_set_affinity Index: linux-3.0/arch/mips/kernel/ftrace.c =================================================================== --- linux-3.0.orig/arch/mips/kernel/ftrace.c +++ linux-3.0/arch/mips/kernel/ftrace.c @@ -19,6 +19,26 @@ #include +#if defined(KBUILD_MCOUNT_RA_ADDRESS) && defined(CONFIG_32BIT) +#define MCOUNT_OFFSET_INSNS 5 +#else +#define MCOUNT_OFFSET_INSNS 4 +#endif + +/* + * Check if the address is in kernel space + * + * Clone core_kernel_text() from kernel/extable.c, but doesn't call + * init_kernel_text() for Ftrace doesn't trace functions in init sections. 
+ */ +static inline int in_kernel_space(unsigned long ip) +{ + if (ip >= (unsigned long)_stext && + ip <= (unsigned long)_etext) + return 1; + return 0; +} + #ifdef CONFIG_DYNAMIC_FTRACE #define JAL 0x0c000000 /* jump & link: ip --> ra, jump to target */ @@ -54,20 +74,6 @@ static inline void ftrace_dyn_arch_init_ #endif } -/* - * Check if the address is in kernel space - * - * Clone core_kernel_text() from kernel/extable.c, but doesn't call - * init_kernel_text() for Ftrace doesn't trace functions in init sections. - */ -static inline int in_kernel_space(unsigned long ip) -{ - if (ip >= (unsigned long)_stext && - ip <= (unsigned long)_etext) - return 1; - return 0; -} - static int ftrace_modify_code(unsigned long ip, unsigned int new_code) { int faulted; @@ -112,11 +118,6 @@ static int ftrace_modify_code(unsigned l * 1: offset = 4 instructions */ -#if defined(KBUILD_MCOUNT_RA_ADDRESS) && defined(CONFIG_32BIT) -#define MCOUNT_OFFSET_INSNS 5 -#else -#define MCOUNT_OFFSET_INSNS 4 -#endif #define INSN_B_1F (0x10000000 | MCOUNT_OFFSET_INSNS) int ftrace_make_nop(struct module *mod, Index: linux-3.0/arch/mips/loongson/fuloong-2e/irq.c =================================================================== --- linux-3.0.orig/arch/mips/loongson/fuloong-2e/irq.c +++ linux-3.0/arch/mips/loongson/fuloong-2e/irq.c @@ -42,6 +42,7 @@ asmlinkage void mach_irq_dispatch(unsign static struct irqaction cascade_irqaction = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; void __init mach_init_irq(void) Index: linux-3.0/arch/mips/loongson/lemote-2f/irq.c =================================================================== --- linux-3.0.orig/arch/mips/loongson/lemote-2f/irq.c +++ linux-3.0/arch/mips/loongson/lemote-2f/irq.c @@ -96,12 +96,13 @@ static irqreturn_t ip6_action(int cpl, v struct irqaction ip6_irqaction = { .handler = ip6_action, .name = "cascade", - .flags = IRQF_SHARED, + .flags = IRQF_SHARED | IRQF_NO_THREAD, }; struct irqaction cascade_irqaction = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; void __init mach_init_irq(void) Index: linux-3.0/arch/mips/ar7/irq.c =================================================================== --- linux-3.0.orig/arch/mips/ar7/irq.c +++ linux-3.0/arch/mips/ar7/irq.c @@ -98,7 +98,8 @@ static struct irq_chip ar7_sec_irq_type static struct irqaction ar7_cascade_action = { .handler = no_action, - .name = "AR7 cascade interrupt" + .name = "AR7 cascade interrupt", + .flags = IRQF_NO_THREAD, }; static void __init ar7_irq_init(int base) Index: linux-3.0/arch/mips/bcm63xx/irq.c =================================================================== --- linux-3.0.orig/arch/mips/bcm63xx/irq.c +++ linux-3.0/arch/mips/bcm63xx/irq.c @@ -222,6 +222,7 @@ static struct irq_chip bcm63xx_external_ static struct irqaction cpu_ip2_cascade_action = { .handler = no_action, .name = "cascade_ip2", + .flags = IRQF_NO_THREAD, }; void __init arch_init_irq(void) Index: linux-3.0/arch/mips/cobalt/irq.c =================================================================== --- linux-3.0.orig/arch/mips/cobalt/irq.c +++ linux-3.0/arch/mips/cobalt/irq.c @@ -48,6 +48,7 @@ asmlinkage void plat_irq_dispatch(void) static struct irqaction cascade = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; void __init arch_init_irq(void) Index: linux-3.0/arch/mips/dec/setup.c =================================================================== --- linux-3.0.orig/arch/mips/dec/setup.c +++ linux-3.0/arch/mips/dec/setup.c @@ -101,20 +101,24 @@ int 
cpu_fpu_mask = DEC_CPU_IRQ_MASK(DEC_ static struct irqaction ioirq = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; static struct irqaction fpuirq = { .handler = no_action, .name = "fpu", + .flags = IRQF_NO_THREAD, }; static struct irqaction busirq = { .flags = IRQF_DISABLED, .name = "bus error", + .flags = IRQF_DISABLED | IRQF_NO_THREAD, }; static struct irqaction haltirq = { .handler = dec_intr_halt, .name = "halt", + .flags = IRQF_NO_THREAD, }; Index: linux-3.0/arch/mips/emma/markeins/irq.c =================================================================== --- linux-3.0.orig/arch/mips/emma/markeins/irq.c +++ linux-3.0/arch/mips/emma/markeins/irq.c @@ -169,7 +169,7 @@ void emma2rh_gpio_irq_init(void) static struct irqaction irq_cascade = { .handler = no_action, - .flags = 0, + .flags = IRQF_NO_THREAD, .name = "cascade", .dev_id = NULL, .next = NULL, Index: linux-3.0/arch/mips/lasat/interrupt.c =================================================================== --- linux-3.0.orig/arch/mips/lasat/interrupt.c +++ linux-3.0/arch/mips/lasat/interrupt.c @@ -105,6 +105,7 @@ asmlinkage void plat_irq_dispatch(void) static struct irqaction cascade = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; void __init arch_init_irq(void) Index: linux-3.0/arch/mips/mti-malta/malta-int.c =================================================================== --- linux-3.0.orig/arch/mips/mti-malta/malta-int.c +++ linux-3.0/arch/mips/mti-malta/malta-int.c @@ -350,12 +350,14 @@ unsigned int plat_ipi_resched_int_xlate( static struct irqaction i8259irq = { .handler = no_action, - .name = "XT-PIC cascade" + .name = "XT-PIC cascade", + .flags = IRQF_NO_THREAD, }; static struct irqaction corehi_irqaction = { .handler = no_action, - .name = "CoreHi" + .name = "CoreHi", + .flags = IRQF_NO_THREAD, }; static msc_irqmap_t __initdata msc_irqmap[] = { Index: linux-3.0/arch/mips/pmc-sierra/msp71xx/msp_irq.c =================================================================== --- linux-3.0.orig/arch/mips/pmc-sierra/msp71xx/msp_irq.c +++ linux-3.0/arch/mips/pmc-sierra/msp71xx/msp_irq.c @@ -109,11 +109,13 @@ asmlinkage void plat_irq_dispatch(struct static struct irqaction cic_cascade_msp = { .handler = no_action, - .name = "MSP CIC cascade" + .name = "MSP CIC cascade", + .flags = IRQF_NO_THREAD, }; static struct irqaction per_cascade_msp = { .handler = no_action, - .name = "MSP PER cascade" + .name = "MSP PER cascade", + .flags = IRQF_NO_THREAD, }; void __init arch_init_irq(void) Index: linux-3.0/arch/mips/pnx8550/common/int.c =================================================================== --- linux-3.0.orig/arch/mips/pnx8550/common/int.c +++ linux-3.0/arch/mips/pnx8550/common/int.c @@ -167,7 +167,7 @@ static struct irq_chip level_irq_type = static struct irqaction gic_action = { .handler = no_action, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NO_THREAD, .name = "GIC", }; Index: linux-3.0/arch/mips/sgi-ip22/ip22-int.c =================================================================== --- linux-3.0.orig/arch/mips/sgi-ip22/ip22-int.c +++ linux-3.0/arch/mips/sgi-ip22/ip22-int.c @@ -155,32 +155,32 @@ static void __irq_entry indy_buserror_ir static struct irqaction local0_cascade = { .handler = no_action, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NO_THREAD, .name = "local0 cascade", }; static struct irqaction local1_cascade = { .handler = no_action, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NO_THREAD, .name = "local1 cascade", }; static struct irqaction buserr = { .handler = no_action, - .flags = IRQF_DISABLED, 
+ .flags = IRQF_DISABLED | IRQF_NO_THREAD, .name = "Bus Error", }; static struct irqaction map0_cascade = { .handler = no_action, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NO_THREAD, .name = "mapable0 cascade", }; #ifdef USE_LIO3_IRQ static struct irqaction map1_cascade = { .handler = no_action, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NO_THREAD, .name = "mapable1 cascade", }; #define SGI_INTERRUPTS SGINT_END Index: linux-3.0/arch/mips/sni/rm200.c =================================================================== --- linux-3.0.orig/arch/mips/sni/rm200.c +++ linux-3.0/arch/mips/sni/rm200.c @@ -359,6 +359,7 @@ void sni_rm200_init_8259A(void) static struct irqaction sni_rm200_irq2 = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; static struct resource sni_rm200_pic1_resource = { Index: linux-3.0/arch/mips/vr41xx/common/irq.c =================================================================== --- linux-3.0.orig/arch/mips/vr41xx/common/irq.c +++ linux-3.0/arch/mips/vr41xx/common/irq.c @@ -34,6 +34,7 @@ static irq_cascade_t irq_cascade[NR_IRQS static struct irqaction cascade_irqaction = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; int cascade_irq(unsigned int irq, int (*get_irq)(unsigned int)) Index: linux-3.0/arch/mips/Kconfig =================================================================== --- linux-3.0.orig/arch/mips/Kconfig +++ linux-3.0/arch/mips/Kconfig @@ -24,6 +24,7 @@ config MIPS select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW select HAVE_ARCH_JUMP_LABEL + select IRQ_FORCED_THREADING menu "Machine selection" @@ -2038,7 +2039,7 @@ config CPU_R4400_WORKAROUNDS # config HIGHMEM bool "High Memory Support" - depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM + depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !PREEMPT_RT_FULL config CPU_SUPPORTS_HIGHMEM bool Index: linux-3.0/arch/mips/kernel/traps.c =================================================================== --- linux-3.0.orig/arch/mips/kernel/traps.c +++ linux-3.0/arch/mips/kernel/traps.c @@ -364,7 +364,7 @@ static int regs_to_trapnr(struct pt_regs return (regs->cp0_cause >> 2) & 0x1f; } -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); void __noreturn die(const char *str, struct pt_regs *regs) { @@ -378,7 +378,7 @@ void __noreturn die(const char *str, str sig = 0; console_verbose(); - spin_lock_irq(&die_lock); + raw_spin_lock_irq(&die_lock); bust_spinlocks(1); #ifdef CONFIG_MIPS_MT_SMTC mips_mt_regdump(dvpret); @@ -387,7 +387,7 @@ void __noreturn die(const char *str, str printk("%s[#%d]:\n", str, ++die_counter); show_registers(regs); add_taint(TAINT_DIE); - spin_unlock_irq(&die_lock); + raw_spin_unlock_irq(&die_lock); if (in_interrupt()) panic("Fatal exception in interrupt"); Index: linux-3.0/arch/mips/kernel/signal.c =================================================================== --- linux-3.0.orig/arch/mips/kernel/signal.c +++ linux-3.0/arch/mips/kernel/signal.c @@ -603,6 +603,9 @@ static void do_signal(struct pt_regs *re if (!user_mode(regs)) return; + local_irq_enable(); + preempt_check_resched(); + if (test_thread_flag(TIF_RESTORE_SIGMASK)) oldset = ¤t->saved_sigmask; else Index: linux-3.0/arch/mips/kernel/i8259.c =================================================================== --- linux-3.0.orig/arch/mips/kernel/i8259.c +++ linux-3.0/arch/mips/kernel/i8259.c @@ -295,6 +295,7 @@ static void init_8259A(int auto_eoi) static struct irqaction irq2 = { .handler = no_action, .name 
= "cascade", + .flags = IRQF_NO_THREAD, }; static struct resource pic1_io_resource = { Index: linux-3.0/drivers/watchdog/octeon-wdt-main.c =================================================================== --- linux-3.0.orig/drivers/watchdog/octeon-wdt-main.c +++ linux-3.0/drivers/watchdog/octeon-wdt-main.c @@ -402,7 +402,7 @@ static void octeon_wdt_setup_interrupt(i irq = OCTEON_IRQ_WDOG0 + core; if (request_irq(irq, octeon_wdt_poke_irq, - IRQF_DISABLED, "octeon_wdt", octeon_wdt_poke_irq)) + IRQF_NO_THREAD, "octeon_wdt", octeon_wdt_poke_irq)) panic("octeon_wdt: Couldn't obtain irq %d", irq); cpumask_set_cpu(cpu, &irq_enabled_cpus); Index: linux-3.0/arch/mips/cavium-octeon/smp.c =================================================================== --- linux-3.0.orig/arch/mips/cavium-octeon/smp.c +++ linux-3.0/arch/mips/cavium-octeon/smp.c @@ -207,8 +207,9 @@ void octeon_prepare_cpus(unsigned int ma * the other bits alone. */ cvmx_write_csr(CVMX_CIU_MBOX_CLRX(cvmx_get_core_num()), 0xffff); - if (request_irq(OCTEON_IRQ_MBOX0, mailbox_interrupt, IRQF_DISABLED, - "SMP-IPI", mailbox_interrupt)) { + if (request_irq(OCTEON_IRQ_MBOX0, mailbox_interrupt, + IRQF_PERCPU | IRQF_NO_THREAD, "SMP-IPI", + mailbox_interrupt)) { panic("Cannot request_irq(OCTEON_IRQ_MBOX0)\n"); } } Index: linux-3.0/arch/arm/kernel/signal.c =================================================================== --- linux-3.0.orig/arch/arm/kernel/signal.c +++ linux-3.0/arch/arm/kernel/signal.c @@ -673,6 +673,9 @@ static void do_signal(struct pt_regs *re if (!user_mode(regs)) return; + local_irq_enable(); + preempt_check_resched(); + /* * If we were from a system call, check for system call restarting... */ Index: linux-3.0/arch/arm/kernel/smp.c =================================================================== --- linux-3.0.orig/arch/arm/kernel/smp.c +++ linux-3.0/arch/arm/kernel/smp.c @@ -305,6 +305,18 @@ asmlinkage void __cpuinit secondary_star * Enable local interrupts. */ notify_cpu_starting(cpu); + + /* + * OK, now it's safe to let the boot CPU continue. Wait for + * the CPU migration code to notice that the CPU is online + * before we continue. We need to do that before we enable + * interrupts otherwise a wakeup of a kernel thread affine to + * this CPU might break the affinity and let hell break lose. + */ + set_cpu_online(cpu, true); + while (!cpu_active(cpu)) + cpu_relax(); + local_irq_enable(); local_fiq_enable(); @@ -318,15 +330,6 @@ asmlinkage void __cpuinit secondary_star smp_store_cpu_info(cpu); /* - * OK, now it's safe to let the boot CPU continue. Wait for - * the CPU migration code to notice that the CPU is online - * before we continue. 
- */ - set_cpu_online(cpu, true); - while (!cpu_active(cpu)) - cpu_relax(); - - /* * OK, it's off to the idle thread for us */ cpu_idle(); @@ -531,7 +534,7 @@ static void percpu_timer_stop(void) } #endif -static DEFINE_SPINLOCK(stop_lock); +static DEFINE_RAW_SPINLOCK(stop_lock); /* * ipi_cpu_stop - handle IPI from smp_send_stop() @@ -540,10 +543,10 @@ static void ipi_cpu_stop(unsigned int cp { if (system_state == SYSTEM_BOOTING || system_state == SYSTEM_RUNNING) { - spin_lock(&stop_lock); + raw_spin_lock(&stop_lock); printk(KERN_CRIT "CPU%u: stopping\n", cpu); dump_stack(); - spin_unlock(&stop_lock); + raw_spin_unlock(&stop_lock); } set_cpu_online(cpu, false); Index: linux-3.0/include/linux/jbd.h =================================================================== --- linux-3.0.orig/include/linux/jbd.h +++ linux-3.0/include/linux/jbd.h @@ -244,6 +244,7 @@ typedef struct journal_superblock_s #include #include +#include #define J_ASSERT(assert) BUG_ON(!(assert)) @@ -270,69 +271,6 @@ typedef struct journal_superblock_s #define J_EXPECT_JH(jh, expr, why...) __journal_expect(expr, ## why) #endif -enum jbd_state_bits { - BH_JBD /* Has an attached ext3 journal_head */ - = BH_PrivateStart, - BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ - BH_Freed, /* Has been freed (truncated) */ - BH_Revoked, /* Has been revoked from the log */ - BH_RevokeValid, /* Revoked flag is valid */ - BH_JBDDirty, /* Is dirty but journaled */ - BH_State, /* Pins most journal_head state */ - BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ - BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */ -}; - -BUFFER_FNS(JBD, jbd) -BUFFER_FNS(JWrite, jwrite) -BUFFER_FNS(JBDDirty, jbddirty) -TAS_BUFFER_FNS(JBDDirty, jbddirty) -BUFFER_FNS(Revoked, revoked) -TAS_BUFFER_FNS(Revoked, revoked) -BUFFER_FNS(RevokeValid, revokevalid) -TAS_BUFFER_FNS(RevokeValid, revokevalid) -BUFFER_FNS(Freed, freed) - -static inline struct buffer_head *jh2bh(struct journal_head *jh) -{ - return jh->b_bh; -} - -static inline struct journal_head *bh2jh(struct buffer_head *bh) -{ - return bh->b_private; -} - -static inline void jbd_lock_bh_state(struct buffer_head *bh) -{ - bit_spin_lock(BH_State, &bh->b_state); -} - -static inline int jbd_trylock_bh_state(struct buffer_head *bh) -{ - return bit_spin_trylock(BH_State, &bh->b_state); -} - -static inline int jbd_is_locked_bh_state(struct buffer_head *bh) -{ - return bit_spin_is_locked(BH_State, &bh->b_state); -} - -static inline void jbd_unlock_bh_state(struct buffer_head *bh) -{ - bit_spin_unlock(BH_State, &bh->b_state); -} - -static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) -{ - bit_spin_lock(BH_JournalHead, &bh->b_state); -} - -static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) -{ - bit_spin_unlock(BH_JournalHead, &bh->b_state); -} - struct jbd_revoke_table_s; /** Index: linux-3.0/include/linux/jbd2.h =================================================================== --- linux-3.0.orig/include/linux/jbd2.h +++ linux-3.0/include/linux/jbd2.h @@ -275,6 +275,7 @@ typedef struct journal_superblock_s #include #include +#include #define J_ASSERT(assert) BUG_ON(!(assert)) @@ -302,70 +303,6 @@ typedef struct journal_superblock_s #define J_EXPECT_JH(jh, expr, why...) 
__journal_expect(expr, ## why) #endif -enum jbd_state_bits { - BH_JBD /* Has an attached ext3 journal_head */ - = BH_PrivateStart, - BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ - BH_Freed, /* Has been freed (truncated) */ - BH_Revoked, /* Has been revoked from the log */ - BH_RevokeValid, /* Revoked flag is valid */ - BH_JBDDirty, /* Is dirty but journaled */ - BH_State, /* Pins most journal_head state */ - BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ - BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */ - BH_JBDPrivateStart, /* First bit available for private use by FS */ -}; - -BUFFER_FNS(JBD, jbd) -BUFFER_FNS(JWrite, jwrite) -BUFFER_FNS(JBDDirty, jbddirty) -TAS_BUFFER_FNS(JBDDirty, jbddirty) -BUFFER_FNS(Revoked, revoked) -TAS_BUFFER_FNS(Revoked, revoked) -BUFFER_FNS(RevokeValid, revokevalid) -TAS_BUFFER_FNS(RevokeValid, revokevalid) -BUFFER_FNS(Freed, freed) - -static inline struct buffer_head *jh2bh(struct journal_head *jh) -{ - return jh->b_bh; -} - -static inline struct journal_head *bh2jh(struct buffer_head *bh) -{ - return bh->b_private; -} - -static inline void jbd_lock_bh_state(struct buffer_head *bh) -{ - bit_spin_lock(BH_State, &bh->b_state); -} - -static inline int jbd_trylock_bh_state(struct buffer_head *bh) -{ - return bit_spin_trylock(BH_State, &bh->b_state); -} - -static inline int jbd_is_locked_bh_state(struct buffer_head *bh) -{ - return bit_spin_is_locked(BH_State, &bh->b_state); -} - -static inline void jbd_unlock_bh_state(struct buffer_head *bh) -{ - bit_spin_unlock(BH_State, &bh->b_state); -} - -static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) -{ - bit_spin_lock(BH_JournalHead, &bh->b_state); -} - -static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) -{ - bit_spin_unlock(BH_JournalHead, &bh->b_state); -} - /* Flags in jbd_inode->i_flags */ #define __JI_COMMIT_RUNNING 0 /* Commit of the inode data in progress. 
We use this flag to protect us from Index: linux-3.0/include/linux/jbd_common.h =================================================================== --- /dev/null +++ linux-3.0/include/linux/jbd_common.h @@ -0,0 +1,92 @@ +#ifndef _LINUX_JBD_STATE_H +#define _LINUX_JBD_STATE_H + +enum jbd_state_bits { + BH_JBD /* Has an attached ext3 journal_head */ + = BH_PrivateStart, + BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ + BH_Freed, /* Has been freed (truncated) */ + BH_Revoked, /* Has been revoked from the log */ + BH_RevokeValid, /* Revoked flag is valid */ + BH_JBDDirty, /* Is dirty but journaled */ + BH_State, /* Pins most journal_head state */ + BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ + BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */ + BH_JBDPrivateStart, /* First bit available for private use by FS */ +}; + +BUFFER_FNS(JBD, jbd) +BUFFER_FNS(JWrite, jwrite) +BUFFER_FNS(JBDDirty, jbddirty) +TAS_BUFFER_FNS(JBDDirty, jbddirty) +BUFFER_FNS(Revoked, revoked) +TAS_BUFFER_FNS(Revoked, revoked) +BUFFER_FNS(RevokeValid, revokevalid) +TAS_BUFFER_FNS(RevokeValid, revokevalid) +BUFFER_FNS(Freed, freed) + +static inline struct buffer_head *jh2bh(struct journal_head *jh) +{ + return jh->b_bh; +} + +static inline struct journal_head *bh2jh(struct buffer_head *bh) +{ + return bh->b_private; +} + +static inline void jbd_lock_bh_state(struct buffer_head *bh) +{ +#ifndef CONFIG_PREEMPT_RT_BASE + bit_spin_lock(BH_State, &bh->b_state); +#else + spin_lock(&bh->b_state_lock); +#endif +} + +static inline int jbd_trylock_bh_state(struct buffer_head *bh) +{ +#ifndef CONFIG_PREEMPT_RT_BASE + return bit_spin_trylock(BH_State, &bh->b_state); +#else + return spin_trylock(&bh->b_state_lock); +#endif +} + +static inline int jbd_is_locked_bh_state(struct buffer_head *bh) +{ +#ifndef CONFIG_PREEMPT_RT_BASE + return bit_spin_is_locked(BH_State, &bh->b_state); +#else + return spin_is_locked(&bh->b_state_lock); +#endif +} + +static inline void jbd_unlock_bh_state(struct buffer_head *bh) +{ +#ifndef CONFIG_PREEMPT_RT_BASE + bit_spin_unlock(BH_State, &bh->b_state); +#else + spin_unlock(&bh->b_state_lock); +#endif +} + +static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) +{ +#ifndef CONFIG_PREEMPT_RT_BASE + bit_spin_lock(BH_JournalHead, &bh->b_state); +#else + spin_lock(&bh->b_journal_head_lock); +#endif +} + +static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) +{ +#ifndef CONFIG_PREEMPT_RT_BASE + bit_spin_unlock(BH_JournalHead, &bh->b_state); +#else + spin_unlock(&bh->b_journal_head_lock); +#endif +} + +#endif Index: linux-3.0/kernel/sched_rt.c =================================================================== --- linux-3.0.orig/kernel/sched_rt.c +++ linux-3.0/kernel/sched_rt.c @@ -536,6 +536,9 @@ static int balance_runtime(struct rt_rq { int more = 0; + if (!sched_feat(RT_RUNTIME_SHARE)) + return more; + if (rt_rq->rt_time > rt_rq->rt_runtime) { raw_spin_unlock(&rt_rq->rt_runtime_lock); more = do_balance_runtime(rt_rq); @@ -553,12 +556,9 @@ static inline int balance_runtime(struct static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { - int i, idle = 1; + int i, idle = 1, throttled = 0; const struct cpumask *span; - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return 1; - span = sched_rt_period_mask(); for_each_cpu(i, span) { int enqueue = 0; @@ -593,12 +593,17 @@ static int do_sched_rt_period_timer(stru if (!rt_rq_throttled(rt_rq)) enqueue = 1; } + if (rt_rq->rt_throttled) + throttled = 1; if (enqueue) 
sched_rt_rq_enqueue(rt_rq); raw_spin_unlock(&rq->lock); } + if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) + return 1; + return idle; } @@ -630,7 +635,24 @@ static int sched_rt_runtime_exceeded(str return 0; if (rt_rq->rt_time > runtime) { - rt_rq->rt_throttled = 1; + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + /* + * Don't actually throttle groups that have no runtime assigned + * but accrue some time due to boosting. + */ + if (likely(rt_b->rt_runtime)) { + rt_rq->rt_throttled = 1; + printk_once(KERN_WARNING "sched: RT throttling activated\n"); + } else { + /* + * In case we did anyway, make it go away, + * replenishment is a joke, since it will replenish us + * with exactly 0 ns. + */ + rt_rq->rt_time = 0; + } + if (rt_rq_throttled(rt_rq)) { sched_rt_rq_dequeue(rt_rq); return 1; @@ -658,7 +680,8 @@ static void update_curr_rt(struct rq *rq if (unlikely((s64)delta_exec < 0)) delta_exec = 0; - schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); @@ -1186,7 +1209,7 @@ static void deactivate_task(struct rq *r static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && + (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && (p->rt.nr_cpus_allowed > 1)) return 1; return 0; @@ -1331,7 +1354,7 @@ static struct rq *find_lock_lowest_rq(st */ if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(lowest_rq->cpu, - &task->cpus_allowed) || + tsk_cpus_allowed(task)) || task_running(rq, task) || !task->on_rq)) { @@ -1614,9 +1637,6 @@ static void set_cpus_allowed_rt(struct t update_rt_migration(&rq->rt); } - - cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = weight; } /* Assumes rq->lock is held */ Index: linux-3.0/arch/powerpc/platforms/85xx/mpc85xx_cds.c =================================================================== --- linux-3.0.orig/arch/powerpc/platforms/85xx/mpc85xx_cds.c +++ linux-3.0/arch/powerpc/platforms/85xx/mpc85xx_cds.c @@ -178,7 +178,7 @@ static irqreturn_t mpc85xx_8259_cascade_ static struct irqaction mpc85xxcds_8259_irqaction = { .handler = mpc85xx_8259_cascade_action, - .flags = IRQF_SHARED, + .flags = IRQF_SHARED | IRQF_NO_THREAD, .name = "8259 cascade", }; #endif /* PPC_I8259 */ Index: linux-3.0/arch/powerpc/platforms/wsp/opb_pic.c =================================================================== --- linux-3.0.orig/arch/powerpc/platforms/wsp/opb_pic.c +++ linux-3.0/arch/powerpc/platforms/wsp/opb_pic.c @@ -320,7 +320,8 @@ void __init opb_pic_init(void) } /* Attach opb interrupt handler to new virtual IRQ */ - rc = request_irq(virq, opb_irq_handler, 0, "OPB LS Cascade", opb); + rc = request_irq(virq, opb_irq_handler, IRQF_NO_THREAD, + "OPB LS Cascade", opb); if (rc) { printk("opb: request_irq failed: %d\n", rc); continue; Index: linux-3.0/arch/powerpc/kernel/smp.c =================================================================== --- linux-3.0.orig/arch/powerpc/kernel/smp.c +++ linux-3.0/arch/powerpc/kernel/smp.c @@ -170,7 +170,7 @@ int smp_request_message_ipi(int virq, in return 1; } #endif - err = request_irq(virq, smp_ipi_action[msg], IRQF_DISABLED|IRQF_PERCPU, + err = request_irq(virq, smp_ipi_action[msg], IRQF_NO_THREAD|IRQF_PERCPU, smp_ipi_name[msg], 0); WARN(err < 0, "unable to 
request_irq %d for %s (rc %d)\n", virq, smp_ipi_name[msg], err); Index: linux-3.0/arch/powerpc/platforms/powermac/smp.c =================================================================== --- linux-3.0.orig/arch/powerpc/platforms/powermac/smp.c +++ linux-3.0/arch/powerpc/platforms/powermac/smp.c @@ -200,7 +200,7 @@ static int psurge_secondary_ipi_init(voi if (psurge_secondary_virq) rc = request_irq(psurge_secondary_virq, psurge_ipi_intr, - IRQF_DISABLED|IRQF_PERCPU, "IPI", NULL); + IRQF_NO_THREAD|IRQF_PERCPU, "IPI", NULL); if (rc) pr_err("Failed to setup secondary cpu IPI\n"); @@ -408,7 +408,7 @@ static int __init smp_psurge_kick_cpu(in static struct irqaction psurge_irqaction = { .handler = psurge_ipi_intr, - .flags = IRQF_DISABLED|IRQF_PERCPU, + .flags = IRQF_NO_THREAD|IRQF_PERCPU, .name = "primary IPI", }; Index: linux-3.0/arch/powerpc/sysdev/xics/xics-common.c =================================================================== --- linux-3.0.orig/arch/powerpc/sysdev/xics/xics-common.c +++ linux-3.0/arch/powerpc/sysdev/xics/xics-common.c @@ -134,11 +134,11 @@ static void xics_request_ipi(void) BUG_ON(ipi == NO_IRQ); /* - * IPIs are marked IRQF_DISABLED as they must run with irqs - * disabled, and PERCPU. The handler was set in map. + * IPIs are marked PERCPU and also IRQF_NO_THREAD as they must + * run in hard interrupt context. The handler was set in map. */ BUG_ON(request_irq(ipi, icp_ops->ipi_action, - IRQF_DISABLED|IRQF_PERCPU, "IPI", NULL)); + IRQF_NO_THREAD|IRQF_PERCPU, "IPI", NULL)); } int __init xics_smp_probe(void) Index: linux-3.0/arch/powerpc/Kconfig =================================================================== --- linux-3.0.orig/arch/powerpc/Kconfig +++ linux-3.0/arch/powerpc/Kconfig @@ -69,10 +69,11 @@ config LOCKDEP_SUPPORT config RWSEM_GENERIC_SPINLOCK bool + default y if PREEMPT_RT_FULL config RWSEM_XCHGADD_ALGORITHM bool - default y + default y if !PREEMPT_RT_FULL config GENERIC_LOCKBREAK bool @@ -134,6 +135,7 @@ config PPC select GENERIC_IRQ_SHOW_LEVEL select HAVE_RCU_TABLE_FREE if SMP select HAVE_SYSCALL_TRACEPOINTS + select IRQ_FORCED_THREADING config EARLY_PRINTK bool @@ -271,7 +273,7 @@ menu "Kernel options" config HIGHMEM bool "High memory support" - depends on PPC32 + depends on PPC32 && !PREEMPT_RT_FULL source kernel/time/Kconfig source kernel/Kconfig.hz Index: linux-3.0/net/netfilter/ipvs/ip_vs_ctl.c =================================================================== --- linux-3.0.orig/net/netfilter/ipvs/ip_vs_ctl.c +++ linux-3.0/net/netfilter/ipvs/ip_vs_ctl.c @@ -3679,7 +3679,7 @@ int __net_init __ip_vs_control_init(stru int idx; struct netns_ipvs *ipvs = net_ipvs(net); - ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock); + rwlock_init(&ipvs->rs_lock); /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) Index: linux-3.0/kernel/watchdog.c =================================================================== --- linux-3.0.orig/kernel/watchdog.c +++ linux-3.0/kernel/watchdog.c @@ -208,6 +208,8 @@ static struct perf_event_attr wd_hw_attr .disabled = 1, }; +static DEFINE_RAW_SPINLOCK(watchdog_output_lock); + /* Callback function for perf event subsystem */ static void watchdog_overflow_callback(struct perf_event *event, int nmi, struct perf_sample_data *data, @@ -234,10 +236,19 @@ static void watchdog_overflow_callback(s if (__this_cpu_read(hard_watchdog_warn) == true) return; - if (hardlockup_panic) + /* + * If early-printk is enabled then make sure we do not + * lock up in printk() and kill console logging: + */ + printk_kill(); + + if 
(hardlockup_panic) { panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - else + } else { + raw_spin_lock(&watchdog_output_lock); WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + raw_spin_unlock(&watchdog_output_lock); + } __this_cpu_write(hard_watchdog_warn, true); return; @@ -320,7 +331,7 @@ static enum hrtimer_restart watchdog_tim */ static int watchdog(void *unused) { - static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); sched_setscheduler(current, SCHED_FIFO, ¶m); @@ -349,7 +360,8 @@ static int watchdog(void *unused) set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); - + param.sched_priority = 0; + sched_setscheduler(current, SCHED_NORMAL, ¶m); return 0; } @@ -422,6 +434,7 @@ static void watchdog_prepare_cpu(int cpu WARN_ON(per_cpu(softlockup_watchdog, cpu)); hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; + hrtimer->irqsafe = 1; } static int watchdog_enable(int cpu) Index: linux-3.0/kernel/time/clocksource.c =================================================================== --- linux-3.0.orig/kernel/time/clocksource.c +++ linux-3.0/kernel/time/clocksource.c @@ -186,6 +186,7 @@ static struct timer_list watchdog_timer; static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; +static atomic_t watchdog_reset_pending; static int clocksource_watchdog_kthread(void *data); static void __clocksource_change_rating(struct clocksource *cs, int rating); @@ -247,12 +248,14 @@ static void clocksource_watchdog(unsigne struct clocksource *cs; cycle_t csnow, wdnow; int64_t wd_nsec, cs_nsec; - int next_cpu; + int next_cpu, reset_pending; spin_lock(&watchdog_lock); if (!watchdog_running) goto out; + reset_pending = atomic_read(&watchdog_reset_pending); + list_for_each_entry(cs, &watchdog_list, wd_list) { /* Clocksource already marked unstable? */ @@ -268,7 +271,8 @@ static void clocksource_watchdog(unsigne local_irq_enable(); /* Clocksource initialized ? */ - if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || + atomic_read(&watchdog_reset_pending)) { cs->flags |= CLOCK_SOURCE_WATCHDOG; cs->wd_last = wdnow; cs->cs_last = csnow; @@ -283,8 +287,11 @@ static void clocksource_watchdog(unsigne cs->cs_last = csnow; cs->wd_last = wdnow; + if (atomic_read(&watchdog_reset_pending)) + continue; + /* Check the deviation from the watchdog clocksource. */ - if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { clocksource_unstable(cs, cs_nsec - wd_nsec); continue; } @@ -303,6 +310,13 @@ static void clocksource_watchdog(unsigne } /* + * We only clear the watchdog_reset_pending, when we did a + * full cycle through all clocksources. + */ + if (reset_pending) + atomic_dec(&watchdog_reset_pending); + + /* * Cycle through CPUs to check if the CPUs stay synchronized * to each other. */ @@ -344,23 +358,7 @@ static inline void clocksource_reset_wat static void clocksource_resume_watchdog(void) { - unsigned long flags; - - /* - * We use trylock here to avoid a potential dead lock when - * kgdb calls this code after the kernel has been stopped with - * watchdog_lock held. 
When watchdog_lock is held we just - * return and accept, that the watchdog might trigger and mark - * the monitored clock source (usually TSC) unstable. - * - * This does not affect the other caller clocksource_resume() - * because at this point the kernel is UP, interrupts are - * disabled and nothing can hold watchdog_lock. - */ - if (!spin_trylock_irqsave(&watchdog_lock, flags)) - return; - clocksource_reset_watchdog(); - spin_unlock_irqrestore(&watchdog_lock, flags); + atomic_inc(&watchdog_reset_pending); } static void clocksource_enqueue_watchdog(struct clocksource *cs) Index: linux-3.0/kernel/rtmutex-debug.c =================================================================== --- linux-3.0.orig/kernel/rtmutex-debug.c +++ linux-3.0/kernel/rtmutex-debug.c @@ -29,61 +29,6 @@ #include "rtmutex_common.h" -# define TRACE_WARN_ON(x) WARN_ON(x) -# define TRACE_BUG_ON(x) BUG_ON(x) - -# define TRACE_OFF() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - if (raw_spin_is_locked(¤t->pi_lock)) \ - raw_spin_unlock(¤t->pi_lock); \ - } \ -} while (0) - -# define TRACE_OFF_NOLOCK() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - } \ -} while (0) - -# define TRACE_BUG_LOCKED() \ -do { \ - TRACE_OFF(); \ - BUG(); \ -} while (0) - -# define TRACE_WARN_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) { \ - TRACE_OFF(); \ - WARN_ON(1); \ - } \ -} while (0) - -# define TRACE_BUG_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) \ - TRACE_BUG_LOCKED(); \ -} while (0) - -#ifdef CONFIG_SMP -# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) -#else -# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) -#endif - -/* - * deadlock detection flag. We turn it off when we detect - * the first problem because we dont want to recurse back - * into the tracing code when doing error printk or - * executing a BUG(): - */ -static int rt_trace_on = 1; - static void printk_task(struct task_struct *p) { if (p) @@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex void rt_mutex_debug_task_free(struct task_struct *task) { - WARN_ON(!plist_head_empty(&task->pi_waiters)); - WARN_ON(task->pi_blocked_on); + DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } /* @@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, { struct task_struct *task; - if (!rt_trace_on || detect || !act_waiter) + if (!debug_locks || detect || !act_waiter) return; task = rt_mutex_owner(act_waiter->lock); @@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struc { struct task_struct *task; - if (!waiter->deadlock_lock || !rt_trace_on) + if (!waiter->deadlock_lock || !debug_locks) return; rcu_read_lock(); @@ -149,7 +94,10 @@ void debug_rt_mutex_print_deadlock(struc return; } - TRACE_OFF_NOLOCK(); + if (!debug_locks_off()) { + rcu_read_unlock(); + return; + } printk("\n============================================\n"); printk( "[ BUG: circular locking deadlock detected! ]\n"); @@ -180,7 +128,6 @@ void debug_rt_mutex_print_deadlock(struc printk("[ turning off deadlock detection." "Please report this trace. 
]\n\n"); - local_irq_disable(); } void debug_rt_mutex_lock(struct rt_mutex *lock) @@ -189,7 +136,7 @@ void debug_rt_mutex_lock(struct rt_mutex void debug_rt_mutex_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); + DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); } void @@ -199,7 +146,7 @@ debug_rt_mutex_proxy_lock(struct rt_mute void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); + DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); } void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) @@ -213,8 +160,8 @@ void debug_rt_mutex_init_waiter(struct r void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { put_pid(waiter->deadlock_task_pid); - TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); - TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); memset(waiter, 0x22, sizeof(*waiter)); } Index: linux-3.0/include/linux/kprobes.h =================================================================== --- linux-3.0.orig/include/linux/kprobes.h +++ linux-3.0/include/linux/kprobes.h @@ -181,7 +181,7 @@ struct kretprobe { int nmissed; size_t data_size; struct hlist_head free_instances; - spinlock_t lock; + raw_spinlock_t lock; }; struct kretprobe_instance { Index: linux-3.0/kernel/kprobes.c =================================================================== --- linux-3.0.orig/kernel/kprobes.c +++ linux-3.0/kernel/kprobes.c @@ -78,10 +78,10 @@ static bool kprobes_all_disarmed; static DEFINE_MUTEX(kprobe_mutex); static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { - spinlock_t lock ____cacheline_aligned_in_smp; + raw_spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; -static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) +static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) { return &(kretprobe_table_locks[hash].lock); } @@ -1013,9 +1013,9 @@ void __kprobes recycle_rp_inst(struct kr hlist_del(&ri->hlist); INIT_HLIST_NODE(&ri->hlist); if (likely(rp)) { - spin_lock(&rp->lock); + raw_spin_lock(&rp->lock); hlist_add_head(&ri->hlist, &rp->free_instances); - spin_unlock(&rp->lock); + raw_spin_unlock(&rp->lock); } else /* Unregistering */ hlist_add_head(&ri->hlist, head); @@ -1026,19 +1026,19 @@ void __kprobes kretprobe_hash_lock(struc __acquires(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - spinlock_t *hlist_lock; + raw_spinlock_t *hlist_lock; *head = &kretprobe_inst_table[hash]; hlist_lock = kretprobe_table_lock_ptr(hash); - spin_lock_irqsave(hlist_lock, *flags); + raw_spin_lock_irqsave(hlist_lock, *flags); } static void __kprobes kretprobe_table_lock(unsigned long hash, unsigned long *flags) __acquires(hlist_lock) { - spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - spin_lock_irqsave(hlist_lock, *flags); + raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + raw_spin_lock_irqsave(hlist_lock, *flags); } void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, @@ -1046,18 +1046,18 @@ void __kprobes kretprobe_hash_unlock(str __releases(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - spinlock_t *hlist_lock; + raw_spinlock_t *hlist_lock; hlist_lock = kretprobe_table_lock_ptr(hash); - spin_unlock_irqrestore(hlist_lock, *flags); + raw_spin_unlock_irqrestore(hlist_lock, *flags); } static void __kprobes 
kretprobe_table_unlock(unsigned long hash, unsigned long *flags) __releases(hlist_lock) { - spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - spin_unlock_irqrestore(hlist_lock, *flags); + raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + raw_spin_unlock_irqrestore(hlist_lock, *flags); } /* @@ -1650,12 +1650,12 @@ static int __kprobes pre_handler_kretpro /*TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); - spin_lock_irqsave(&rp->lock, flags); + raw_spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { ri = hlist_entry(rp->free_instances.first, struct kretprobe_instance, hlist); hlist_del(&ri->hlist); - spin_unlock_irqrestore(&rp->lock, flags); + raw_spin_unlock_irqrestore(&rp->lock, flags); ri->rp = rp; ri->task = current; @@ -1672,7 +1672,7 @@ static int __kprobes pre_handler_kretpro kretprobe_table_unlock(hash, &flags); } else { rp->nmissed++; - spin_unlock_irqrestore(&rp->lock, flags); + raw_spin_unlock_irqrestore(&rp->lock, flags); } return 0; } @@ -1708,7 +1708,7 @@ int __kprobes register_kretprobe(struct rp->maxactive = num_possible_cpus(); #endif } - spin_lock_init(&rp->lock); + raw_spin_lock_init(&rp->lock); INIT_HLIST_HEAD(&rp->free_instances); for (i = 0; i < rp->maxactive; i++) { inst = kmalloc(sizeof(struct kretprobe_instance) + @@ -1946,7 +1946,7 @@ static int __init init_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { INIT_HLIST_HEAD(&kprobe_table[i]); INIT_HLIST_HEAD(&kretprobe_inst_table[i]); - spin_lock_init(&(kretprobe_table_locks[i].lock)); + raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); } /* Index: linux-3.0/include/linux/percpu_counter.h =================================================================== --- linux-3.0.orig/include/linux/percpu_counter.h +++ linux-3.0/include/linux/percpu_counter.h @@ -16,7 +16,7 @@ #ifdef CONFIG_SMP struct percpu_counter { - spinlock_t lock; + raw_spinlock_t lock; s64 count; #ifdef CONFIG_HOTPLUG_CPU struct list_head list; /* All percpu_counters are on a list */ Index: linux-3.0/lib/percpu_counter.c =================================================================== --- linux-3.0.orig/lib/percpu_counter.c +++ linux-3.0/lib/percpu_counter.c @@ -59,13 +59,13 @@ void percpu_counter_set(struct percpu_co { int cpu; - spin_lock(&fbc->lock); + raw_spin_lock(&fbc->lock); for_each_possible_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); *pcount = 0; } fbc->count = amount; - spin_unlock(&fbc->lock); + raw_spin_unlock(&fbc->lock); } EXPORT_SYMBOL(percpu_counter_set); @@ -76,10 +76,10 @@ void __percpu_counter_add(struct percpu_ preempt_disable(); count = __this_cpu_read(*fbc->counters) + amount; if (count >= batch || count <= -batch) { - spin_lock(&fbc->lock); + raw_spin_lock(&fbc->lock); fbc->count += count; __this_cpu_write(*fbc->counters, 0); - spin_unlock(&fbc->lock); + raw_spin_unlock(&fbc->lock); } else { __this_cpu_write(*fbc->counters, count); } @@ -96,13 +96,13 @@ s64 __percpu_counter_sum(struct percpu_c s64 ret; int cpu; - spin_lock(&fbc->lock); + raw_spin_lock(&fbc->lock); ret = fbc->count; for_each_online_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } - spin_unlock(&fbc->lock); + raw_spin_unlock(&fbc->lock); return ret; } EXPORT_SYMBOL(__percpu_counter_sum); @@ -110,7 +110,7 @@ EXPORT_SYMBOL(__percpu_counter_sum); int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, struct lock_class_key *key) { - spin_lock_init(&fbc->lock); + raw_spin_lock_init(&fbc->lock); 
lockdep_set_class(&fbc->lock, key); fbc->count = amount; fbc->counters = alloc_percpu(s32); @@ -173,11 +173,11 @@ static int __cpuinit percpu_counter_hotc s32 *pcount; unsigned long flags; - spin_lock_irqsave(&fbc->lock, flags); + raw_spin_lock_irqsave(&fbc->lock, flags); pcount = per_cpu_ptr(fbc->counters, cpu); fbc->count += *pcount; *pcount = 0; - spin_unlock_irqrestore(&fbc->lock, flags); + raw_spin_unlock_irqrestore(&fbc->lock, flags); } mutex_unlock(&percpu_counters_lock); #endif Index: linux-3.0/kernel/cgroup.c =================================================================== --- linux-3.0.orig/kernel/cgroup.c +++ linux-3.0/kernel/cgroup.c @@ -263,7 +263,7 @@ list_for_each_entry(_root, &roots, root_ /* the list of cgroups eligible for automatic release. Protected by * release_list_lock */ static LIST_HEAD(release_list); -static DEFINE_SPINLOCK(release_list_lock); +static DEFINE_RAW_SPINLOCK(release_list_lock); static void cgroup_release_agent(struct work_struct *work); static DECLARE_WORK(release_agent_work, cgroup_release_agent); static void check_for_release(struct cgroup *cgrp); @@ -4010,11 +4010,11 @@ again: finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ @@ -4667,13 +4667,13 @@ static void check_for_release(struct cgr * already queued for a userspace notification, queue * it now */ int need_schedule_work = 0; - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); if (!cgroup_is_removed(cgrp) && list_empty(&cgrp->release_list)) { list_add(&cgrp->release_list, &release_list); need_schedule_work = 1; } - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); if (need_schedule_work) schedule_work(&release_agent_work); } @@ -4725,7 +4725,7 @@ static void cgroup_release_agent(struct { BUG_ON(work != &release_agent_work); mutex_lock(&cgroup_mutex); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); while (!list_empty(&release_list)) { char *argv[3], *envp[3]; int i; @@ -4734,7 +4734,7 @@ static void cgroup_release_agent(struct struct cgroup, release_list); list_del_init(&cgrp->release_list); - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!pathbuf) goto continue_free; @@ -4764,9 +4764,9 @@ static void cgroup_release_agent(struct continue_free: kfree(pathbuf); kfree(agentbuf); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); } - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); mutex_unlock(&cgroup_mutex); } Index: linux-3.0/include/linux/proportions.h =================================================================== --- linux-3.0.orig/include/linux/proportions.h +++ linux-3.0/include/linux/proportions.h @@ -58,7 +58,7 @@ struct prop_local_percpu { */ int shift; unsigned long period; - spinlock_t lock; /* protect the snapshot state */ + raw_spinlock_t lock; /* protect the snapshot state */ }; int prop_local_init_percpu(struct prop_local_percpu *pl); @@ -106,11 +106,11 @@ struct prop_local_single { */ unsigned long period; int shift; - spinlock_t lock; /* protect the snapshot state */ + raw_spinlock_t lock; /* protect the snapshot state */ }; #define 
INIT_PROP_LOCAL_SINGLE(name) \ -{ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ +{ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ } int prop_local_init_single(struct prop_local_single *pl); Index: linux-3.0/lib/proportions.c =================================================================== --- linux-3.0.orig/lib/proportions.c +++ linux-3.0/lib/proportions.c @@ -190,7 +190,7 @@ prop_adjust_shift(int *pl_shift, unsigne int prop_local_init_percpu(struct prop_local_percpu *pl) { - spin_lock_init(&pl->lock); + raw_spin_lock_init(&pl->lock); pl->shift = 0; pl->period = 0; return percpu_counter_init(&pl->events, 0); @@ -226,7 +226,7 @@ void prop_norm_percpu(struct prop_global if (pl->period == global_period) return; - spin_lock_irqsave(&pl->lock, flags); + raw_spin_lock_irqsave(&pl->lock, flags); prop_adjust_shift(&pl->shift, &pl->period, pg->shift); /* @@ -247,7 +247,7 @@ void prop_norm_percpu(struct prop_global percpu_counter_set(&pl->events, 0); pl->period = global_period; - spin_unlock_irqrestore(&pl->lock, flags); + raw_spin_unlock_irqrestore(&pl->lock, flags); } /* @@ -324,7 +324,7 @@ void prop_fraction_percpu(struct prop_de int prop_local_init_single(struct prop_local_single *pl) { - spin_lock_init(&pl->lock); + raw_spin_lock_init(&pl->lock); pl->shift = 0; pl->period = 0; pl->events = 0; @@ -356,7 +356,7 @@ void prop_norm_single(struct prop_global if (pl->period == global_period) return; - spin_lock_irqsave(&pl->lock, flags); + raw_spin_lock_irqsave(&pl->lock, flags); prop_adjust_shift(&pl->shift, &pl->period, pg->shift); /* * For each missed period, we half the local counter. @@ -367,7 +367,7 @@ void prop_norm_single(struct prop_global else pl->events = 0; pl->period = global_period; - spin_unlock_irqrestore(&pl->lock, flags); + raw_spin_unlock_irqrestore(&pl->lock, flags); } /* Index: linux-3.0/kernel/trace/ring_buffer.c =================================================================== --- linux-3.0.orig/kernel/trace/ring_buffer.c +++ linux-3.0/kernel/trace/ring_buffer.c @@ -1040,6 +1040,44 @@ static int rb_allocate_pages(struct ring return -ENOMEM; } +static inline int ok_to_lock(void) +{ + if (in_nmi()) + return 0; +#ifdef CONFIG_PREEMPT_RT_FULL + if (in_atomic()) + return 0; +#endif + return 1; +} + +static int +read_buffer_lock(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long *flags) +{ + /* + * If an NMI die dumps out the content of the ring buffer + * do not grab locks. We also permanently disable the ring + * buffer too. A one time deal is all you get from reading + * the ring buffer from an NMI. 
+ */ + if (!ok_to_lock()) { + if (spin_trylock_irqsave(&cpu_buffer->reader_lock, *flags)) + return 1; + tracing_off_permanent(); + return 0; + } + spin_lock_irqsave(&cpu_buffer->reader_lock, *flags); + return 1; +} + +static void +read_buffer_unlock(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long flags, int locked) +{ + if (locked) + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +} static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) { @@ -1250,9 +1288,11 @@ rb_remove_pages(struct ring_buffer_per_c { struct buffer_page *bpage; struct list_head *p; + unsigned long flags; unsigned i; + int locked; - spin_lock_irq(&cpu_buffer->reader_lock); + locked = read_buffer_lock(cpu_buffer, &flags); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1270,7 +1310,7 @@ rb_remove_pages(struct ring_buffer_per_c rb_check_pages(cpu_buffer); out: - spin_unlock_irq(&cpu_buffer->reader_lock); + read_buffer_unlock(cpu_buffer, flags, locked); } static void @@ -1279,9 +1319,11 @@ rb_insert_pages(struct ring_buffer_per_c { struct buffer_page *bpage; struct list_head *p; + unsigned long flags; unsigned i; + int locked; - spin_lock_irq(&cpu_buffer->reader_lock); + locked = read_buffer_lock(cpu_buffer, &flags); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1296,7 +1338,7 @@ rb_insert_pages(struct ring_buffer_per_c rb_check_pages(cpu_buffer); out: - spin_unlock_irq(&cpu_buffer->reader_lock); + read_buffer_unlock(cpu_buffer, flags, locked); } /** @@ -2784,15 +2826,16 @@ void ring_buffer_iter_reset(struct ring_ { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; + int locked; if (!iter) return; cpu_buffer = iter->cpu_buffer; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + locked = read_buffer_lock(cpu_buffer, &flags); rb_iter_reset(iter); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + read_buffer_unlock(cpu_buffer, flags, locked); } EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); @@ -3210,21 +3253,6 @@ rb_iter_peek(struct ring_buffer_iter *it } EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); -static inline int rb_ok_to_lock(void) -{ - /* - * If an NMI die dumps out the content of the ring buffer - * do not grab locks. We also permanently disable the ring - * buffer too. A one time deal is all you get from reading - * the ring buffer from an NMI. 
- */ - if (likely(!in_nmi())) - return 1; - - tracing_off_permanent(); - return 0; -} - /** * ring_buffer_peek - peek at the next event to be read * @buffer: The ring buffer to read @@ -3242,22 +3270,17 @@ ring_buffer_peek(struct ring_buffer *buf struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; unsigned long flags; - int dolock; + int locked; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; - dolock = rb_ok_to_lock(); again: - local_irq_save(flags); - if (dolock) - spin_lock(&cpu_buffer->reader_lock); + locked = read_buffer_lock(cpu_buffer, &flags); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); - if (dolock) - spin_unlock(&cpu_buffer->reader_lock); - local_irq_restore(flags); + read_buffer_unlock(cpu_buffer, flags, locked); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; @@ -3279,11 +3302,12 @@ ring_buffer_iter_peek(struct ring_buffer struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; struct ring_buffer_event *event; unsigned long flags; + int locked; again: - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + locked = read_buffer_lock(cpu_buffer, &flags); event = rb_iter_peek(iter, ts); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + read_buffer_unlock(cpu_buffer, flags, locked); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; @@ -3309,9 +3333,7 @@ ring_buffer_consume(struct ring_buffer * struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event = NULL; unsigned long flags; - int dolock; - - dolock = rb_ok_to_lock(); + int locked; again: /* might be called in atomic */ @@ -3321,9 +3343,7 @@ ring_buffer_consume(struct ring_buffer * goto out; cpu_buffer = buffer->buffers[cpu]; - local_irq_save(flags); - if (dolock) - spin_lock(&cpu_buffer->reader_lock); + locked = read_buffer_lock(cpu_buffer, &flags); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event) { @@ -3331,9 +3351,8 @@ ring_buffer_consume(struct ring_buffer * rb_advance_reader(cpu_buffer); } - if (dolock) - spin_unlock(&cpu_buffer->reader_lock); - local_irq_restore(flags); + read_buffer_unlock(cpu_buffer, flags, locked); + out: preempt_enable(); @@ -3418,17 +3437,18 @@ ring_buffer_read_start(struct ring_buffe { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; + int locked; if (!iter) return; cpu_buffer = iter->cpu_buffer; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + locked = read_buffer_lock(cpu_buffer, &flags); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + read_buffer_unlock(cpu_buffer, flags, locked); } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -3462,8 +3482,9 @@ ring_buffer_read(struct ring_buffer_iter struct ring_buffer_event *event; struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; + int locked; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + locked = read_buffer_lock(cpu_buffer, &flags); again: event = rb_iter_peek(iter, ts); if (!event) @@ -3474,7 +3495,7 @@ ring_buffer_read(struct ring_buffer_iter rb_advance_iter(iter); out: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + read_buffer_unlock(cpu_buffer, flags, locked); return event; } @@ -3537,13 +3558,14 @@ void ring_buffer_reset_cpu(struct ring_b { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; unsigned long flags; + int locked; if 
(!cpumask_test_cpu(cpu, buffer->cpumask)) return; atomic_inc(&cpu_buffer->record_disabled); - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + locked = read_buffer_lock(cpu_buffer, &flags); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) goto out; @@ -3555,7 +3577,7 @@ void ring_buffer_reset_cpu(struct ring_b arch_spin_unlock(&cpu_buffer->lock); out: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + read_buffer_unlock(cpu_buffer, flags, locked); atomic_dec(&cpu_buffer->record_disabled); } @@ -3582,22 +3604,16 @@ int ring_buffer_empty(struct ring_buffer { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; - int dolock; + int locked; int cpu; int ret; - dolock = rb_ok_to_lock(); - /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - local_irq_save(flags); - if (dolock) - spin_lock(&cpu_buffer->reader_lock); + locked = read_buffer_lock(cpu_buffer, &flags); ret = rb_per_cpu_empty(cpu_buffer); - if (dolock) - spin_unlock(&cpu_buffer->reader_lock); - local_irq_restore(flags); + read_buffer_unlock(cpu_buffer, flags, locked); if (!ret) return 0; @@ -3616,22 +3632,16 @@ int ring_buffer_empty_cpu(struct ring_bu { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; - int dolock; + int locked; int ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 1; - dolock = rb_ok_to_lock(); - cpu_buffer = buffer->buffers[cpu]; - local_irq_save(flags); - if (dolock) - spin_lock(&cpu_buffer->reader_lock); + locked = read_buffer_lock(cpu_buffer, &flags); ret = rb_per_cpu_empty(cpu_buffer); - if (dolock) - spin_unlock(&cpu_buffer->reader_lock); - local_irq_restore(flags); + read_buffer_unlock(cpu_buffer, flags, locked); return ret; } @@ -3805,6 +3815,7 @@ int ring_buffer_read_page(struct ring_bu unsigned int commit; unsigned int read; u64 save_timestamp; + int locked; int ret = -1; if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -3826,7 +3837,7 @@ int ring_buffer_read_page(struct ring_bu if (!bpage) goto out; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + locked = read_buffer_lock(cpu_buffer, &flags); reader = rb_get_reader_page(cpu_buffer); if (!reader) @@ -3949,7 +3960,7 @@ int ring_buffer_read_page(struct ring_bu memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); out_unlock: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + read_buffer_unlock(cpu_buffer, flags, locked); out: return ret; Index: linux-3.0/kernel/trace/trace.c =================================================================== --- linux-3.0.orig/kernel/trace/trace.c +++ linux-3.0/kernel/trace/trace.c @@ -341,7 +341,7 @@ unsigned long trace_flags = TRACE_ITER_P TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; static int trace_stop_count; -static DEFINE_SPINLOCK(tracing_start_lock); +static DEFINE_RAW_SPINLOCK(tracing_start_lock); /** * trace_wake_up - wake up tasks waiting for trace input @@ -351,6 +351,7 @@ static DEFINE_SPINLOCK(tracing_start_loc */ void trace_wake_up(void) { +#ifndef CONFIG_PREEMPT_RT_FULL int cpu; if (trace_flags & TRACE_ITER_BLOCK) @@ -363,6 +364,7 @@ void trace_wake_up(void) if (!runqueue_is_locked(cpu)) wake_up(&trace_wait); put_cpu(); +#endif } static int __init set_buf_size(char *str) @@ -716,6 +718,12 @@ update_max_tr_single(struct trace_array } #endif /* CONFIG_TRACER_MAX_TRACE */ +#ifndef CONFIG_PREEMPT_RT_FULL +static void default_wait_pipe(struct trace_iterator *iter); +#else +#define default_wait_pipe poll_wait_pipe +#endif + 
/** * register_tracer - register a tracer with the ftrace system. * @type - the plugin for the tracer @@ -958,7 +966,7 @@ void tracing_start(void) if (tracing_disabled) return; - spin_lock_irqsave(&tracing_start_lock, flags); + raw_spin_lock_irqsave(&tracing_start_lock, flags); if (--trace_stop_count) { if (trace_stop_count < 0) { /* Someone screwed up their debugging */ @@ -983,7 +991,7 @@ void tracing_start(void) ftrace_start(); out: - spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&tracing_start_lock, flags); } /** @@ -998,7 +1006,7 @@ void tracing_stop(void) unsigned long flags; ftrace_stop(); - spin_lock_irqsave(&tracing_start_lock, flags); + raw_spin_lock_irqsave(&tracing_start_lock, flags); if (trace_stop_count++) goto out; @@ -1016,7 +1024,7 @@ void tracing_stop(void) arch_spin_unlock(&ftrace_max_lock); out: - spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&tracing_start_lock, flags); } void trace_stop_cmdline_recording(void); @@ -1120,6 +1128,8 @@ tracing_generic_entry_update(struct trac ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); + + entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0; } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); @@ -1757,9 +1767,10 @@ static void print_lat_help_header(struct seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# |||| / _--=> migrate-disable\n"); + seq_puts(m, "# ||||| / delay \n"); + seq_puts(m, "# cmd pid |||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) @@ -3067,6 +3078,7 @@ static int tracing_release_pipe(struct i return 0; } +#ifndef CONFIG_PREEMPT_RT_FULL static unsigned int tracing_poll_pipe(struct file *filp, poll_table *poll_table) { @@ -3088,8 +3100,7 @@ tracing_poll_pipe(struct file *filp, pol } } - -void default_wait_pipe(struct trace_iterator *iter) +static void default_wait_pipe(struct trace_iterator *iter) { DEFINE_WAIT(wait); @@ -3100,6 +3111,20 @@ void default_wait_pipe(struct trace_iter finish_wait(&trace_wait, &wait); } +#else +static unsigned int +tracing_poll_pipe(struct file *filp, poll_table *poll_table) +{ + struct trace_iterator *iter = filp->private_data; + + if ((trace_flags & TRACE_ITER_BLOCK) || !trace_empty(iter)) + return POLLIN | POLLRDNORM; + poll_wait_pipe(iter); + if (!trace_empty(iter)) + return POLLIN | POLLRDNORM; + return 0; +} +#endif /* * This is a make-shift waitqueue. 
Index: linux-3.0/kernel/trace/trace_irqsoff.c =================================================================== --- linux-3.0.orig/kernel/trace/trace_irqsoff.c +++ linux-3.0/kernel/trace/trace_irqsoff.c @@ -17,13 +17,14 @@ #include #include "trace.h" +#include static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; static DEFINE_PER_CPU(int, tracing_cpu); -static DEFINE_SPINLOCK(max_trace_lock); +static DEFINE_RAW_SPINLOCK(max_trace_lock); enum { TRACER_IRQS_OFF = (1 << 1), @@ -319,7 +320,7 @@ check_critical_timing(struct trace_array if (!report_latency(delta)) goto out; - spin_lock_irqsave(&max_trace_lock, flags); + raw_spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ if (!report_latency(delta)) @@ -342,7 +343,7 @@ check_critical_timing(struct trace_array max_sequence++; out_unlock: - spin_unlock_irqrestore(&max_trace_lock, flags); + raw_spin_unlock_irqrestore(&max_trace_lock, flags); out: data->critical_sequence = max_sequence; @@ -424,11 +425,13 @@ void start_critical_timings(void) { if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + trace_preemptirqsoff_hist(TRACE_START, 1); } EXPORT_SYMBOL_GPL(start_critical_timings); void stop_critical_timings(void) { + trace_preemptirqsoff_hist(TRACE_STOP, 0); if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -438,6 +441,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings) #ifdef CONFIG_PROVE_LOCKING void time_hardirqs_on(unsigned long a0, unsigned long a1) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } @@ -446,6 +450,7 @@ void time_hardirqs_off(unsigned long a0, { if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } #else /* !CONFIG_PROVE_LOCKING */ @@ -471,6 +476,7 @@ inline void print_irqtrace_events(struct */ void trace_hardirqs_on(void) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -480,11 +486,13 @@ void trace_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } EXPORT_SYMBOL(trace_hardirqs_off); void trace_hardirqs_on_caller(unsigned long caller_addr) { + trace_preemptirqsoff_hist(IRQS_ON, 0); if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } @@ -494,6 +502,7 @@ void trace_hardirqs_off_caller(unsigned { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); + trace_preemptirqsoff_hist(IRQS_OFF, 1); } EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -503,13 +512,15 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller) #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, unsigned long a1) { - if (preempt_trace()) + trace_preemptirqsoff_hist(PREEMPT_ON, 0); + if (preempt_trace() && !irq_trace()) stop_critical_timing(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - if (preempt_trace()) + trace_preemptirqsoff_hist(PREEMPT_ON, 1); + if (preempt_trace() && !irq_trace()) start_critical_timing(a0, a1); } #endif /* CONFIG_PREEMPT_TRACER */ Index: linux-3.0/include/linux/ratelimit.h =================================================================== --- linux-3.0.orig/include/linux/ratelimit.h +++ linux-3.0/include/linux/ratelimit.h @@ -8,7 +8,7 @@ #define DEFAULT_RATELIMIT_BURST 10 struct 
ratelimit_state { - spinlock_t lock; /* protect the state */ + raw_spinlock_t lock; /* protect the state */ int interval; int burst; @@ -20,7 +20,7 @@ struct ratelimit_state { #define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) \ \ struct ratelimit_state name = { \ - .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ .interval = interval_init, \ .burst = burst_init, \ } @@ -28,7 +28,7 @@ struct ratelimit_state { static inline void ratelimit_state_init(struct ratelimit_state *rs, int interval, int burst) { - spin_lock_init(&rs->lock); + raw_spin_lock_init(&rs->lock); rs->interval = interval; rs->burst = burst; rs->printed = 0; Index: linux-3.0/kernel/printk.c =================================================================== --- linux-3.0.orig/kernel/printk.c +++ linux-3.0/kernel/printk.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -44,13 +45,6 @@ #include -/* - * Architectures can override it: - */ -void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) -{ -} - #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) /* printk's without a loglevel use this.. */ @@ -100,7 +94,7 @@ static int console_locked, console_suspe * It is also used in interesting ways to provide interlocking in * console_unlock();. */ -static DEFINE_SPINLOCK(logbuf_lock); +static DEFINE_RAW_SPINLOCK(logbuf_lock); #define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) @@ -212,7 +206,7 @@ void __init setup_log_buf(int early) return; } - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; @@ -230,7 +224,7 @@ void __init setup_log_buf(int early) log_start -= offset; con_start -= offset; log_end -= offset; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); pr_info("log_buf_len: %d\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", @@ -363,18 +357,18 @@ int do_syslog(int type, char __user *buf if (error) goto out; i = 0; - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); while (!error && (log_start != log_end) && i < len) { c = LOG_BUF(log_start); log_start++; - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); error = __put_user(c,buf); buf++; i++; cond_resched(); - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (!error) error = i; break; @@ -397,7 +391,7 @@ int do_syslog(int type, char __user *buf count = len; if (count > log_buf_len) count = log_buf_len; - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); if (count > logged_chars) count = logged_chars; if (do_clear) @@ -414,12 +408,12 @@ int do_syslog(int type, char __user *buf if (j + log_buf_len < log_end) break; c = LOG_BUF(j); - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); error = __put_user(c,&buf[count-1-i]); cond_resched(); - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (error) break; error = i; @@ -509,6 +503,7 @@ static void __call_console_drivers(unsig { struct console *con; + migrate_disable(); for_each_console(con) { if (exclusive_console && con != exclusive_console) continue; @@ -517,8 +512,62 @@ static void __call_console_drivers(unsig (con->flags & CON_ANYTIME))) con->write(con, &LOG_BUF(start), end - 
start); } + migrate_enable(); +} + +#ifdef CONFIG_EARLY_PRINTK +struct console *early_console; + +static void early_vprintk(const char *fmt, va_list ap) +{ + char buf[512]; + int n = vscnprintf(buf, sizeof(buf), fmt, ap); + if (early_console) + early_console->write(early_console, buf, n); +} + +asmlinkage void early_printk(const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + early_vprintk(fmt, ap); + va_end(ap); } +/* + * This is independent of any log levels - a global + * kill switch that turns off all of printk. + * + * Used by the NMI watchdog if early-printk is enabled. + */ +static int __read_mostly printk_killswitch; + +static int __init force_early_printk_setup(char *str) +{ + printk_killswitch = 1; + return 0; +} +early_param("force_early_printk", force_early_printk_setup); + +void printk_kill(void) +{ + printk_killswitch = 1; +} + +static int forced_early_printk(const char *fmt, va_list ap) +{ + if (!printk_killswitch) + return 0; + early_vprintk(fmt, ap); + return 1; +} +#else +static inline int forced_early_printk(const char *fmt, va_list ap) +{ + return 0; +} +#endif + static int __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) @@ -687,7 +736,7 @@ static void zap_locks(void) oops_timestamp = jiffies; /* If a crash is occurring, make sure we can't deadlock */ - spin_lock_init(&logbuf_lock); + raw_spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ sema_init(&console_sem, 1); } @@ -779,12 +828,18 @@ static inline int can_use_console(unsign * interrupts disabled. It should return with 'lockbuf_lock' * released but interrupts still disabled. */ -static int console_trylock_for_printk(unsigned int cpu) +static int console_trylock_for_printk(unsigned int cpu, unsigned long flags) __releases(&logbuf_lock) { +#ifdef CONFIG_PREEMPT_RT_FULL + int lock = (!early_boot_irqs_disabled && !irqs_disabled_flags(flags) && + !preempt_count()) || sysrq_in_progress; +#else + int lock = 1; +#endif int retval = 0; - if (console_trylock()) { + if (lock && console_trylock()) { retval = 1; /* @@ -800,7 +855,7 @@ static int console_trylock_for_printk(un } } printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); + raw_spin_unlock(&logbuf_lock); return retval; } static const char recursion_bug_msg [] = @@ -833,6 +888,13 @@ asmlinkage int vprintk(const char *fmt, size_t plen; char special; + /* + * Fall back to early_printk if a debugging subsystem has + * killed printk output + */ + if (unlikely(forced_early_printk(fmt, args))) + return 1; + boot_delay_msec(); printk_delay(); @@ -860,7 +922,7 @@ asmlinkage int vprintk(const char *fmt, } lockdep_off(); - spin_lock(&logbuf_lock); + raw_spin_lock(&logbuf_lock); printk_cpu = this_cpu; if (recursion_bug) { @@ -953,8 +1015,15 @@ asmlinkage int vprintk(const char *fmt, * will release 'logbuf_lock' regardless of whether it * actually gets the semaphore or not. 
*/ - if (console_trylock_for_printk(this_cpu)) + if (console_trylock_for_printk(this_cpu, flags)) { +#ifndef CONFIG_PREEMPT_RT_FULL console_unlock(); +#else + raw_local_irq_restore(flags); + console_unlock(); + raw_local_irq_save(flags); +#endif + } lockdep_on(); out_restore_irqs: @@ -1213,8 +1282,8 @@ void printk_tick(void) int printk_needs_cpu(int cpu) { - if (cpu_is_offline(cpu)) - printk_tick(); + if (unlikely(cpu_is_offline(cpu))) + __this_cpu_write(printk_pending, 0); return __this_cpu_read(printk_pending); } @@ -1252,18 +1321,23 @@ void console_unlock(void) console_may_schedule = 0; for ( ; ; ) { - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); wake_klogd |= log_start - log_end; if (con_start == log_end) break; /* Nothing to print */ _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ - spin_unlock(&logbuf_lock); +#ifndef CONFIG_PREEMPT_RT_FULL + raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ call_console_drivers(_con_start, _log_end); start_critical_timings(); local_irq_restore(flags); +#else + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + call_console_drivers(_con_start, _log_end); +#endif } console_locked = 0; @@ -1272,7 +1346,7 @@ void console_unlock(void) exclusive_console = NULL; up(&console_sem); - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); if (wake_klogd) wake_up_klogd(); } @@ -1502,9 +1576,9 @@ void register_console(struct console *ne * console_unlock(); will print out the buffered messages * for us. */ - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); con_start = log_start; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); /* * We're about to replay the log buffer. Only do this to the * just-registered console to avoid excessive message spam to @@ -1711,10 +1785,10 @@ void kmsg_dump(enum kmsg_dump_reason rea /* Theoretically, the log could move on after we do this, but there's not a lot we can do about that. The new messages will overwrite the start of what we dump. 
*/ - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); end = log_end & LOG_BUF_MASK; chars = logged_chars; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); if (chars > end) { s1 = log_buf + log_buf_len - chars + end; Index: linux-3.0/lib/ratelimit.c =================================================================== --- linux-3.0.orig/lib/ratelimit.c +++ linux-3.0/lib/ratelimit.c @@ -39,7 +39,7 @@ int ___ratelimit(struct ratelimit_state * in addition to the one that will be printed by * the entity that is holding the lock already: */ - if (!spin_trylock_irqsave(&rs->lock, flags)) + if (!raw_spin_trylock_irqsave(&rs->lock, flags)) return 0; if (!rs->begin) @@ -60,7 +60,7 @@ int ___ratelimit(struct ratelimit_state rs->missed++; ret = 0; } - spin_unlock_irqrestore(&rs->lock, flags); + raw_spin_unlock_irqrestore(&rs->lock, flags); return ret; } Index: linux-3.0/include/linux/init_task.h =================================================================== --- linux-3.0.orig/include/linux/init_task.h +++ linux-3.0/include/linux/init_task.h @@ -42,7 +42,7 @@ extern struct fs_struct init_fs; .cputimer = { \ .cputime = INIT_CPUTIME, \ .running = 0, \ - .lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ }, \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ @@ -126,6 +126,14 @@ extern struct cred init_cred; # define INIT_PERF_EVENTS(tsk) #endif +#ifdef CONFIG_PREEMPT_RT_BASE +# define INIT_TIMER_LIST .posix_timer_list = NULL, +#else +# define INIT_TIMER_LIST +#endif + +#define INIT_TASK_COMM "swapper" + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -162,7 +170,7 @@ extern struct cred init_cred; .group_leader = &tsk, \ RCU_INIT_POINTER(.real_cred, &init_cred), \ RCU_INIT_POINTER(.cred, &init_cred), \ - .comm = "swapper", \ + .comm = INIT_TASK_COMM, \ .thread = INIT_THREAD, \ .fs = &init_fs, \ .files = &init_files, \ @@ -179,6 +187,7 @@ extern struct cred init_cred; .fs_excl = ATOMIC_INIT(0), \ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .timer_slack_ns = 50000, /* 50 usec default slack */ \ + INIT_TIMER_LIST \ .pids = { \ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ Index: linux-3.0/include/linux/sched.h =================================================================== --- linux-3.0.orig/include/linux/sched.h +++ linux-3.0/include/linux/sched.h @@ -63,6 +63,7 @@ struct sched_param { #include #include +#include #include #include #include @@ -90,6 +91,7 @@ struct sched_param { #include #include #include +#include #include @@ -359,6 +361,7 @@ extern signed long schedule_timeout_inte extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); +extern void schedule_preempt_disabled(void); extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; @@ -510,7 +513,7 @@ struct task_cputime { struct thread_group_cputimer { struct task_cputime cputime; int running; - spinlock_t lock; + raw_spinlock_t lock; }; #include @@ -1070,6 +1073,7 @@ struct sched_domain; #define WF_SYNC 0x01 /* waker goes to sleep after wakup */ #define WF_FORK 0x02 /* child wakeup after fork */ #define WF_MIGRATED 0x04 /* internal use, task got migrated */ +#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock 
"sleeper" */ #define ENQUEUE_WAKEUP 1 #define ENQUEUE_HEAD 2 @@ -1219,6 +1223,7 @@ enum perf_event_task_context { struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ + volatile long saved_state; /* saved state for "spinlock sleepers" */ void *stack; atomic_t usage; unsigned int flags; /* per process flags, defined below */ @@ -1255,14 +1260,17 @@ struct task_struct { #endif unsigned int policy; +#ifdef CONFIG_PREEMPT_RT_FULL + int migrate_disable; +#ifdef CONFIG_SCHED_DEBUG + int migrate_disable_atomic; +#endif +#endif cpumask_t cpus_allowed; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; char rcu_read_unlock_special; -#if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU) - int rcu_boosted; -#endif /* #if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU) */ struct list_head rcu_node_entry; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TREE_PREEMPT_RCU @@ -1356,6 +1364,9 @@ struct task_struct { struct task_cputime cputime_expires; struct list_head cpu_timers[3]; +#ifdef CONFIG_PREEMPT_RT_BASE + struct task_struct *posix_timer_list; +#endif /* process credentials */ const struct cred __rcu *real_cred; /* objective and real subjective task @@ -1389,6 +1400,7 @@ struct task_struct { /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; + struct sigqueue *sigqueue_cache; sigset_t blocked, real_blocked; sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ @@ -1432,6 +1444,9 @@ struct task_struct { /* mutex deadlock detection */ struct mutex_waiter *blocked_on; #endif +#ifdef CONFIG_PREEMPT_RT_FULL + int pagefault_disabled; +#endif #ifdef CONFIG_TRACE_IRQFLAGS unsigned int irq_events; unsigned long hardirq_enable_ip; @@ -1558,6 +1573,12 @@ struct task_struct { unsigned long trace; /* bitmask and counter of trace recursion */ unsigned long trace_recursion; +#ifdef CONFIG_WAKEUP_LATENCY_HIST + u64 preempt_timestamp_hist; +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST + unsigned long timer_offset; +#endif +#endif #endif /* CONFIG_TRACING */ #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */ struct memcg_batch_info { @@ -1570,10 +1591,26 @@ struct task_struct { #ifdef CONFIG_HAVE_HW_BREAKPOINT atomic_t ptrace_bp_refcnt; #endif +#ifdef CONFIG_PREEMPT_RT_BASE + struct rcu_head put_rcu; + int softirq_nestcnt; +#endif +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM + int kmap_idx; + pte_t kmap_pte[KM_TYPE_NR]; +#endif }; -/* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +#ifdef CONFIG_PREEMPT_RT_FULL +static inline bool cur_pf_disabled(void) { return current->pagefault_disabled; } +#else +static inline bool cur_pf_disabled(void) { return false; } +#endif + +static inline bool pagefault_disabled(void) +{ + return in_atomic() || cur_pf_disabled(); +} /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT @@ -1743,6 +1780,15 @@ extern struct pid *cad_pid; extern void free_task(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +#ifdef CONFIG_PREEMPT_RT_BASE +extern void __put_task_struct_cb(struct rcu_head *rhp); + +static inline void put_task_struct(struct task_struct *t) +{ + if (atomic_dec_and_test(&t->usage)) + call_rcu(&t->put_rcu, __put_task_struct_cb); +} +#else extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) @@ -1750,6 +1796,7 @@ static inline void put_task_struct(struc if (atomic_dec_and_test(&t->usage)) __put_task_struct(t); } +#endif extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); @@ -1774,6 +1821,7 @@ extern void thread_group_times(struct ta #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ #define PF_KSWAPD 0x00040000 /* I am kswapd */ +#define PF_STOMPER 0x00080000 /* I am a stomp machine thread */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ @@ -2021,15 +2069,27 @@ static inline void sched_autogroup_exit( #endif #ifdef CONFIG_RT_MUTEXES +extern void task_setprio(struct task_struct *p, int prio); extern int rt_mutex_getprio(struct task_struct *p); -extern void rt_mutex_setprio(struct task_struct *p, int prio); +static inline void rt_mutex_setprio(struct task_struct *p, int prio) +{ + task_setprio(p, prio); +} extern void rt_mutex_adjust_pi(struct task_struct *p); +static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +{ + return tsk->pi_blocked_on != NULL; +} #else static inline int rt_mutex_getprio(struct task_struct *p) { return p->normal_prio; } # define rt_mutex_adjust_pi(p) do { } while (0) +static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +{ + return false; +} #endif extern bool yield_to(struct task_struct *p, bool preempt); @@ -2109,6 +2169,7 @@ extern void xtime_update(unsigned long t extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); +extern int wake_up_lock_sleeper(struct task_struct * tsk); extern void wake_up_new_task(struct task_struct *tsk); #ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); @@ -2198,12 +2259,24 @@ extern struct mm_struct * mm_alloc(void) /* mmdrop drops the mm and the page tables */ extern void __mmdrop(struct mm_struct *); + static inline void mmdrop(struct mm_struct * mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } +#ifdef CONFIG_PREEMPT_RT_BASE +extern void __mmdrop_delayed(struct rcu_head *rhp); +static inline void mmdrop_delayed(struct mm_struct *mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + call_rcu(&mm->delayed_drop, __mmdrop_delayed); +} +#else +# define mmdrop_delayed(mm) mmdrop(mm) +#endif + /* mmput gets rid of the mappings and all user-space */ 
extern void mmput(struct mm_struct *); /* Grab a reference to a task's mm, if it is not already going away */ @@ -2509,7 +2582,7 @@ extern int _cond_resched(void); extern int __cond_resched_lock(spinlock_t *lock); -#ifdef CONFIG_PREEMPT +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RT_FULL) #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET #else #define PREEMPT_LOCK_OFFSET 0 @@ -2520,12 +2593,16 @@ extern int __cond_resched_lock(spinlock_ __cond_resched_lock(lock); \ }) +#ifndef CONFIG_PREEMPT_RT_FULL extern int __cond_resched_softirq(void); #define cond_resched_softirq() ({ \ __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ __cond_resched_softirq(); \ }) +#else +# define cond_resched_softirq() cond_resched() +#endif /* * Does a critical section need to be broken due to another @@ -2549,7 +2626,7 @@ void thread_group_cputimer(struct task_s static inline void thread_group_cputime_init(struct signal_struct *sig) { - spin_lock_init(&sig->cputimer.lock); + raw_spin_lock_init(&sig->cputimer.lock); } /* @@ -2588,6 +2665,26 @@ static inline void set_task_cpu(struct t #endif /* CONFIG_SMP */ +static inline int __migrate_disabled(struct task_struct *p) +{ +#ifdef CONFIG_PREEMPT_RT_FULL + return p->migrate_disable; +#else + return 0; +#endif +} + +/* Future-safe accessor for struct task_struct's cpus_allowed. */ +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p) +{ +#ifdef CONFIG_PREEMPT_RT_FULL + if (p->migrate_disable) + return cpumask_of(task_cpu(p)); +#endif + + return &p->cpus_allowed; +} + extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); Index: linux-3.0/kernel/posix-cpu-timers.c =================================================================== --- linux-3.0.orig/kernel/posix-cpu-timers.c +++ linux-3.0/kernel/posix-cpu-timers.c @@ -282,13 +282,13 @@ void thread_group_cputimer(struct task_s * it. */ thread_group_cputime(tsk, &sum); - spin_lock_irqsave(&cputimer->lock, flags); + raw_spin_lock_irqsave(&cputimer->lock, flags); cputimer->running = 1; update_gt_cputime(&cputimer->cputime, &sum); } else - spin_lock_irqsave(&cputimer->lock, flags); + raw_spin_lock_irqsave(&cputimer->lock, flags); *times = cputimer->cputime; - spin_unlock_irqrestore(&cputimer->lock, flags); + raw_spin_unlock_irqrestore(&cputimer->lock, flags); } /* @@ -701,7 +701,7 @@ static int posix_cpu_timer_set(struct k_ /* * Disarm any old timer after extracting its expiry time. */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); ret = 0; old_incr = timer->it.cpu.incr; @@ -999,9 +999,9 @@ static void stop_process_timers(struct s struct thread_group_cputimer *cputimer = &sig->cputimer; unsigned long flags; - spin_lock_irqsave(&cputimer->lock, flags); + raw_spin_lock_irqsave(&cputimer->lock, flags); cputimer->running = 0; - spin_unlock_irqrestore(&cputimer->lock, flags); + raw_spin_unlock_irqrestore(&cputimer->lock, flags); } static u32 onecputick; @@ -1223,7 +1223,7 @@ void posix_cpu_timer_schedule(struct k_i /* * Now re-arm for the new expiry time. 
*/ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); arm_timer(timer); spin_unlock(&p->sighand->siglock); @@ -1290,10 +1290,11 @@ static inline int fastpath_timer_check(s sig = tsk->signal; if (sig->cputimer.running) { struct task_cputime group_sample; + unsigned long flags; - spin_lock(&sig->cputimer.lock); + raw_spin_lock_irqsave(&sig->cputimer.lock, flags); group_sample = sig->cputimer.cputime; - spin_unlock(&sig->cputimer.lock); + raw_spin_unlock_irqrestore(&sig->cputimer.lock, flags); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; @@ -1307,13 +1308,13 @@ static inline int fastpath_timer_check(s * already updated our counts. We need to check if any timers fire now. * Interrupts are disabled. */ -void run_posix_cpu_timers(struct task_struct *tsk) +static void __run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; unsigned long flags; - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); /* * The fast path checks that there are no expired thread or thread @@ -1371,6 +1372,190 @@ void run_posix_cpu_timers(struct task_st } } +#ifdef CONFIG_PREEMPT_RT_BASE +#include +#include +DEFINE_PER_CPU(struct task_struct *, posix_timer_task); +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist); + +static int posix_cpu_timers_thread(void *data) +{ + int cpu = (long)data; + + BUG_ON(per_cpu(posix_timer_task,cpu) != current); + + while (!kthread_should_stop()) { + struct task_struct *tsk = NULL; + struct task_struct *next = NULL; + + if (cpu_is_offline(cpu)) + goto wait_to_die; + + /* grab task list */ + raw_local_irq_disable(); + tsk = per_cpu(posix_timer_tasklist, cpu); + per_cpu(posix_timer_tasklist, cpu) = NULL; + raw_local_irq_enable(); + + /* its possible the list is empty, just return */ + if (!tsk) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + continue; + } + + /* Process task list */ + while (1) { + /* save next */ + next = tsk->posix_timer_list; + + /* run the task timers, clear its ptr and + * unreference it + */ + __run_posix_cpu_timers(tsk); + tsk->posix_timer_list = NULL; + put_task_struct(tsk); + + /* check if this is the last on the list */ + if (next == tsk) + break; + tsk = next; + } + } + return 0; + +wait_to_die: + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static inline int __fastpath_timer_check(struct task_struct *tsk) +{ + /* tsk == current, ensure it is safe to use ->signal/sighand */ + if (unlikely(tsk->exit_state)) + return 0; + + if (!task_cputime_zero(&tsk->cputime_expires)) + return 1; + + if (!task_cputime_zero(&tsk->signal->cputime_expires)) + return 1; + + return 0; +} + +void run_posix_cpu_timers(struct task_struct *tsk) +{ + unsigned long cpu = smp_processor_id(); + struct task_struct *tasklist; + + BUG_ON(!irqs_disabled()); + if(!per_cpu(posix_timer_task, cpu)) + return; + /* get per-cpu references */ + tasklist = per_cpu(posix_timer_tasklist, cpu); + + /* check to see if we're already queued */ + if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) { + get_task_struct(tsk); + if (tasklist) { + tsk->posix_timer_list = tasklist; + } else { + /* + * The list is terminated by a self-pointing + * task_struct + */ + tsk->posix_timer_list = tsk; + } + per_cpu(posix_timer_tasklist, cpu) = tsk; + + wake_up_process(per_cpu(posix_timer_task, cpu)); + } 
+} + +/* + * posix_cpu_thread_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +static int posix_cpu_thread_call(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + struct task_struct *p; + struct sched_param param; + + switch (action) { + case CPU_UP_PREPARE: + p = kthread_create(posix_cpu_timers_thread, hcpu, + "posixcputmr/%d",cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + p->flags |= PF_NOFREEZE; + kthread_bind(p, cpu); + /* Must be high prio to avoid getting starved */ + param.sched_priority = MAX_RT_PRIO-1; + sched_setscheduler(p, SCHED_FIFO, &param); + per_cpu(posix_timer_task,cpu) = p; + break; + case CPU_ONLINE: + /* Strictly unnecessary, as first user will wake it. */ + wake_up_process(per_cpu(posix_timer_task,cpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind it from offline cpu so it can run. Fall thru. */ + kthread_bind(per_cpu(posix_timer_task,cpu), + any_online_cpu(cpu_online_map)); + kthread_stop(per_cpu(posix_timer_task,cpu)); + per_cpu(posix_timer_task,cpu) = NULL; + break; + case CPU_DEAD: + kthread_stop(per_cpu(posix_timer_task,cpu)); + per_cpu(posix_timer_task,cpu) = NULL; + break; +#endif + } + return NOTIFY_OK; +} + +/* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else. + */ +static struct notifier_block __devinitdata posix_cpu_thread_notifier = { + .notifier_call = posix_cpu_thread_call, + .priority = 10 +}; + +static int __init posix_cpu_thread_init(void) +{ + void *hcpu = (void *)(long)smp_processor_id(); + /* Start one for boot CPU. */ + unsigned long cpu; + + /* init the per-cpu posix_timer_tasklets */ + for_each_cpu_mask(cpu, cpu_possible_map) + per_cpu(posix_timer_tasklist, cpu) = NULL; + + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu); + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu); + register_cpu_notifier(&posix_cpu_thread_notifier); + return 0; +} +early_initcall(posix_cpu_thread_init); +#else /* CONFIG_PREEMPT_RT_BASE */ +void run_posix_cpu_timers(struct task_struct *tsk) +{ + __run_posix_cpu_timers(tsk); +} +#endif /* CONFIG_PREEMPT_RT_BASE */ + /* * Set one of the process-wide special case CPU timers or RLIMIT_CPU. * The tsk->sighand->siglock must be held by the caller.
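/*
 * [Editor's illustration -- a sketch, not part of the patch above.] The
 * per-CPU posix_timer_tasklist built by run_posix_cpu_timers() is a singly
 * linked list whose tail element points to itself: NULL still means "not
 * queued", so no extra flag is needed, and the worker can detect the end
 * without a NULL terminator. Reduced to a hypothetical node type (the real
 * code works on per-CPU storage with interrupts disabled), the enqueue and
 * drain idiom looks like this:
 */
#include <stddef.h>

struct node {
	struct node *next;	/* NULL = not queued, self-pointer = tail */
};

static struct node *pcpu_list;	/* stand-in for per_cpu(posix_timer_tasklist) */

static void enqueue(struct node *n)
{
	if (n->next)				/* already queued */
		return;
	n->next = pcpu_list ? pcpu_list : n;	/* first entry terminates on itself */
	pcpu_list = n;
}

static void drain(void (*process)(struct node *n))
{
	struct node *n = pcpu_list;

	pcpu_list = NULL;			/* detach the whole list */
	while (n) {
		struct node *next = n->next;

		n->next = NULL;			/* mark "not queued" again */
		process(n);
		if (next == n)			/* the tail pointed to itself */
			break;
		n = next;
	}
}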
Index: linux-3.0/kernel/sched_stats.h =================================================================== --- linux-3.0.orig/kernel/sched_stats.h +++ linux-3.0/kernel/sched_stats.h @@ -282,10 +282,10 @@ static inline void account_group_user_ti if (!cputimer->running) return; - spin_lock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); cputimer->cputime.utime = cputime_add(cputimer->cputime.utime, cputime); - spin_unlock(&cputimer->lock); + raw_spin_unlock(&cputimer->lock); } /** @@ -306,10 +306,10 @@ static inline void account_group_system_ if (!cputimer->running) return; - spin_lock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); cputimer->cputime.stime = cputime_add(cputimer->cputime.stime, cputime); - spin_unlock(&cputimer->lock); + raw_spin_unlock(&cputimer->lock); } /** @@ -330,7 +330,7 @@ static inline void account_group_exec_ru if (!cputimer->running) return; - spin_lock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); cputimer->cputime.sum_exec_runtime += ns; - spin_unlock(&cputimer->lock); + raw_spin_unlock(&cputimer->lock); } Index: linux-3.0/include/linux/semaphore.h =================================================================== --- linux-3.0.orig/include/linux/semaphore.h +++ linux-3.0/include/linux/semaphore.h @@ -14,14 +14,14 @@ /* Please don't access any members of this structure directly */ struct semaphore { - spinlock_t lock; + raw_spinlock_t lock; unsigned int count; struct list_head wait_list; }; #define __SEMAPHORE_INITIALIZER(name, n) \ { \ - .lock = __SPIN_LOCK_UNLOCKED((name).lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED((name).lock), \ .count = n, \ .wait_list = LIST_HEAD_INIT((name).wait_list), \ } Index: linux-3.0/kernel/semaphore.c =================================================================== --- linux-3.0.orig/kernel/semaphore.c +++ linux-3.0/kernel/semaphore.c @@ -54,12 +54,12 @@ void down(struct semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else __down(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); } EXPORT_SYMBOL(down); @@ -77,12 +77,12 @@ int down_interruptible(struct semaphore unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_interruptible(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem) unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_killable(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem) unsigned long flags; int count; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); count = sem->count - 1; if (likely(count >= 0)) sem->count = count; - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return (count < 0); } @@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_timeout(sem, 
jiffies); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -179,12 +179,12 @@ void up(struct semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(list_empty(&sem->wait_list))) sem->count++; else __up(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); } EXPORT_SYMBOL(up); @@ -217,9 +217,9 @@ static inline int __sched __down_common( if (timeout <= 0) goto timed_out; __set_task_state(task, state); - spin_unlock_irq(&sem->lock); + raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); - spin_lock_irq(&sem->lock); + raw_spin_lock_irq(&sem->lock); if (waiter.up) return 0; } Index: linux-3.0/include/linux/rwsem-spinlock.h =================================================================== --- linux-3.0.orig/include/linux/rwsem-spinlock.h +++ linux-3.0/include/linux/rwsem-spinlock.h @@ -20,26 +20,42 @@ * - if activity is -1 then there is one active writer * - if wait_list is not empty, then there are processes waiting for the semaphore */ +struct rw_anon_semaphore { + __s32 activity; + raw_spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#ifndef CONFIG_PREEMPT_RT_FULL +/* + * Non preempt-rt implementation of rw_semaphore. Same as above, but + * restricted vs. ownership. i.e. ownerless locked state and non owner + * release not allowed. + */ struct rw_semaphore { __s32 activity; - spinlock_t wait_lock; + raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; +#endif /* PREEMPT_RT_FULL */ #define RWSEM_UNLOCKED_VALUE 0x00000000 -extern void __down_read(struct rw_semaphore *sem); -extern int __down_read_trylock(struct rw_semaphore *sem); -extern void __down_write(struct rw_semaphore *sem); -extern void __down_write_nested(struct rw_semaphore *sem, int subclass); -extern int __down_write_trylock(struct rw_semaphore *sem); -extern void __up_read(struct rw_semaphore *sem); -extern void __up_write(struct rw_semaphore *sem); -extern void __downgrade_write(struct rw_semaphore *sem); -extern int rwsem_is_locked(struct rw_semaphore *sem); +extern void __down_read(struct rw_anon_semaphore *sem); +extern int __down_read_trylock(struct rw_anon_semaphore *sem); +extern void __down_write(struct rw_anon_semaphore *sem); +extern void __down_write_nested(struct rw_anon_semaphore *sem, int subclass); +extern int __down_write_trylock(struct rw_anon_semaphore *sem); +extern void __up_read(struct rw_anon_semaphore *sem); +extern void __up_write(struct rw_anon_semaphore *sem); +extern void __downgrade_write(struct rw_anon_semaphore *sem); +extern int anon_rwsem_is_locked(struct rw_anon_semaphore *sem); #endif /* __KERNEL__ */ #endif /* _LINUX_RWSEM_SPINLOCK_H */ Index: linux-3.0/include/linux/rwsem.h =================================================================== --- linux-3.0.orig/include/linux/rwsem.h +++ linux-3.0/include/linux/rwsem.h @@ -17,37 +17,50 @@ #include #include +struct rw_anon_semaphore; struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK #include /* use a generic implementation */ -#else +#else /* RWSEM_GENERIC_SPINLOCK */ + /* All arch specific implementations share the same struct */ -struct rw_semaphore { +struct rw_anon_semaphore { long count; - spinlock_t wait_lock; + raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef 
CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); +extern struct rw_anon_semaphore *rwsem_down_read_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore *rwsem_down_write_failed(struct rw_anon_semaphore *sem); +extern struct rw_anon_semaphore *rwsem_wake(struct rw_anon_semaphore *); +extern struct rw_anon_semaphore *rwsem_downgrade_wake(struct rw_anon_semaphore *sem); /* Include the arch specific part */ #include /* In all implementations count != 0 means locked */ -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) { return sem->count != 0; } +#ifndef CONFIG_PREEMPT_RT_FULL +struct rw_semaphore { + long count; + raw_spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; #endif +#endif /* !RWSEM_GENERIC_SPINLOCK */ + /* Common initializer macros and functions */ #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -56,57 +69,59 @@ static inline int rwsem_is_locked(struct # define __RWSEM_DEP_MAP_INIT(lockname) #endif -#define __RWSEM_INITIALIZER(name) \ - { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED(name.wait_lock), \ - LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } +#define __RWSEM_ANON_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, \ + __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + LIST_HEAD_INIT((name).wait_list) \ + __RWSEM_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define DECLARE_ANON_RWSEM(name) \ + struct rw_anon_semaphore name = __RWSEM_INITIALIZER(name) -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); +extern void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key); -#define init_rwsem(sem) \ +#define init_anon_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __init_anon_rwsem((sem), #sem, &__key); \ } while (0) /* * lock for reading */ -extern void down_read(struct rw_semaphore *sem); +extern void anon_down_read(struct rw_anon_semaphore *sem); /* * trylock for reading -- returns 1 if successful, 0 if contention */ -extern int down_read_trylock(struct rw_semaphore *sem); +extern int anon_down_read_trylock(struct rw_anon_semaphore *sem); /* * lock for writing */ -extern void down_write(struct rw_semaphore *sem); +extern void anon_down_write(struct rw_anon_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention */ -extern int down_write_trylock(struct rw_semaphore *sem); +extern int anon_down_write_trylock(struct rw_anon_semaphore *sem); /* * release a read lock */ -extern void up_read(struct rw_semaphore *sem); +extern void anon_up_read(struct rw_anon_semaphore *sem); /* * release a write lock */ -extern void up_write(struct rw_semaphore *sem); +extern void anon_up_write(struct rw_anon_semaphore *sem); /* * downgrade write lock to read lock */ -extern void downgrade_write(struct rw_semaphore *sem); +extern void anon_downgrade_write(struct rw_anon_semaphore *sem); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -122,21 +137,101 @@ extern void downgrade_write(struct rw_se * 
lockdep_set_class() at lock initialization time. * See Documentation/lockdep-design.txt for more details.) */ -extern void down_read_nested(struct rw_semaphore *sem, int subclass); -extern void down_write_nested(struct rw_semaphore *sem, int subclass); +extern void anon_down_read_nested(struct rw_anon_semaphore *sem, int subclass); +extern void anon_down_write_nested(struct rw_anon_semaphore *sem, int subclass); /* * Take/release a lock when not the owner will release it. * * [ This API should be avoided as much as possible - the * proper abstraction for this case is completions. ] */ -extern void down_read_non_owner(struct rw_semaphore *sem); -extern void up_read_non_owner(struct rw_semaphore *sem); +extern void anon_down_read_non_owner(struct rw_anon_semaphore *sem); +extern void anon_up_read_non_owner(struct rw_anon_semaphore *sem); #else -# define down_read_nested(sem, subclass) down_read(sem) -# define down_write_nested(sem, subclass) down_write(sem) -# define down_read_non_owner(sem) down_read(sem) -# define up_read_non_owner(sem) up_read(sem) +# define anon_down_read_nested(sem, subclass) anon_down_read(sem) +# define anon_down_write_nested(sem, subclass) anon_down_write(sem) +# define anon_down_read_non_owner(sem) anon_down_read(sem) +# define anon_up_read_non_owner(sem) anon_up_read(sem) #endif +#ifdef CONFIG_PREEMPT_RT_FULL +#include +#else /* PREEMPT_RT_FULL */ +/* + * Non preempt-rt implementations + */ +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, \ + __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + LIST_HEAD_INIT((name).wait_list) \ + __RWSEM_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +static inline void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + __init_anon_rwsem((struct rw_anon_semaphore *)sem, name, key); +} + +#define init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ +} while (0) + +static inline void down_read(struct rw_semaphore *sem) +{ + anon_down_read((struct rw_anon_semaphore *)sem); +} + +static inline int down_read_trylock(struct rw_semaphore *sem) +{ + return anon_down_read_trylock((struct rw_anon_semaphore *)sem); +} + +static inline void down_write(struct rw_semaphore *sem) +{ + anon_down_write((struct rw_anon_semaphore *)sem); +} + +static inline int down_write_trylock(struct rw_semaphore *sem) +{ + return anon_down_write_trylock((struct rw_anon_semaphore *)sem); +} + +static inline void up_read(struct rw_semaphore *sem) +{ + anon_up_read((struct rw_anon_semaphore *)sem); +} + +static inline void up_write(struct rw_semaphore *sem) +{ + anon_up_write((struct rw_anon_semaphore *)sem); +} + +static inline void downgrade_write(struct rw_semaphore *sem) +{ + anon_downgrade_write((struct rw_anon_semaphore *)sem); +} + +static inline void down_read_nested(struct rw_semaphore *sem, int subclass) +{ + return anon_down_read_nested((struct rw_anon_semaphore *)sem, subclass); +} + +static inline void down_write_nested(struct rw_semaphore *sem, int subclass) +{ + anon_down_write_nested((struct rw_anon_semaphore *)sem, subclass); +} + +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return anon_rwsem_is_locked((struct rw_anon_semaphore *)sem); +} +#endif /* !PREEMPT_RT_FULL */ + #endif /* _LINUX_RWSEM_H */ + Index: linux-3.0/lib/rwsem-spinlock.c =================================================================== --- linux-3.0.orig/lib/rwsem-spinlock.c +++ 
linux-3.0/lib/rwsem-spinlock.c @@ -17,24 +17,24 @@ struct rwsem_waiter { #define RWSEM_WAITING_FOR_WRITE 0x00000002 }; -int rwsem_is_locked(struct rw_semaphore *sem) +int anon_rwsem_is_locked(struct rw_anon_semaphore *sem) { int ret = 1; unsigned long flags; - if (spin_trylock_irqsave(&sem->wait_lock, flags)) { + if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { ret = (sem->activity != 0); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } return ret; } -EXPORT_SYMBOL(rwsem_is_locked); +EXPORT_SYMBOL(anon_rwsem_is_locked); /* * initialise the semaphore */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) +void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -44,10 +44,10 @@ void __init_rwsem(struct rw_semaphore *s lockdep_init_map(&sem->dep_map, name, key, 0); #endif sem->activity = 0; - spin_lock_init(&sem->wait_lock); + raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } -EXPORT_SYMBOL(__init_rwsem); +EXPORT_SYMBOL(__init_anon_rwsem); /* * handle the lock release when processes blocked on it that can now run @@ -58,8 +58,8 @@ EXPORT_SYMBOL(__init_rwsem); * - woken process blocks are discarded from the list after having task zeroed * - writers are only woken if wakewrite is non-zero */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) +static inline struct rw_anon_semaphore * +__rwsem_do_wake(struct rw_anon_semaphore *sem, int wakewrite) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -117,8 +117,8 @@ __rwsem_do_wake(struct rw_semaphore *sem /* * wake a single writer */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) +static inline struct rw_anon_semaphore * +__rwsem_wake_one_writer(struct rw_anon_semaphore *sem) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -139,18 +139,18 @@ __rwsem_wake_one_writer(struct rw_semaph /* * get a read lock on the semaphore */ -void __sched __down_read(struct rw_semaphore *sem) +void __sched __down_read(struct rw_anon_semaphore *sem) { struct rwsem_waiter waiter; struct task_struct *tsk; unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); if (sem->activity >= 0 && list_empty(&sem->wait_list)) { /* granted */ sem->activity++; - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); goto out; } @@ -165,7 +165,7 @@ void __sched __down_read(struct rw_semap list_add_tail(&waiter.list, &sem->wait_list); /* we don't need to touch the semaphore struct anymore */ - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); /* wait to be given the lock */ for (;;) { @@ -183,13 +183,13 @@ void __sched __down_read(struct rw_semap /* * trylock for reading -- returns 1 if successful, 0 if contention */ -int __down_read_trylock(struct rw_semaphore *sem) +int __down_read_trylock(struct rw_anon_semaphore *sem) { unsigned long flags; int ret = 0; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); if (sem->activity >= 0 && list_empty(&sem->wait_list)) { /* granted */ @@ -197,7 +197,7 @@ int __down_read_trylock(struct rw_semaph ret = 1; } - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return ret; } @@ -206,18 
+206,18 @@ int __down_read_trylock(struct rw_semaph * get a write lock on the semaphore * - we increment the waiting count anyway to indicate an exclusive lock */ -void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +void __sched __down_write_nested(struct rw_anon_semaphore *sem, int subclass) { struct rwsem_waiter waiter; struct task_struct *tsk; unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); if (sem->activity == 0 && list_empty(&sem->wait_list)) { /* granted */ sem->activity = -1; - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); goto out; } @@ -232,7 +232,7 @@ void __sched __down_write_nested(struct list_add_tail(&waiter.list, &sem->wait_list); /* we don't need to touch the semaphore struct anymore */ - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); /* wait to be given the lock */ for (;;) { @@ -247,7 +247,7 @@ void __sched __down_write_nested(struct ; } -void __sched __down_write(struct rw_semaphore *sem) +void __sched __down_write(struct rw_anon_semaphore *sem) { __down_write_nested(sem, 0); } @@ -255,12 +255,12 @@ void __sched __down_write(struct rw_sema /* * trylock for writing -- returns 1 if successful, 0 if contention */ -int __down_write_trylock(struct rw_semaphore *sem) +int __down_write_trylock(struct rw_anon_semaphore *sem) { unsigned long flags; int ret = 0; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); if (sem->activity == 0 && list_empty(&sem->wait_list)) { /* granted */ @@ -268,7 +268,7 @@ int __down_write_trylock(struct rw_semap ret = 1; } - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return ret; } @@ -276,48 +276,48 @@ int __down_write_trylock(struct rw_semap /* * release a read lock on the semaphore */ -void __up_read(struct rw_semaphore *sem) +void __up_read(struct rw_anon_semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); if (--sem->activity == 0 && !list_empty(&sem->wait_list)) sem = __rwsem_wake_one_writer(sem); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } /* * release a write lock on the semaphore */ -void __up_write(struct rw_semaphore *sem) +void __up_write(struct rw_anon_semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); sem->activity = 0; if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, 1); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } /* * downgrade a write lock into a read lock * - just wake up any readers at the front of the queue */ -void __downgrade_write(struct rw_semaphore *sem) +void __downgrade_write(struct rw_anon_semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); sem->activity = 1; if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, 0); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } Index: linux-3.0/lib/rwsem.c =================================================================== --- linux-3.0.orig/lib/rwsem.c +++ linux-3.0/lib/rwsem.c @@ -11,8 +11,8 @@ /* * Initialize an rwsem: */ -void __init_rwsem(struct 
rw_semaphore *sem, const char *name, - struct lock_class_key *key) +void __init_anon_rwsem(struct rw_anon_semaphore *sem, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -22,11 +22,11 @@ void __init_rwsem(struct rw_semaphore *s lockdep_init_map(&sem->dep_map, name, key, 0); #endif sem->count = RWSEM_UNLOCKED_VALUE; - spin_lock_init(&sem->wait_lock); + raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } -EXPORT_SYMBOL(__init_rwsem); +EXPORT_SYMBOL(__init_anon_rwsem); struct rwsem_waiter { struct list_head list; @@ -54,8 +54,8 @@ struct rwsem_waiter { * - woken process blocks are discarded from the list after having task zeroed * - writers are only woken if downgrading is false */ -static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wake_type) +static struct rw_anon_semaphore * +__rwsem_do_wake(struct rw_anon_semaphore *sem, int wake_type) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -169,8 +169,8 @@ __rwsem_do_wake(struct rw_semaphore *sem /* * wait for a lock to be granted */ -static struct rw_semaphore __sched * -rwsem_down_failed_common(struct rw_semaphore *sem, +static struct rw_anon_semaphore __sched * +rwsem_down_failed_common(struct rw_anon_semaphore *sem, unsigned int flags, signed long adjustment) { struct rwsem_waiter waiter; @@ -180,7 +180,7 @@ rwsem_down_failed_common(struct rw_semap set_task_state(tsk, TASK_UNINTERRUPTIBLE); /* set up my own style of waitqueue */ - spin_lock_irq(&sem->wait_lock); + raw_spin_lock_irq(&sem->wait_lock); waiter.task = tsk; waiter.flags = flags; get_task_struct(tsk); @@ -204,7 +204,7 @@ rwsem_down_failed_common(struct rw_semap adjustment == -RWSEM_ACTIVE_WRITE_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); - spin_unlock_irq(&sem->wait_lock); + raw_spin_unlock_irq(&sem->wait_lock); /* wait to be given the lock */ for (;;) { @@ -222,7 +222,8 @@ rwsem_down_failed_common(struct rw_semap /* * wait for the read lock to be granted */ -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) +struct rw_anon_semaphore __sched * +rwsem_down_read_failed(struct rw_anon_semaphore *sem) { return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ, -RWSEM_ACTIVE_READ_BIAS); @@ -231,7 +232,8 @@ struct rw_semaphore __sched *rwsem_down_ /* * wait for the write lock to be granted */ -struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) +struct rw_anon_semaphore __sched * +rwsem_down_write_failed(struct rw_anon_semaphore *sem) { return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE, -RWSEM_ACTIVE_WRITE_BIAS); @@ -241,17 +243,17 @@ struct rw_semaphore __sched *rwsem_down_ * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ -struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +struct rw_anon_semaphore *rwsem_wake(struct rw_anon_semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return sem; } @@ -261,17 +263,17 @@ struct rw_semaphore *rwsem_wake(struct r * - caller incremented waiting part of count and discovered it still negative * - just wake up any readers at the front of the queue */ -struct rw_semaphore *rwsem_downgrade_wake(struct 
rw_semaphore *sem) +struct rw_anon_semaphore *rwsem_downgrade_wake(struct rw_anon_semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->wait_lock, flags); + raw_spin_lock_irqsave(&sem->wait_lock, flags); /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); - spin_unlock_irqrestore(&sem->wait_lock, flags); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return sem; } Index: linux-3.0/kernel/time/timer_stats.c =================================================================== --- linux-3.0.orig/kernel/time/timer_stats.c +++ linux-3.0/kernel/time/timer_stats.c @@ -81,7 +81,7 @@ struct entry { /* * Spinlock protecting the tables - not taken during lookup: */ -static DEFINE_SPINLOCK(table_lock); +static DEFINE_RAW_SPINLOCK(table_lock); /* * Per-CPU lookup locks for fast hash lookup: @@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct prev = NULL; curr = *head; - spin_lock(&table_lock); + raw_spin_lock(&table_lock); /* * Make sure we have not raced with another CPU: */ @@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct *head = curr; } out_unlock: - spin_unlock(&table_lock); + raw_spin_unlock(&table_lock); return curr; } Index: linux-3.0/kernel/latencytop.c =================================================================== --- linux-3.0.orig/kernel/latencytop.c +++ linux-3.0/kernel/latencytop.c @@ -58,7 +58,7 @@ #include #include -static DEFINE_SPINLOCK(latency_lock); +static DEFINE_RAW_SPINLOCK(latency_lock); #define MAXLR 128 static struct latency_record latency_record[MAXLR]; @@ -72,19 +72,19 @@ void clear_all_latency_tracing(struct ta if (!latencytop_enabled) return; - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); memset(&p->latency_record, 0, sizeof(p->latency_record)); p->latency_record_count = 0; - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static void clear_global_latency_tracing(void) { unsigned long flags; - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); memset(&latency_record, 0, sizeof(latency_record)); - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static void __sched @@ -190,7 +190,7 @@ __account_scheduler_latency(struct task_ lat.max = usecs; store_stacktrace(tsk, &lat); - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); account_global_scheduler_latency(tsk, &lat); @@ -231,7 +231,7 @@ __account_scheduler_latency(struct task_ memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); out_unlock: - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static int lstats_show(struct seq_file *m, void *v) Index: linux-3.0/drivers/video/console/vgacon.c =================================================================== --- linux-3.0.orig/drivers/video/console/vgacon.c +++ linux-3.0/drivers/video/console/vgacon.c @@ -50,7 +50,7 @@ #include