From: Kirill Korotaev This patch fixes dcache race between shrink_dcache_XXX functions and dput(). Example race scenario: CPU 0 CPU 1 umount /dev/sda1 generic_shutdown_super shrink_dcache_memory() shrink_dcache_parent dput dentry select_parent prune_one_dentry() <<<< child is dead, locks are released, but parent is still referenced!!! >>>> skip dentry->parent, since it's d_count > 0 message: BUSY inodes after umount... <<< parent is left on dentry_unused list, referencing freed super block We faced these messages about busy inodes constantly after some stress testing with mount/umount operations parrallel with some other activity. This patch helped the problem. The patch was heavilly tested on 2.6.8 during 2 months, this forward-ported version boots and works ok as well. Signed-Off-By: Kirill Korotaev Signed-Off-By: Andrey Savochkin Signed-Off-By: Dmitry Mishin Signed-off-by: Andrew Morton --- fs/dcache.c | 237 +++++++++++++++++++++++++++++++++++++++++-------- fs/super.c | 4 include/linux/dcache.h | 3 include/linux/fs.h | 1 4 files changed, 205 insertions(+), 40 deletions(-) diff -puN fs/dcache.c~fix-of-dcache-race-leading-to-busy-inodes-on-umount fs/dcache.c --- 25/fs/dcache.c~fix-of-dcache-race-leading-to-busy-inodes-on-umount 2005-05-11 19:19:08.000000000 -0700 +++ 25-akpm/fs/dcache.c 2005-05-11 19:34:03.000000000 -0700 @@ -111,6 +111,78 @@ static inline void dentry_iput(struct de } } +struct dcache_shrinker { + struct list_head list; + struct dentry *dentry; +}; + +DECLARE_WAIT_QUEUE_HEAD(dcache_shrinker_wq); + +/* called under dcache_lock */ +static void dcache_shrinker_add(struct dcache_shrinker *ds, + struct dentry *parent, struct dentry *dentry) +{ + if (parent != dentry) { + struct super_block *sb; + + sb = parent->d_sb; + ds->dentry = parent; + list_add(&ds->list, &sb->s_dshrinkers); + } else + INIT_LIST_HEAD(&ds->list); +} + +/* called under dcache_lock */ +static void dcache_shrinker_del(struct dcache_shrinker *ds) +{ + if (ds == NULL || list_empty(&ds->list)) + return; + + list_del_init(&ds->list); + wake_up_all(&dcache_shrinker_wq); +} + +/* called under dcache_lock, drops inside */ +static void dcache_shrinker_wait(struct super_block *sb) +{ + DECLARE_WAITQUEUE(wq, current); + + __set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&dcache_shrinker_wq, &wq); + spin_unlock(&dcache_lock); + + schedule(); + remove_wait_queue(&dcache_shrinker_wq, &wq); + __set_current_state(TASK_RUNNING); +} + +void dcache_shrinker_wait_sb(struct super_block *sb) +{ + /* the root dentry can be held in dput_recursive */ + spin_lock(&dcache_lock); + while (!list_empty(&sb->s_dshrinkers)) { + dcache_shrinker_wait(sb); + spin_lock(&dcache_lock); + } + spin_unlock(&dcache_lock); +} + +/* dcache_lock protects shrinker's list */ +static void shrink_dcache_racecheck(struct dentry *parent, int *racecheck) +{ + struct super_block *sb; + struct dcache_shrinker *ds; + + sb = parent->d_sb; + list_for_each_entry(ds, &sb->s_dshrinkers, list) { + /* is one of dcache shrinkers working on the dentry? */ + if (ds->dentry == parent) { + *racecheck = 1; + break; + } + } +} + /* * This is dput * @@ -129,8 +201,9 @@ static inline void dentry_iput(struct de */ /* - * dput - release a dentry - * @dentry: dentry to release + * dput_recursive - go upward through the dentry tree and release dentries + * @dentry: starting dentry + * @ds: shrinker to be added to active list (see shrink_dcache_parent) * * Release a dentry. This will drop the usage count and if appropriate * call the dentry unlink method as well as removing it from the queues and @@ -140,17 +213,15 @@ static inline void dentry_iput(struct de * no dcache lock, please. */ -void dput(struct dentry *dentry) +static void dput_recursive(struct dentry *dentry, struct dcache_shrinker *ds) { - if (!dentry) - return; - -repeat: if (atomic_read(&dentry->d_count) == 1) might_sleep(); if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) return; + dcache_shrinker_del(ds); +repeat: spin_lock(&dentry->d_lock); if (atomic_read(&dentry->d_count)) { spin_unlock(&dentry->d_lock); @@ -182,6 +253,7 @@ unhash_it: kill_it: { struct dentry *parent; + struct dcache_shrinker lds; /* If dentry was on d_lru list * delete it from there @@ -191,18 +263,43 @@ kill_it: { dentry_stat.nr_unused--; } list_del(&dentry->d_child); + parent = dentry->d_parent; + dcache_shrinker_add(&lds, parent, dentry); dentry_stat.nr_dentry--; /* For d_free, below */ /*drops the locks, at that point nobody can reach this dentry */ dentry_iput(dentry); - parent = dentry->d_parent; d_free(dentry); if (dentry == parent) return; dentry = parent; - goto repeat; + spin_lock(&dcache_lock); + dcache_shrinker_del(&lds); + if (atomic_dec_and_test(&dentry->d_count)) + goto repeat; + spin_unlock(&dcache_lock); } } +/* + * dput - release a dentry + * @dentry: dentry to release + * + * Release a dentry. This will drop the usage count and if appropriate + * call the dentry unlink method as well as removing it from the queues and + * releasing its resources. If the parent dentries were scheduled for release + * they too may now get deleted. + * + * no dcache lock, please. + */ + +void dput(struct dentry *dentry) +{ + if (!dentry) + return; + + dput_recursive(dentry, NULL); +} + /** * d_invalidate - invalidate a dentry * @dentry: dentry to invalidate @@ -361,19 +458,23 @@ restart: * removed. * Called with dcache_lock, drops it and then regains. */ -static inline void prune_one_dentry(struct dentry * dentry) +static void prune_one_dentry(struct dentry *dentry) { struct dentry * parent; + struct dcache_shrinker ds; __d_drop(dentry); list_del(&dentry->d_child); + parent = dentry->d_parent; + dcache_shrinker_add(&ds, parent, dentry); dentry_stat.nr_dentry--; /* For d_free, below */ dentry_iput(dentry); parent = dentry->d_parent; d_free(dentry); if (parent != dentry) - dput(parent); + dput_recursive(parent, &ds); spin_lock(&dcache_lock); + dcache_shrinker_del(&ds); } /** @@ -562,13 +663,12 @@ positive: * drop the lock and return early due to latency * constraints. */ -static int select_parent(struct dentry * parent) +static int select_parent(struct dentry *parent, int *racecheck) { struct dentry *this_parent = parent; struct list_head *next; int found = 0; - spin_lock(&dcache_lock); repeat: next = this_parent->d_subdirs.next; resume: @@ -610,6 +710,9 @@ dentry->d_parent->d_name.name, dentry->d #endif goto repeat; } + + if (!found && racecheck != NULL) + shrink_dcache_racecheck(dentry, racecheck); } /* * All done at this level ... ascend and resume the search. @@ -624,7 +727,6 @@ this_parent->d_parent->d_name.name, this goto resume; } out: - spin_unlock(&dcache_lock); return found; } @@ -637,10 +739,66 @@ out: void shrink_dcache_parent(struct dentry * parent) { - int found; + int found, r; + + while (1) { + spin_lock(&dcache_lock); + found = select_parent(parent, NULL); + if (found) + goto found; - while ((found = select_parent(parent)) != 0) + /* + * try again with a dput_recursive() race check. + * it returns quickly if everything was really shrinked + */ + r = 0; + found = select_parent(parent, &r); + if (found) + goto found; + if (!r) + break; + + /* drops the lock inside */ + dcache_shrinker_wait(parent->d_sb); + continue; + +found: + spin_unlock(&dcache_lock); prune_dcache(found); + } + spin_unlock(&dcache_lock); +} + +/* + * Move any unused anon dentries to the end of the unused list. + * called under dcache_lock + */ +static int select_anon(struct hlist_head *head, int *racecheck) +{ + struct hlist_node *lp; + int found = 0; + + hlist_for_each(lp, head) { + struct dentry *this = hlist_entry(lp, struct dentry, d_hash); + if (!list_empty(&this->d_lru)) { + dentry_stat.nr_unused--; + list_del_init(&this->d_lru); + } + + /* + * move only zero ref count dentries to the end + * of the unused list for prune_dcache + */ + if (!atomic_read(&this->d_count)) { + list_add_tail(&this->d_lru, &dentry_unused); + dentry_stat.nr_unused++; + found++; + } + + if (!found && racecheck != NULL) + shrink_dcache_racecheck(this, racecheck); + } + return found; } /** @@ -653,33 +811,36 @@ void shrink_dcache_parent(struct dentry * done under dcache_lock. * */ -void shrink_dcache_anon(struct hlist_head *head) +void shrink_dcache_anon(struct super_block *sb) { - struct hlist_node *lp; - int found; - do { - found = 0; + int found, r; + + while (1) { spin_lock(&dcache_lock); - hlist_for_each(lp, head) { - struct dentry *this = hlist_entry(lp, struct dentry, d_hash); - if (!list_empty(&this->d_lru)) { - dentry_stat.nr_unused--; - list_del_init(&this->d_lru); - } + found = select_anon(&sb->s_anon, NULL); + if (found) + goto found; - /* - * move only zero ref count dentries to the end - * of the unused list for prune_dcache - */ - if (!atomic_read(&this->d_count)) { - list_add_tail(&this->d_lru, &dentry_unused); - dentry_stat.nr_unused++; - found++; - } - } + /* + * try again with a dput_recursive() race check. + * it returns quickly if everything was really shrinked + */ + r = 0; + found = select_anon(&sb->s_anon, &r); + if (found) + goto found; + if (!r) + break; + + /* drops the lock inside */ + dcache_shrinker_wait(sb); + continue; + +found: spin_unlock(&dcache_lock); prune_dcache(found); - } while(found); + } + spin_unlock(&dcache_lock); } /* diff -puN fs/super.c~fix-of-dcache-race-leading-to-busy-inodes-on-umount fs/super.c --- 25/fs/super.c~fix-of-dcache-race-leading-to-busy-inodes-on-umount 2005-05-11 19:19:08.000000000 -0700 +++ 25-akpm/fs/super.c 2005-05-11 19:19:08.000000000 -0700 @@ -69,6 +69,7 @@ static struct super_block *alloc_super(v INIT_LIST_HEAD(&s->s_io); INIT_LIST_HEAD(&s->s_files); INIT_LIST_HEAD(&s->s_instances); + INIT_LIST_HEAD(&s->s_dshrinkers); INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); init_rwsem(&s->s_umount); @@ -230,8 +231,9 @@ void generic_shutdown_super(struct super if (root) { sb->s_root = NULL; shrink_dcache_parent(root); - shrink_dcache_anon(&sb->s_anon); + shrink_dcache_anon(sb); dput(root); + dcache_shrinker_wait_sb(sb); fsync_super(sb); lock_super(sb); sb->s_flags &= ~MS_ACTIVE; diff -puN include/linux/dcache.h~fix-of-dcache-race-leading-to-busy-inodes-on-umount include/linux/dcache.h --- 25/include/linux/dcache.h~fix-of-dcache-race-leading-to-busy-inodes-on-umount 2005-05-11 19:19:08.000000000 -0700 +++ 25-akpm/include/linux/dcache.h 2005-05-11 19:34:00.000000000 -0700 @@ -209,7 +209,8 @@ extern struct dentry * d_alloc_anon(stru extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); -extern void shrink_dcache_anon(struct hlist_head *); +extern void shrink_dcache_anon(struct super_block *); +extern void dcache_shrinker_wait_sb(struct super_block *sb); extern int d_invalidate(struct dentry *); /* only used at mount-time */ diff -puN include/linux/fs.h~fix-of-dcache-race-leading-to-busy-inodes-on-umount include/linux/fs.h --- 25/include/linux/fs.h~fix-of-dcache-race-leading-to-busy-inodes-on-umount 2005-05-11 19:19:08.000000000 -0700 +++ 25-akpm/include/linux/fs.h 2005-05-11 19:19:08.000000000 -0700 @@ -784,6 +784,7 @@ struct super_block { struct list_head s_io; /* parked for writeback */ struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_files; + struct list_head s_dshrinkers; /* active dcache shrinkers */ struct block_device *s_bdev; struct list_head s_instances; _