From: Daniel McNeil

This patch moves the serializing of writebacks up one level, to before
the point where the dirty_pages are moved to the io_pages list.  This
prevents writebacks from missing any pages that are moved back to the
dirty list by a WB_SYNC_NONE writeback.  I have not seen this race in
testing; I found it just by reading the code.  It also skips the
serialization for blockdevs.

This patch also changes filemap_fdatawait() to leave each page on the
locked_pages list until its i/o has finished.  This prevents parallel
filemap_fdatawait()s from missing a page that should be waited on.  I
have not seen this race in testing, either.
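As a review aid, and not part of the patch itself: below is a minimal
userspace model of the locking rule this change enforces, with
pthread_rwlock_t standing in for the wb_rwsema rw_semaphore.  The
struct and function names are invented for the sketch; only the lock
ordering mirrors the kernel code.

/* Sketch only: models the wb_rwsema rule, not kernel code. */
#include <pthread.h>
#include <stdio.h>

enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

struct mapping {
        pthread_rwlock_t wb_rwsema;     /* stand-in for the rw_semaphore */
        int is_blockdev;                /* blockdevs skip the serialization */
};

static void writeback(struct mapping *m, enum sync_mode mode)
{
        if (!m->is_blockdev) {
                if (mode == WB_SYNC_ALL)
                        pthread_rwlock_wrlock(&m->wb_rwsema);   /* one SYNC at a time */
                else
                        pthread_rwlock_rdlock(&m->wb_rwsema);   /* async may overlap */
        }

        /* ...splice dirty_pages onto io_pages and call do_writepages() here... */
        printf("writing pages, sync_mode=%d\n", mode);

        if (!m->is_blockdev)
                pthread_rwlock_unlock(&m->wb_rwsema);
}

int main(void)
{
        struct mapping m = { .is_blockdev = 0 };

        pthread_rwlock_init(&m.wb_rwsema, NULL);
        writeback(&m, WB_SYNC_NONE);    /* read side: many can run together */
        writeback(&m, WB_SYNC_ALL);     /* write side: excludes all of them */
        pthread_rwlock_destroy(&m.wb_rwsema);
        return 0;
}

The reason the semaphore must be taken before the dirty_pages to
io_pages splice is that a WB_SYNC_ALL writeback must not start until
every WB_SYNC_NONE writer that might move pages back to the dirty
list has dropped the read side.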
---

 fs/fs-writeback.c   |   25 ++++++++++++++++++++++++-
 mm/filemap.c        |   51 ++++++++++++++++++++++++++++++++++++++++++++++---------
 mm/page-writeback.c |   23 ++---------------------
 3 files changed, 68 insertions(+), 31 deletions(-)

diff -puN fs/fs-writeback.c~serialise-writeback-fdatawait fs/fs-writeback.c
--- 25/fs/fs-writeback.c~serialise-writeback-fdatawait	2004-03-07 17:11:49.000000000 -0800
+++ 25-akpm/fs/fs-writeback.c	2004-03-07 17:11:49.000000000 -0800
@@ -139,6 +139,7 @@ __sync_single_inode(struct inode *inode,
 	struct address_space *mapping = inode->i_mapping;
 	struct super_block *sb = inode->i_sb;
 	int wait = wbc->sync_mode == WB_SYNC_ALL;
+	int blkdev = inode->i_sb == blockdev_superblock;
 
 	BUG_ON(inode->i_state & I_LOCK);
 
@@ -146,6 +147,22 @@ __sync_single_inode(struct inode *inode,
 	dirty = inode->i_state & I_DIRTY;
 	inode->i_state |= I_LOCK;
 	inode->i_state &= ~I_DIRTY;
+	spin_unlock(&inode_lock);
+
+	/*
+	 * Serialize writebacks except for blockdevs
+	 */
+	if (!blkdev) {
+		/*
+		 * Only allow 1 SYNC writeback at a time, to be able to wait
+		 * for all i/o without worrying about racing WB_SYNC_NONE
+		 * writers.
+		 */
+		if (wait)
+			down_write(&mapping->wb_rwsema);
+		else
+			down_read(&mapping->wb_rwsema);
+	}
 
 	/*
 	 * smp_rmb(); note: if you remove write_lock below, you must add this.
@@ -157,10 +174,16 @@ __sync_single_inode(struct inode *inode,
 	if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages))
 		list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
 	spin_unlock(&mapping->page_lock);
-	spin_unlock(&inode_lock);
 
 	do_writepages(mapping, wbc);
 
+	if (!blkdev) {
+		if (wait)
+			up_write(&mapping->wb_rwsema);
+		else
+			up_read(&mapping->wb_rwsema);
+	}
+
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
 		write_inode(inode, wait);
diff -puN mm/filemap.c~serialise-writeback-fdatawait mm/filemap.c
--- 25/mm/filemap.c~serialise-writeback-fdatawait	2004-03-07 17:11:49.000000000 -0800
+++ 25-akpm/mm/filemap.c	2004-03-07 17:11:49.000000000 -0800
@@ -129,6 +129,8 @@ static inline int sync_page(struct page
 	return 0;
 }
 
+extern struct super_block *blockdev_superblock;
+
 /**
  * filemap_fdatawrite - start writeback against all of a mapping's dirty pages
  * @mapping:	address space structure to write
@@ -145,14 +147,29 @@ static int __filemap_fdatawrite(struct a
 		.sync_mode = sync_mode,
 		.nr_to_write = mapping->nrpages * 2,
 	};
+	int blkdev = mapping->host->i_sb == blockdev_superblock;
 
 	if (mapping->backing_dev_info->memory_backed)
 		return 0;
 
+	if (!blkdev) {
+		if (sync_mode == WB_SYNC_NONE)
+			down_read(&mapping->wb_rwsema);
+		else
+			down_write(&mapping->wb_rwsema);
+	}
+
 	spin_lock(&mapping->page_lock);
 	list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
 	spin_unlock(&mapping->page_lock);
 	ret = do_writepages(mapping, &wbc);
+
+	if (!blkdev) {
+		if (sync_mode == WB_SYNC_NONE)
+			up_read(&mapping->wb_rwsema);
+		else
+			up_write(&mapping->wb_rwsema);
+	}
 	return ret;
 }
@@ -190,13 +207,24 @@ restart:
 		struct page *page;
 
 		page = list_entry(mapping->locked_pages.next,struct page,list);
-		list_del(&page->list);
-		if (PageDirty(page))
-			list_add(&page->list, &mapping->dirty_pages);
-		else
+		/*
+		 * Leave page on locked list until i/o has finished
+		 * so parallel filemap_fdatawait()s will all see the page.
+		 */
+
+		if (!PageDirty(page) && !PageLocked(page) &&
+		    !PageWriteback(page)) {
+
+			/*
+			 * The page is checked if locked because it might
+			 * be in process of being setup for writeback with
+			 * PG_dirty cleared and PG_writeback not set yet.
+			 * The page is not dirty and i/o has finished
+			 * so we can move it to the clean list.
+			 */
+			list_del(&page->list);
 			list_add(&page->list, &mapping->clean_pages);
-
-		if (!PageWriteback(page)) {
 			if (++progress > 32) {
 				if (need_resched()) {
 					spin_unlock(&mapping->page_lock);
@@ -211,10 +239,15 @@ restart:
 		page_cache_get(page);
 		spin_unlock(&mapping->page_lock);
 
-		wait_on_page_writeback(page);
-		if (PageError(page))
-			ret = -EIO;
-
+		lock_page(page);
+		if (PageDirty(page) && mapping->a_ops->writepage) {
+			write_one_page(page, 1);
+		} else {
+			wait_on_page_writeback(page);
+			unlock_page(page);
+		}
+		if (PageError(page))
+			ret = -EIO;
 		page_cache_release(page);
 		spin_lock(&mapping->page_lock);
 	}
diff -puN mm/page-writeback.c~serialise-writeback-fdatawait mm/page-writeback.c
--- 25/mm/page-writeback.c~serialise-writeback-fdatawait	2004-03-07 17:11:49.000000000 -0800
+++ 25-akpm/mm/page-writeback.c	2004-03-07 17:11:49.000000000 -0800
@@ -482,28 +482,9 @@ void __init page_writeback_init(void)
 int
 do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
-	int ret;
-
-	/*
-	 * Only allow 1 SYNC writeback at a time, to be able to wait for all
-	 * I/O without worrying about racing WB_SYNC_NONE writers.
-	 */
-	if (wbc->sync_mode == WB_SYNC_NONE)
-		down_read(&mapping->wb_rwsema);
-	else
-		down_write(&mapping->wb_rwsema);
-
 	if (mapping->a_ops->writepages)
-		ret = mapping->a_ops->writepages(mapping, wbc);
-	else
-		ret = generic_writepages(mapping, wbc);
-
-	if (wbc->sync_mode == WB_SYNC_NONE)
-		up_read(&mapping->wb_rwsema);
-	else
-		up_write(&mapping->wb_rwsema);
-
-	return ret;
+		return mapping->a_ops->writepages(mapping, wbc);
+	return generic_writepages(mapping, wbc);
 }
 
 /**
_
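For completeness, a similarly hedged stand-alone model of the
filemap_fdatawait() rule above.  The three-flag struct page and the
locked_pages array are stand-ins invented for this example; only the
predicate mirrors the kernel change.

/* Sketch only: models the "when may a page leave locked_pages" test. */
#include <stdbool.h>
#include <stdio.h>

struct page {
        bool dirty;     /* PG_dirty */
        bool locked;    /* PG_locked: may be mid-setup for writeback */
        bool writeback; /* PG_writeback */
};

/*
 * A page may move to the clean list only when all three flags are
 * clear.  PG_locked is checked because a writer can hold the page
 * lock with PG_dirty already cleared but PG_writeback not yet set.
 */
static bool can_move_to_clean(const struct page *p)
{
        return !p->dirty && !p->locked && !p->writeback;
}

int main(void)
{
        struct page under_io  = { .writeback = true };
        struct page mid_setup = { .locked = true };  /* dirty cleared, wb not set */
        struct page settled   = { false, false, false };
        const struct page *locked_pages[] = { &under_io, &mid_setup, &settled };

        for (int i = 0; i < 3; i++)
                printf("page %d: %s\n", i,
                       can_move_to_clean(locked_pages[i]) ?
                       "move to clean_pages" :
                       "keep on locked_pages so other waiters see it");
        return 0;
}

Keeping such pages on locked_pages is what lets parallel
filemap_fdatawait()s each find, and wait on, every page whose i/o is
still in flight.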