diff -urN 2.2.14pre9/drivers/block/rd.c 2.2.14pre9-buf-races/drivers/block/rd.c --- 2.2.14pre9/drivers/block/rd.c Sun Oct 31 23:31:25 1999 +++ 2.2.14pre9-buf-races/drivers/block/rd.c Wed Dec 1 00:55:05 1999 @@ -191,7 +191,10 @@ switch (cmd) { case BLKFLSBUF: if (!capable(CAP_SYS_ADMIN)) return -EACCES; - invalidate_buffers(inode->i_rdev); + /* special: we want to release the ramdisk memory, + it's not like with the other blockdevices where + this ioctl only flushes away the buffer cache. */ + destroy_buffers(inode->i_rdev); break; case BLKGETSIZE: /* Return device size */ @@ -347,7 +350,7 @@ int i; for (i = 0 ; i < NUM_RAMDISKS; i++) - invalidate_buffers(MKDEV(MAJOR_NR, i)); + destroy_buffers(MKDEV(MAJOR_NR, i)); unregister_blkdev( MAJOR_NR, "ramdisk" ); blk_dev[MAJOR_NR].request_fn = 0; diff -urN 2.2.14pre9/fs/buffer.c 2.2.14pre9-buf-races/fs/buffer.c --- 2.2.14pre9/fs/buffer.c Wed Dec 1 00:54:44 1999 +++ 2.2.14pre9-buf-races/fs/buffer.c Wed Dec 1 00:59:05 1999 @@ -24,6 +24,9 @@ * - RMK */ +/* invalidate_buffers/set_blocksize/sync_dev race conditions and + fs corruption fixes, 1999, Andrea Arcangeli */ + #include #include #include @@ -263,11 +266,14 @@ void sync_dev(kdev_t dev) { - sync_buffers(dev, 0); sync_supers(dev); sync_inodes(dev); - sync_buffers(dev, 0); DQUOT_SYNC(dev); + /* sync all the dirty buffers out to disk only _after_ all the + high level layers finished generated buffer dirty data + (or we'll return with some buffer still dirty on the blockdevice + so breaking the semantics of this call) */ + sync_buffers(dev, 0); /* * FIXME(eric) we need to sync the physical devices here. * This is because some (scsi) controllers have huge amounts of @@ -396,31 +402,6 @@ return err; } -void invalidate_buffers(kdev_t dev) -{ - int i; - int nlist; - struct buffer_head * bh; - - for(nlist = 0; nlist < NR_LIST; nlist++) { - bh = lru_list[nlist]; - for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) { - if (bh->b_dev != dev) - continue; - wait_on_buffer(bh); - if (bh->b_dev != dev) - continue; - if (bh->b_count) - continue; - bh->b_flushtime = 0; - clear_bit(BH_Protected, &bh->b_state); - clear_bit(BH_Uptodate, &bh->b_state); - clear_bit(BH_Dirty, &bh->b_state); - clear_bit(BH_Req, &bh->b_state); - } - } -} - /* After several hours of tedious analysis, the following hash * function won. Do not mess with it... -DaveM */ @@ -492,11 +473,14 @@ remove_from_lru_list(bh); } -static inline void put_last_free(struct buffer_head * bh) +static void put_last_free(struct buffer_head * bh) { if (bh) { struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)]; + bh->b_count = 0; + bh->b_state = 0; + remove_from_queues(bh); bh->b_dev = B_FREE; /* So it is obvious we are on the free list. */ /* Add to back of free list. */ @@ -516,7 +500,7 @@ { /* put at end of free list */ if(bh->b_dev == B_FREE) { - put_last_free(bh); + panic("B_FREE inserted into queues"); } else { struct buffer_head **bhp = &lru_list[bh->b_list]; @@ -606,10 +590,61 @@ return 0; } +/* If invalidate_buffers() will trash dirty buffers, it means some kind + of fs corruption is going on. Trashing dirty data always imply losing + information that was supposed to be just stored on the physical layer + by the user. + + Thus invalidate_buffers in general usage is not allwowed to trash dirty + buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved. + + NOTE: In the case where the user removed a removable-media-disk even if + there's still dirty data not synced on disk (due a bug in the device driver + or due an error of the user), by not destroying the dirty buffers we could + generate corruption also on the next media inserted, thus a parameter is + necessary to handle this case in the most safe way possible (trying + to not corrupt also the new disk inserted with the data belonging to + the old now corrupted disk). Also for the ramdisk the natural thing + to do in order to release the ramdisk memory is to destroy dirty buffers. + + These are two special cases. Normal usage imply the device driver + to issue a sync on the device (without waiting I/O completation) and + then an invalidate_buffers call that doesn't trashes dirty buffers. */ +void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers) +{ + int i, nlist, slept; + struct buffer_head * bh, * bhnext; + + again: + slept = 0; + for(nlist = 0; nlist < NR_LIST; nlist++) { + bh = lru_list[nlist]; + if (!bh) + continue; + for (i = nr_buffers_type[nlist] ; i > 0 ; + bh = bhnext, i--) + { + bhnext = bh->b_next_free; + if (bh->b_dev != dev) + continue; + if (buffer_locked(bh)) + { + slept = 1; + __wait_on_buffer(bh); + } + if (!bh->b_count && + (destroy_dirty_buffers || !buffer_dirty(bh))) + put_last_free(bh); + if (slept) + goto again; + } + } +} + void set_blocksize(kdev_t dev, int size) { extern int *blksize_size[]; - int i, nlist; + int i, nlist, slept; struct buffer_head * bh, *bhnext; if (!blksize_size[MAJOR(dev)]) @@ -631,27 +666,35 @@ /* We need to be quite careful how we do this - we are moving entries * around on the free list, and we can get in a loop if we are not careful. */ - for(nlist = 0; nlist < NR_LIST; nlist++) { + again: + slept = 0; + for(nlist = 0; nlist < NR_LIST; nlist++) { bh = lru_list[nlist]; - for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) { - if(!bh) - break; - - bhnext = bh->b_next_free; - if (bh->b_dev != dev) - continue; - if (bh->b_size == size) - continue; - bhnext->b_count++; - wait_on_buffer(bh); - bhnext->b_count--; - if (bh->b_dev == dev && bh->b_size != size) { - clear_bit(BH_Dirty, &bh->b_state); - clear_bit(BH_Uptodate, &bh->b_state); - clear_bit(BH_Req, &bh->b_state); - bh->b_flushtime = 0; + if (!bh) + continue; + for (i = nr_buffers_type[nlist] ; i > 0 ; + bh = bhnext, i--) + { + bhnext = bh->b_next_free; + if (bh->b_dev != dev || bh->b_size == size) + continue; + if (buffer_dirty(bh)) + printk(KERN_ERR "set_blocksize: dev %s buffer_dirty %lu size %lu\n", kdevname(dev), bh->b_blocknr, bh->b_size); + if (buffer_locked(bh)) + { + slept = 1; + wait_on_buffer(bh); } - remove_from_hash_queue(bh); + if (!bh->b_count) + put_last_free(bh); + else + printk(KERN_ERR + "set_blocksize: " + "b_count %d, dev %s, block %lu!\n", + bh->b_count, bdevname(bh->b_dev), + bh->b_blocknr); + if (slept) + goto again; } } } @@ -831,9 +874,6 @@ __brelse(buf); return; } - buf->b_count = 0; - buf->b_state = 0; - remove_from_queues(buf); put_last_free(buf); } diff -urN 2.2.14pre9/fs/devices.c 2.2.14pre9-buf-races/fs/devices.c --- 2.2.14pre9/fs/devices.c Sun Oct 31 23:31:33 1999 +++ 2.2.14pre9-buf-races/fs/devices.c Wed Dec 1 00:55:05 1999 @@ -216,7 +216,8 @@ if (sb && invalidate_inodes(sb)) printk("VFS: busy inodes on changed media.\n"); - invalidate_buffers(dev); + /* special: trash all dirty data as well as the media is changed */ + destroy_buffers(dev); if (fops->revalidate) fops->revalidate(dev); diff -urN 2.2.14pre9/fs/super.c 2.2.14pre9-buf-races/fs/super.c --- 2.2.14pre9/fs/super.c Sun Oct 31 23:31:33 1999 +++ 2.2.14pre9-buf-races/fs/super.c Wed Dec 1 00:55:05 1999 @@ -1276,7 +1276,10 @@ umount_error = do_umount(old_root_dev,1, 0); if (!umount_error) { printk("okay\n"); - invalidate_buffers(old_root_dev); + /* special: the old device driver is going to be + a ramdisk and the point of this call is to free its + protected memory (even if dirty). */ + destroy_buffers(old_root_dev); return 0; } printk(KERN_ERR "error %d\n",umount_error); diff -urN 2.2.14pre9/include/linux/fs.h 2.2.14pre9-buf-races/include/linux/fs.h --- 2.2.14pre9/include/linux/fs.h Wed Dec 1 00:54:44 1999 +++ 2.2.14pre9-buf-races/include/linux/fs.h Wed Dec 1 00:55:05 1999 @@ -783,7 +783,9 @@ extern int check_disk_change(kdev_t dev); extern int invalidate_inodes(struct super_block * sb); extern void invalidate_inode_pages(struct inode *); -extern void invalidate_buffers(kdev_t dev); +#define invalidate_buffers(dev) __invalidate_buffers((dev), 0) +#define destroy_buffers(dev) __invalidate_buffers((dev), 1) +extern void __invalidate_buffers(kdev_t dev, int); extern int floppy_is_wp(int minor); extern void sync_inodes(kdev_t dev); extern void write_inode_now(struct inode *inode); diff -urN 2.2.14pre9/kernel/ksyms.c 2.2.14pre9-buf-races/kernel/ksyms.c --- 2.2.14pre9/kernel/ksyms.c Wed Dec 1 00:54:44 1999 +++ 2.2.14pre9-buf-races/kernel/ksyms.c Wed Dec 1 00:55:05 1999 @@ -148,7 +148,7 @@ EXPORT_SYMBOL(fput); EXPORT_SYMBOL(put_filp); EXPORT_SYMBOL(check_disk_change); -EXPORT_SYMBOL(invalidate_buffers); +EXPORT_SYMBOL(__invalidate_buffers); EXPORT_SYMBOL(invalidate_inodes); EXPORT_SYMBOL(invalidate_inode_pages); EXPORT_SYMBOL(truncate_inode_pages);