diff -urN ref/drivers/block/ll_rw_blk.c buffer-4/drivers/block/ll_rw_blk.c
--- ref/drivers/block/ll_rw_blk.c	Fri Feb 4 16:59:04 2000
+++ buffer-4/drivers/block/ll_rw_blk.c	Fri Feb 4 04:03:32 2000
@@ -494,10 +494,6 @@
 	count = bh->b_size >> 9;
 	sector = bh->b_rsector;
 
-	/* It had better not be a new buffer by the time we see it */
-	if (buffer_new(bh))
-		BUG();
-
 	/* Only one thread can actually submit the I/O. */
 	if (test_and_set_bit(BH_Lock, &bh->b_state))
 		return;
diff -urN ref/drivers/block/rd.c buffer-4/drivers/block/rd.c
--- ref/drivers/block/rd.c	Fri Feb 4 16:59:04 2000
+++ buffer-4/drivers/block/rd.c	Fri Feb 4 04:03:32 2000
@@ -290,7 +290,10 @@
 	switch (cmd) {
 		case BLKFLSBUF:
 			if (!capable(CAP_SYS_ADMIN)) return -EACCES;
-			invalidate_buffers(inode->i_rdev);
+			/* special: we want to release the ramdisk memory,
+			   it's not like the other blockdevices, where
+			   this ioctl only flushes away the buffer cache. */
+			destroy_buffers(inode->i_rdev);
 			break;
 
 		case BLKGETSIZE:   /* Return device size */
@@ -391,7 +394,7 @@
 	int i;
 
 	for (i = 0 ; i < NUM_RAMDISKS; i++)
-		invalidate_buffers(MKDEV(MAJOR_NR, i));
+		destroy_buffers(MKDEV(MAJOR_NR, i));
 
 	unregister_blkdev( MAJOR_NR, "ramdisk" );
 	blk_cleanup_queue(BLK_DEFAULT_QUEUE(MAJOR_NR));
diff -urN ref/fs/block_dev.c buffer-4/fs/block_dev.c
--- ref/fs/block_dev.c	Fri Feb 4 16:59:05 2000
+++ buffer-4/fs/block_dev.c	Fri Feb 4 04:03:32 2000
@@ -533,7 +533,7 @@
 	if (sb && invalidate_inodes(sb))
 		printk("VFS: busy inodes on changed media.\n");
 
-	invalidate_buffers(dev);
+	destroy_buffers(dev);
 
 	if (bdops->revalidate)
 		bdops->revalidate(dev);
diff -urN ref/fs/buffer.c buffer-4/fs/buffer.c
--- ref/fs/buffer.c	Fri Feb 4 16:59:05 2000
+++ buffer-4/fs/buffer.c	Fri Feb 4 16:57:03 2000
@@ -94,6 +94,7 @@
 kmem_cache_t *bh_cachep;
 
 static int grow_buffers(int size);
+static void __refile_buffer(struct buffer_head *);
 
 /* This is used by some architectures to estimate available memory. */
 atomic_t buffermem_pages = ATOMIC_INIT(0);
@@ -277,11 +278,14 @@
 
 void sync_dev(kdev_t dev)
 {
-	sync_buffers(dev, 0);
 	sync_supers(dev);
 	sync_inodes(dev);
-	sync_buffers(dev, 0);
 	DQUOT_SYNC(dev);
+	/* sync all the dirty buffers out to disk only _after_ all the
+	   high level layers have finished generating buffer dirty data
+	   (or we'll return with some buffers still dirty on the blockdevice,
+	   breaking the semantics of this call) */
+	sync_buffers(dev, 0);
 	/*
 	 * FIXME(eric) we need to sync the physical devices here.
 	 * This is because some (scsi) controllers have huge amounts of
@@ -412,40 +416,6 @@
 	return err;
 }
 
-void invalidate_buffers(kdev_t dev)
-{
-	int nlist;
-
-	spin_lock(&lru_list_lock);
-	for(nlist = 0; nlist < NR_LIST; nlist++) {
-		struct buffer_head * bh;
-		int i;
-	retry:
-		bh = lru_list[nlist];
-		if (!bh)
-			continue;
-		for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
-			if (bh->b_dev != dev)
-				continue;
-			if (buffer_locked(bh)) {
-				atomic_inc(&bh->b_count);
-				spin_unlock(&lru_list_lock);
-				wait_on_buffer(bh);
-				spin_lock(&lru_list_lock);
-				atomic_dec(&bh->b_count);
-				goto retry;
-			}
-			if (atomic_read(&bh->b_count))
-				continue;
-			clear_bit(BH_Protected, &bh->b_state);
-			clear_bit(BH_Uptodate, &bh->b_state);
-			clear_bit(BH_Dirty, &bh->b_state);
-			clear_bit(BH_Req, &bh->b_state);
-		}
-	}
-	spin_unlock(&lru_list_lock);
-}
-
 /* After several hours of tedious analysis, the following hash
  * function won. Do not mess with it... -DaveM
  */
@@ -464,10 +434,13 @@
 
 static __inline__ void __hash_unlink(struct buffer_head *bh)
 {
-	if (bh->b_next)
-		bh->b_next->b_pprev = bh->b_pprev;
-	*(bh->b_pprev) = bh->b_next;
-	bh->b_pprev = NULL;
+	if (bh->b_pprev)
+	{
+		if (bh->b_next)
+			bh->b_next->b_pprev = bh->b_pprev;
+		*(bh->b_pprev) = bh->b_next;
+		bh->b_pprev = NULL;
+	}
 }
 
 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
@@ -514,17 +487,12 @@
 	bh->b_next_free = bh->b_prev_free = NULL;
 }
 
-/* The following two functions must operate atomically
- * because they control the visibility of a buffer head
- * to the rest of the kernel.
- */
-static __inline__ void __remove_from_queues(struct buffer_head *bh)
+/* must be called with both the hash_table_lock and the lru_list_lock
+   held */
+static void __remove_from_queues(struct buffer_head *bh)
 {
-	write_lock(&hash_table_lock);
-	if (bh->b_pprev)
-		__hash_unlink(bh);
+	__hash_unlink(bh);
 	__remove_from_lru_list(bh, bh->b_list);
-	write_unlock(&hash_table_lock);
 }
 
 static void insert_into_queues(struct buffer_head *bh)
@@ -547,6 +515,8 @@
 	struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
 	struct buffer_head **bhp = &head->list;
 
+	bh->b_state = 0;
+
 	spin_lock(&head->lock);
 	bh->b_dev = B_FREE;
 	if(!*bhp) {
@@ -604,11 +574,75 @@
 	return 0;
 }
 
+/* If invalidate_buffers() will trash dirty buffers, it means some kind
+   of fs corruption is going on.  Trashing dirty data always implies losing
+   information that was supposed to be just stored on the physical layer
+   by the user.
+
+   Thus invalidate_buffers in general usage is not allowed to trash dirty
+   buffers.  For example, ioctl(BLKFLSBUF) expects dirty data to be preserved.
+
+   NOTE: in the case where the user removed a removable-media disk while
+   there was still dirty data not synced to disk (due to a bug in the
+   device driver or to a user error), by not destroying the dirty buffers
+   we could generate corruption also on the next media inserted; thus a
+   parameter is necessary to handle this case in the safest way possible
+   (trying not to corrupt the newly inserted disk with data belonging to
+   the old, now corrupted, disk).  Also for the ramdisk the natural thing
+   to do in order to release the ramdisk memory is to destroy dirty buffers.
+
+   These are two special cases.  Normal usage requires the device driver
+   to issue a sync on the device (without waiting for I/O completion) and
+   then an invalidate_buffers call that doesn't trash dirty buffers.
+ */
+void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
+{
+	int i, nlist, slept;
+	struct buffer_head * bh, * bh_next;
+
+ retry:
+	slept = 0;
+	spin_lock(&lru_list_lock);
+	for(nlist = 0; nlist < NR_LIST; nlist++) {
+		bh = lru_list[nlist];
+		if (!bh)
+			continue;
+		for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--)
+		{
+			bh_next = bh->b_next_free;
+			if (bh->b_dev != dev)
+				continue;
+			if (buffer_locked(bh)) {
+				atomic_inc(&bh->b_count);
+				spin_unlock(&lru_list_lock);
+				wait_on_buffer(bh);
+				slept = 1;
+				spin_lock(&lru_list_lock);
+				atomic_dec(&bh->b_count);
+			}
+
+			write_lock(&hash_table_lock);
+			if (!atomic_read(&bh->b_count) &&
+			    (destroy_dirty_buffers || !buffer_dirty(bh)))
+			{
+				__remove_from_queues(bh);
+				put_last_free(bh);
+			}
+			write_unlock(&hash_table_lock);
+			if (slept)
+				goto out;
+		}
+	}
+out:
+	spin_unlock(&lru_list_lock);
+	if (slept)
+		goto retry;
+}
+
 void set_blocksize(kdev_t dev, int size)
 {
 	extern int *blksize_size[];
-	int i, nlist;
-	struct buffer_head * bh, *bhnext;
+	int i, nlist, slept;
+	struct buffer_head * bh, * bh_next;
 
 	if (!blksize_size[MAJOR(dev)])
 		return;
@@ -626,41 +660,57 @@
 	sync_buffers(dev, 2);
 	blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 
-	/* We need to be quite careful how we do this - we are moving entries
-	 * around on the free list, and we can get in a loop if we are not careful.
-	 */
+ retry:
+	slept = 0;
+	spin_lock(&lru_list_lock);
 	for(nlist = 0; nlist < NR_LIST; nlist++) {
-	repeat:
-		spin_lock(&lru_list_lock);
 		bh = lru_list[nlist];
-		for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
-			if(!bh)
-				break;
-
-			bhnext = bh->b_next_free;
-			if (bh->b_dev != dev)
-				continue;
-			if (bh->b_size == size)
-				continue;
+		if (!bh)
+			continue;
+		for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--)
+		{
+			bh_next = bh->b_next_free;
+			if (bh->b_dev != dev || bh->b_size == size)
+				continue;
 			if (buffer_locked(bh)) {
 				atomic_inc(&bh->b_count);
 				spin_unlock(&lru_list_lock);
 				wait_on_buffer(bh);
+				slept = 1;
+				spin_lock(&lru_list_lock);
 				atomic_dec(&bh->b_count);
-				goto repeat;
 			}
-			if (bh->b_dev == dev && bh->b_size != size) {
-				clear_bit(BH_Dirty, &bh->b_state);
-				clear_bit(BH_Uptodate, &bh->b_state);
-				clear_bit(BH_Req, &bh->b_state);
-			}
-			if (atomic_read(&bh->b_count) == 0) {
+
+			write_lock(&hash_table_lock);
+			if (!atomic_read(&bh->b_count))
+			{
+				if (buffer_dirty(bh))
+					printk(KERN_WARNING
+					       "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
+					       kdevname(dev), bh->b_blocknr, bh->b_size);
 				__remove_from_queues(bh);
 				put_last_free(bh);
 			}
+			else
+			{
+				if (atomic_set_buffer_clean(bh))
+					__refile_buffer(bh);
+				clear_bit(BH_Uptodate, &bh->b_state);
+				printk(KERN_WARNING
+				       "set_blocksize: "
+				       "b_count %d, dev %s, block %lu, from %p\n",
+				       atomic_read(&bh->b_count), bdevname(bh->b_dev),
+				       bh->b_blocknr, __builtin_return_address(0));
+			}
+			write_unlock(&hash_table_lock);
+			if (slept)
+				goto out;
 		}
-		spin_unlock(&lru_list_lock);
 	}
+ out:
+	spin_unlock(&lru_list_lock);
+	if (slept)
+		goto retry;
 }
 
 /*
@@ -785,30 +835,29 @@
 		atomic_set(&bh->b_count, 1);
 	}
 	spin_unlock(&free_list[isize].lock);
-	if (!bh)
-		goto refill;
+	if (bh)
+	{
+		/* OK, FINALLY we know that this buffer is the only one of
+		   its kind, we hold a reference (b_count>0), it is unlocked,
+		   and it is clean. */
+		init_buffer(bh, end_buffer_io_sync, NULL);
+		bh->b_dev = dev;
+		bh->b_blocknr = block;
+		bh->b_state = 1 << BH_Mapped;
 
-	/* OK, FINALLY we know that this buffer is the only one of its kind,
-	 * we hold a reference (b_count>0), it is unlocked, and it is clean.
-	 */
-	init_buffer(bh, end_buffer_io_sync, NULL);
-	bh->b_dev = dev;
-	bh->b_blocknr = block;
-	bh->b_state = 1 << BH_Mapped;
-
-	/* Insert the buffer into the regular lists */
-	insert_into_queues(bh);
-	goto out;
+		/* Insert the buffer into the regular lists */
+		insert_into_queues(bh);
+	out:
+		touch_buffer(bh);
+		return bh;
+	}
 
 	/*
 	 * If we block while refilling the free list, somebody may
 	 * create the buffer first ... search the hashes again.
 	 */
-refill:
 	refill_freelist(size);
 	goto repeat;
-out:
-	return bh;
 }
 
 /* -1 -> no need to flush
@@ -851,23 +900,38 @@
 	wakeup_bdflush(state);
 }
 
-static inline void __mark_dirty(struct buffer_head *bh, int flag)
+#define set_bh_age(bh, flag) \
+do { \
+	(bh)->b_flushtime = jiffies + \
+		((flag) ? bdf_prm.b_un.age_super : \
+		 bdf_prm.b_un.age_buffer); \
+} while(0)
+
+static __inline__ void __mark_dirty(struct buffer_head *bh, int flag)
 {
-	bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
-	clear_bit(BH_New, &bh->b_state);
+	set_bh_age(bh, flag);
 	refile_buffer(bh);
 }
 
+/* atomic version, the user must call balance_dirty() by hand
+   as soon as it becomes possible to block */
 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
 {
-	__mark_dirty(bh, flag);
+	if (!atomic_set_buffer_dirty(bh))
+		__mark_dirty(bh, flag);
+}
+
+void mark_buffer_dirty(struct buffer_head *bh, int flag)
+{
+	__mark_buffer_dirty(bh, flag);
+	balance_dirty(bh->b_dev);
 }
 
 /*
  * A buffer may need to be moved from one buffer list to another
  * (e.g. in case it is not shared any more). Handle this.
  */
-static __inline__ void __refile_buffer(struct buffer_head *bh)
+static void __refile_buffer(struct buffer_head *bh)
 {
 	int dispose = BUF_CLEAN;
 	if (buffer_locked(bh))
@@ -895,8 +959,6 @@
  */
 void __brelse(struct buffer_head * buf)
 {
-	touch_buffer(buf);
-
 	if (atomic_read(&buf->b_count)) {
 		atomic_dec(&buf->b_count);
 		return;
@@ -917,12 +979,10 @@
 	write_lock(&hash_table_lock);
 	if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
 		goto in_use;
-	if (buf->b_pprev)
-		__hash_unlink(buf);
+	__hash_unlink(buf);
 	write_unlock(&hash_table_lock);
 	__remove_from_lru_list(buf, buf->b_list);
 	spin_unlock(&lru_list_lock);
-	buf->b_state = 0;
 	put_last_free(buf);
 	return;
@@ -1230,6 +1290,7 @@
 			clear_bit(BH_Uptodate, &bh->b_state);
 			clear_bit(BH_Mapped, &bh->b_state);
 			clear_bit(BH_Req, &bh->b_state);
+			clear_bit(BH_New, &bh->b_state);
 		}
 	}
 
@@ -1308,7 +1369,6 @@
 
 static void unmap_underlying_metadata(struct buffer_head * bh)
 {
-#if 0
 	if (buffer_new(bh)) {
 		struct buffer_head *old_bh;
 
@@ -1321,7 +1381,6 @@
 			__bforget(old_bh);
 		}
 	}
-#endif
 }
 
 /*
@@ -1331,7 +1390,7 @@
 int block_write_full_page(struct dentry *dentry, struct page *page)
 {
 	struct inode *inode = dentry->d_inode;
-	int err, i;
+	int err, i, need_balance_dirty = 0;
 	unsigned long block;
 	struct buffer_head *bh, *head;
 
@@ -1369,12 +1428,19 @@
 			unmap_underlying_metadata(bh);
 		}
 		set_bit(BH_Uptodate, &bh->b_state);
-		mark_buffer_dirty(bh,0);
+		if (!atomic_set_buffer_dirty(bh))
+		{
+			__mark_dirty(bh, 0);
+			need_balance_dirty = 1;
+		}
 		bh = bh->b_this_page;
 		block++;
 	} while (bh != head);
 
+	if (need_balance_dirty)
+		balance_dirty(bh->b_dev);
+
 	SetPageUptodate(page);
 	return 0;
 out:
@@ -1421,12 +1487,12 @@
 				if (err)
 					goto out;
 				unmap_underlying_metadata(bh);
-			}
-			if (buffer_new(bh)) {
-				zeroto = block_end;
-				if (block_start < zerofrom)
-					zerofrom = block_start;
-				continue;
+				if (buffer_new(bh)) {
+					zeroto = block_end;
+					if (block_start < zerofrom)
+						zerofrom = block_start;
+					continue;
+				}
 			}
 			if (!buffer_uptodate(bh) &&
 			     (block_start < zerofrom || block_end > to)) {
@@ -1480,7 +1546,7 @@
 			partial = 1;
 		} else {
 			set_bit(BH_Uptodate, &bh->b_state);
-			if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
+			if (!atomic_set_buffer_dirty(bh)) {
 				__mark_dirty(bh, 0);
 				need_balance_dirty = 1;
 			}
@@ -2030,13 +2096,10 @@
 		/* The buffer can be either on the regular
 		 * queues or on the free list..
 		 */
-		if (p->b_dev == B_FREE) {
+		if (p->b_dev != B_FREE)
+			__remove_from_queues(p);
+		else
 			__remove_from_free_list(p, index);
-		} else {
-			if (p->b_pprev)
-				__hash_unlink(p);
-			__remove_from_lru_list(p, p->b_list);
-		}
 		__put_unused_buffer_head(p);
 	} while (tmp != bh);
 	spin_unlock(&unused_list_lock);
diff -urN ref/fs/super.c buffer-4/fs/super.c
--- ref/fs/super.c	Fri Feb 4 16:59:05 2000
+++ buffer-4/fs/super.c	Fri Feb 4 04:03:32 2000
@@ -1378,7 +1378,10 @@
 	bdev = do_umount(old_root_dev,1, 0);
 	if (!IS_ERR(bdev)) {
 		printk("okay\n");
-		invalidate_buffers(old_root_dev);
+		/* special: the old device driver is going to be
+		   a ramdisk and the point of this call is to free its
+		   protected memory (even if dirty). */
+		destroy_buffers(old_root_dev);
 		if (bdev) {
 			blkdev_put(bdev, BDEV_FS);
 			bdput(bdev);
diff -urN ref/include/linux/fs.h buffer-4/include/linux/fs.h
--- ref/include/linux/fs.h	Fri Feb 4 16:59:05 2000
+++ buffer-4/include/linux/fs.h	Fri Feb 4 04:07:19 2000
@@ -852,20 +852,17 @@
 }
 
 extern void FASTCALL(__mark_buffer_dirty(struct buffer_head *bh, int flag));
+extern void FASTCALL(mark_buffer_dirty(struct buffer_head *bh, int flag));
 
 #define atomic_set_buffer_dirty(bh) test_and_set_bit(BH_Dirty, &(bh)->b_state)
 
-extern inline void mark_buffer_dirty(struct buffer_head * bh, int flag)
-{
-	if (!atomic_set_buffer_dirty(bh))
-		__mark_buffer_dirty(bh, flag);
-}
-
 extern void balance_dirty(kdev_t);
 extern int check_disk_change(kdev_t);
 extern int invalidate_inodes(struct super_block *);
 extern void invalidate_inode_pages(struct inode *);
-extern void invalidate_buffers(kdev_t);
+#define invalidate_buffers(dev) __invalidate_buffers((dev), 0)
+#define destroy_buffers(dev) __invalidate_buffers((dev), 1)
+extern void __invalidate_buffers(kdev_t dev, int);
 extern int floppy_is_wp(int);
 extern void sync_inodes(kdev_t);
 extern void write_inode_now(struct inode *);
diff -urN ref/kernel/ksyms.c buffer-4/kernel/ksyms.c
--- ref/kernel/ksyms.c	Fri Feb 4 16:59:05 2000
+++ buffer-4/kernel/ksyms.c	Fri Feb 4 04:03:32 2000
@@ -158,6 +158,7 @@
 EXPORT_SYMBOL(d_alloc);
 EXPORT_SYMBOL(d_lookup);
 EXPORT_SYMBOL(d_path);
+EXPORT_SYMBOL(mark_buffer_dirty);
 EXPORT_SYMBOL(__mark_buffer_dirty);
 EXPORT_SYMBOL(__mark_inode_dirty);
 EXPORT_SYMBOL(free_kiovec);
@@ -172,7 +173,7 @@
 EXPORT_SYMBOL(put_filp);
 EXPORT_SYMBOL(files_lock);
 EXPORT_SYMBOL(check_disk_change);
-EXPORT_SYMBOL(invalidate_buffers);
+EXPORT_SYMBOL(__invalidate_buffers);
 EXPORT_SYMBOL(invalidate_inodes);
 EXPORT_SYMBOL(invalidate_inode_pages);
 EXPORT_SYMBOL(truncate_inode_pages);
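
Usage note (commentary, not part of the patch): a minimal sketch of how callers are
expected to use the interfaces introduced above. The device variable and the wrapper
functions below are hypothetical, invented only for illustration; only
invalidate_buffers()/destroy_buffers()/__invalidate_buffers(), __mark_buffer_dirty(),
mark_buffer_dirty() and balance_dirty() come from the patch itself.

	#include <linux/fs.h>

	static kdev_t example_dev;	/* hypothetical device, for illustration only */

	/* Media change / revalidation: drop only clean, unused buffers.
	 * invalidate_buffers() expands to __invalidate_buffers(dev, 0),
	 * so dirty data is preserved, as the fs/buffer.c comment requires. */
	static void example_drop_clean_buffers(void)
	{
		invalidate_buffers(example_dev);
	}

	/* Ramdisk-style teardown: the backing memory goes away, so dirty
	 * buffers must be thrown out too.  destroy_buffers() expands to
	 * __invalidate_buffers(dev, 1). */
	static void example_release_backing_store(void)
	{
		destroy_buffers(example_dev);
	}

	/* Dirtying many buffers in a context that must not block: use the
	 * atomic __mark_buffer_dirty() in the loop and call balance_dirty()
	 * once it is safe to block, as the new comment above
	 * __mark_buffer_dirty() asks.  For a single buffer,
	 * mark_buffer_dirty() already performs both steps. */
	static void example_dirty_many(struct buffer_head **bhs, int nr)
	{
		int i;

		for (i = 0; i < nr; i++)
			__mark_buffer_dirty(bhs[i], 0);	/* atomic, does not block */
		if (nr)
			balance_dirty(bhs[0]->b_dev);	/* may block, so do it last */
	}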