diff -urN 2.3.18ac5/drivers/block/ll_rw_blk.c 2.3.18ac5-dirtycache/drivers/block/ll_rw_blk.c
--- 2.3.18ac5/drivers/block/ll_rw_blk.c	Tue Sep 14 14:35:35 1999
+++ 2.3.18ac5-dirtycache/drivers/block/ll_rw_blk.c	Sun Sep 19 17:39:26 1999
@@ -694,7 +694,7 @@
 
 sorry:
 	for (i = 0; i < nr; i++) {
-		clear_bit(BH_Dirty, &bh[i]->b_state);
+		mark_buffer_clean(bh[i]); /* remember to refile it */
 		clear_bit(BH_Uptodate, &bh[i]->b_state);
 		bh[i]->b_end_io(bh[i], 0);
 	}
diff -urN 2.3.18ac5/fs/buffer.c 2.3.18ac5-dirtycache/fs/buffer.c
--- 2.3.18ac5/fs/buffer.c	Thu Sep 16 17:56:01 1999
+++ 2.3.18ac5-dirtycache/fs/buffer.c	Sun Sep 19 17:50:59 1999
@@ -26,6 +26,8 @@
 
 /* Thread it... -DaveM */
 
+/* async buffer flushing, 1999 Andrea Arcangeli */
+
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/malloc.h>
@@ -76,6 +78,7 @@
 static struct buffer_head *lru_list[NR_LIST];
 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
 static int nr_buffers_type[NR_LIST] = {0,};
+static unsigned long size_buffers_type[NR_LIST] = {0,};
 
 static struct buffer_head * unused_list = NULL;
 static int nr_unused_buffer_heads = 0;
@@ -121,7 +124,7 @@
 		int dummy3; /* unused */
 	} b_un;
 	unsigned int data[N_PARAM];
-} bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
+} bdf_prm = {{60, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
 
 /* These are the min and max parameter values that we will allow to be assigned */
 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
@@ -482,6 +485,7 @@
 	(*bhp)->b_prev_free->b_next_free = bh;
 	(*bhp)->b_prev_free = bh;
 	nr_buffers_type[blist]++;
+	size_buffers_type[blist] += bh->b_size;
 }
 
 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
@@ -495,6 +499,7 @@
 			lru_list[blist] = NULL;
 		bh->b_next_free = bh->b_prev_free = NULL;
 		nr_buffers_type[blist]--;
+		size_buffers_type[blist] -= bh->b_size;
 	}
 }
 
@@ -813,6 +818,27 @@
 	return bh;
 }
 
+/* -1 -> no need to flush
+    0 -> async flush
+    1 -> sync flush (wait for I/O completion) */
+static int balance_dirty_state(kdev_t dev)
+{
+	unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
+
+	dirty = size_buffers_type[BUF_DIRTY];
+	tot = (nr_lru_pages + nr_free_pages) << PAGE_SHIFT;
+	hard_dirty_limit = tot * bdf_prm.b_un.nfract / 100;
+	soft_dirty_limit = hard_dirty_limit >> 1;
+
+	if (dirty > soft_dirty_limit)
+	{
+		if (dirty > hard_dirty_limit)
+			return 1;
+		return 0;
+	}
+	return -1;
+}
+
 /*
  * if a new dirty buffer is created we need to balance bdflush.
  *
@@ -820,23 +846,13 @@
  * pressures on different devices - thus the (currently unused)
  * 'dev' parameter.
  */
-static int too_many_dirty_buffers;
-
 void balance_dirty(kdev_t dev)
 {
-	int dirty = nr_buffers_type[BUF_DIRTY];
-	int ndirty = bdf_prm.b_un.ndirty;
+	int state = balance_dirty_state(dev);
 
-	if (dirty > ndirty) {
-		if (dirty > 2*ndirty) {
-			too_many_dirty_buffers = 1;
-			wakeup_bdflush(1);
-			return;
-		}
-		wakeup_bdflush(0);
-	}
-	too_many_dirty_buffers = 0;
-	return;
+	if (state < 0)
+		return;
+	wakeup_bdflush(state);
 }
 
 static inline void __mark_dirty(struct buffer_head *bh, int flag)
@@ -1345,6 +1361,7 @@
 	unsigned long bbits, blocks, i, len;
 	struct buffer_head *bh, *head;
 	char * target_buf;
+	int need_balance_dirty;
 
 	target_buf = (char *)page_address(page) + offset;
 
@@ -1384,6 +1401,7 @@
 	i = 0;
 	bh = head;
 	partial = 0;
+	need_balance_dirty = 0;
 	do {
 		if (!bh)
 			BUG();
@@ -1453,8 +1471,7 @@
 		set_bit(BH_Uptodate, &bh->b_state);
 		if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
 			__mark_dirty(bh, 0);
-			if (too_many_dirty_buffers)
-				balance_dirty(bh->b_dev);
+			need_balance_dirty = 1;
 		}
 
 		if (err) {
@@ -1468,6 +1485,9 @@
 		bh = bh->b_this_page;
 	} while (bh != head);
 
+	if (need_balance_dirty)
+		balance_dirty(bh->b_dev);
+
 	/*
 	 * is this a partial write that happened to make all buffers
 	 * uptodate then we can optimize away a bogus readpage() for
@@ -1499,6 +1519,7 @@
 	struct buffer_head *bh, *head;
 	char * target_buf, *target_data;
 	unsigned long data_offset = offset;
+	int need_balance_dirty;
 
 	offset = page->offset-inode->i_size;
 	if (offset < 0)
@@ -1546,6 +1567,7 @@
 	i = 0;
 	bh = head;
 	partial = 0;
+	need_balance_dirty = 0;
 	do {
 		if (!bh)
 			BUG();
@@ -1623,8 +1645,7 @@
 		set_bit(BH_Uptodate, &bh->b_state);
 		if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
 			__mark_dirty(bh, 0);
-			if (too_many_dirty_buffers)
-				balance_dirty(bh->b_dev);
+			need_balance_dirty = 1;
 		}
 
 		if (err) {
@@ -1638,6 +1659,9 @@
 		bh = bh->b_this_page;
 	} while (bh != head);
 
+	if (need_balance_dirty)
+		balance_dirty(bh->b_dev);
+
 	/*
 	 * is this a partial write that happened to make all buffers
 	 * uptodate then we can optimize away a bogus readpage() for
@@ -2147,7 +2171,6 @@
 
 busy_buffer_page:
 	/* Uhhuh, start writeback so that we don't end up with all dirty pages */
-	too_many_dirty_buffers = 1;
 	wakeup_bdflush(0);
 	ret = 0;
 	goto out;
@@ -2225,21 +2248,92 @@
  * response to dirty buffers. Once this process is activated, we write back
  * a limited number of buffers to the disks and then go back to sleep again.
  */
-static DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
 
 struct task_struct *bdflush_tsk = 0;
 
-void wakeup_bdflush(int wait)
+void wakeup_bdflush(int block)
 {
+	DECLARE_WAITQUEUE(wait, current);
+
 	if (current == bdflush_tsk)
 		return;
-	if (wait)
-		run_task_queue(&tq_disk);
-	wake_up(&bdflush_wait);
-	if (wait)
-		sleep_on(&bdflush_done);
-}
+	if (!block)
+	{
+		wake_up_process(bdflush_tsk);
+		return;
+	}
+
+	/* kflushd can wake us up before we have a chance to
+	   go to sleep, so we must be smart in handling
+	   this wakeup event from kflushd to avoid deadlocking in SMP
+	   (we are not holding any lock anymore in these two paths). */
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	add_wait_queue(&bdflush_done, &wait);
+
+	wake_up_process(bdflush_tsk);
+	schedule();
+
+	remove_wait_queue(&bdflush_done, &wait);
+	__set_current_state(TASK_RUNNING);
+}
+
+/* This is the _only_ function that deals with flushing async writes
+   to disk.
+   NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
+   as all dirty buffers live _only_ in the DIRTY lru list.
+   As we never browse the LOCKED and CLEAN lru lists they are in fact
+   completely useless. */
+static void flush_dirty_buffers(int check_flushtime)
+{
+	struct buffer_head * bh, *next;
+	int flushed = 0, i;
+
+ restart:
+	spin_lock(&lru_list_lock);
+	bh = lru_list[BUF_DIRTY];
+	if (!bh)
+		goto out_unlock;
+	for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next)
+	{
+		next = bh->b_next_free;
+
+		if (!buffer_dirty(bh))
+		{
+			refile_buffer(bh);
+			continue;
+		}
+		if (buffer_locked(bh))
+			continue;
+
+		if (check_flushtime)
+		{
+			/* The dirty lru list is chronologically ordered so
+			   if the current bh is not yet timed out,
+			   then also all the following bhs
+			   will be too young. */
+			if (time_before(jiffies, bh->b_flushtime))
+				goto out_unlock;
+		}
+		else
+		{
+			if (++flushed > bdf_prm.b_un.ndirty)
+				goto out_unlock;
+		}
+
+		/* OK, now we are committed to write it out. */
+		atomic_inc(&bh->b_count);
+		spin_unlock(&lru_list_lock);
+		ll_rw_block(WRITE, 1, &bh);
+		atomic_dec(&bh->b_count);
+
+		if (current->need_resched)
+			schedule();
+		goto restart;
+	}
+ out_unlock:
+	spin_unlock(&lru_list_lock);
+}
 
 /*
  * Here we attempt to write back old buffers. We also try to flush inodes
@@ -2251,47 +2345,13 @@
 
 static int sync_old_buffers(void)
 {
-	int nlist;
-
 	lock_kernel();
 	sync_supers(0);
 	sync_inodes(0);
 	unlock_kernel();
 
-	for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
-		struct buffer_head *bh;
-	repeat:
-		spin_lock(&lru_list_lock);
-		bh = lru_list[nlist];
-		if(bh) {
-			struct buffer_head *next;
-			int i;
-			for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
-				next = bh->b_next_free;
-
-				/* If the buffer is not on the proper list,
-				 * then refile it.
-				 */
-				if ((nlist == BUF_DIRTY &&
-				     (!buffer_dirty(bh) && !buffer_locked(bh))) ||
-				    (nlist == BUF_LOCKED && !buffer_locked(bh))) {
-					__refile_buffer(bh);
-					continue;
-				}
-
-				if (buffer_locked(bh) || !buffer_dirty(bh))
-					continue;
-
-				/* OK, now we are committed to write it out. */
-				atomic_inc(&bh->b_count);
-				spin_unlock(&lru_list_lock);
-				ll_rw_block(WRITE, 1, &bh);
-				atomic_dec(&bh->b_count);
-				goto repeat;
-			}
-		}
-		spin_unlock(&lru_list_lock);
-	}
+	flush_dirty_buffers(1);
+	/* must really sync all the active I/O requests to disk here */
 	run_task_queue(&tq_disk);
 	return 0;
 }
@@ -2367,79 +2427,38 @@
 
 	sprintf(current->comm, "kflushd");
 	bdflush_tsk = current;
 
-	for (;;) {
-		int nlist;
+	/* avoid getting signals */
+	spin_lock_irq(&current->sigmask_lock);
+	flush_signals(current);
+	sigfillset(&current->blocked);
+	recalc_sigpending(current);
+	spin_unlock_irq(&current->sigmask_lock);
 
+	for (;;) {
 		CHECK_EMERGENCY_SYNC
 
-		for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
-			int nr, major, written = 0;
-			struct buffer_head *next;
-
-		repeat:
-			spin_lock(&lru_list_lock);
-			next = lru_list[nlist];
-			nr = nr_buffers_type[nlist];
-			while (nr-- > 0) {
-				struct buffer_head *bh = next;
-
-				next = next->b_next_free;
-
-				/* If the buffer is not on the correct list,
-				 * then refile it.
-				 */
-				if ((nlist == BUF_DIRTY &&
-				     (!buffer_dirty(bh) && !buffer_locked(bh))) ||
-				    (nlist == BUF_LOCKED && !buffer_locked(bh))) {
-					__refile_buffer(bh);
-					continue;
-				}
-
-				/* If we aren't in panic mode, don't write out too much
-				 * at a time. Also, don't write out buffers we don't
-				 * really have to write out yet..
-				 */
-				if (!too_many_dirty_buffers) {
-					if (written > bdf_prm.b_un.ndirty)
-						break;
-					if (time_before(jiffies, bh->b_flushtime))
-						continue;
-				}
-
-				if (buffer_locked(bh) || !buffer_dirty(bh))
-					continue;
-
-				major = MAJOR(bh->b_dev);
-				written++;
+		flush_dirty_buffers(0);
 
-				/*
-				 * For the loop major we can try to do asynchronous writes,
-				 * but we have to guarantee that we're making some progress..
-				 */
-				atomic_inc(&bh->b_count);
-				spin_unlock(&lru_list_lock);
-				ll_rw_block(WRITE, 1, &bh);
-				atomic_dec(&bh->b_count);
-				goto repeat;
-			}
-			spin_unlock(&lru_list_lock);
-		}
-		run_task_queue(&tq_disk);
+		/* If wakeup_bdflush is going to wake us up
+		   after our bdflush_done wakeup, then
+		   we must make sure not to sleep
+		   in schedule_timeout, otherwise
+		   wakeup_bdflush may wait for our
+		   bdflush_done wakeup that would never arrive
+		   (as we would be sleeping) and so it would
+		   deadlock in SMP. */
+		__set_current_state(TASK_INTERRUPTIBLE);
 		wake_up(&bdflush_done);
-
 		/*
 		 * If there are still a lot of dirty buffers around,
 		 * skip the sleep and flush some more. Otherwise, we
-		 * sleep for a while and mark us as not being in panic
-		 * mode..
+		 * sleep for a while.
 		 */
-		if (!too_many_dirty_buffers || nr_buffers_type[BUF_DIRTY] < bdf_prm.b_un.ndirty) {
-			too_many_dirty_buffers = 0;
-			spin_lock_irq(&current->sigmask_lock);
-			flush_signals(current);
-			spin_unlock_irq(&current->sigmask_lock);
-			interruptible_sleep_on_timeout(&bdflush_wait, 5*HZ);
-		}
+		if (balance_dirty_state(NODEV) < 0)
+			schedule_timeout(5*HZ);
+		/* Remember to mark us as running, otherwise
+		   the next schedule will block. */
+		__set_current_state(TASK_RUNNING);
 	}
 }
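
For reference, the thresholds that the new balance_dirty_state() checks can be
worked through with a small user-space model. This is only an illustrative
sketch: the PAGE_SHIFT value, the memory size and the model_dirty_state() name
below are assumptions made for the example, not part of the patch; only the
arithmetic (hard limit = nfract% of lru+free memory, soft limit = half of the
hard limit) comes from the code above.

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumption: 4KiB pages, as on i386 */

/* mirrors the -1 / 0 / 1 return convention of balance_dirty_state() */
static int model_dirty_state(unsigned long dirty_bytes,
			     unsigned long lru_plus_free_pages, int nfract)
{
	unsigned long tot = lru_plus_free_pages << PAGE_SHIFT;
	unsigned long hard_dirty_limit = tot * nfract / 100;
	unsigned long soft_dirty_limit = hard_dirty_limit >> 1;

	if (dirty_bytes > soft_dirty_limit)
		return dirty_bytes > hard_dirty_limit ? 1 : 0;	/* sync : async */
	return -1;	/* nothing to flush */
}

int main(void)
{
	/* hypothetical machine: 16384 lru+free pages = 64MiB, nfract = 60 */
	unsigned long mb = 1024 * 1024;

	printf("10MiB dirty -> %d\n", model_dirty_state(10 * mb, 16384, 60));
	printf("25MiB dirty -> %d\n", model_dirty_state(25 * mb, 16384, 60));
	printf("45MiB dirty -> %d\n", model_dirty_state(45 * mb, 16384, 60));
	return 0;
}

With 64MiB of lru+free memory and the new default nfract of 60, the hard limit
is about 38MiB and the soft limit about 19MiB, so the three cases print -1, 0
and 1: writers are left alone below the soft limit, only wake kflushd
asynchronously between the two limits, and block in wakeup_bdflush() until
kflushd signals bdflush_done once the hard limit is exceeded.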