diff -urN 2.3.18ac5/drivers/block/ll_rw_blk.c 2.3.18ac5-dirtycache/drivers/block/ll_rw_blk.c
--- 2.3.18ac5/drivers/block/ll_rw_blk.c	Tue Sep 14 14:35:35 1999
+++ 2.3.18ac5-dirtycache/drivers/block/ll_rw_blk.c	Sun Sep 19 17:39:26 1999
@@ -694,7 +694,7 @@
 
 sorry:
 	for (i = 0; i < nr; i++) {
-		clear_bit(BH_Dirty, &bh[i]->b_state);
+		mark_buffer_clean(bh[i]); /* remember to refile it */
 		clear_bit(BH_Uptodate, &bh[i]->b_state);
 		bh[i]->b_end_io(bh[i], 0);
 	}
diff -urN 2.3.18ac5/fs/buffer.c 2.3.18ac5-dirtycache/fs/buffer.c
--- 2.3.18ac5/fs/buffer.c	Thu Sep 16 17:56:01 1999
+++ 2.3.18ac5-dirtycache/fs/buffer.c	Sun Sep 19 17:40:27 1999
@@ -26,6 +26,8 @@
 
 /* Thread it... -DaveM */
 
+/* async buffer flushing, 1999 Andrea Arcangeli */
+
 #include
 #include
 #include
@@ -76,6 +78,7 @@
 static struct buffer_head *lru_list[NR_LIST];
 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
 static int nr_buffers_type[NR_LIST] = {0,};
+static unsigned long size_buffers_type[NR_LIST] = {0,};
 
 static struct buffer_head * unused_list = NULL;
 static int nr_unused_buffer_heads = 0;
@@ -121,7 +124,7 @@
 		int dummy3;	/* unused */
 	} b_un;
 	unsigned int data[N_PARAM];
-} bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
+} bdf_prm = {{60, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
 
 /* These are the min and max parameter values that we will allow to be assigned */
 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
@@ -482,6 +485,7 @@
 	(*bhp)->b_prev_free->b_next_free = bh;
 	(*bhp)->b_prev_free = bh;
 	nr_buffers_type[blist]++;
+	size_buffers_type[blist] += bh->b_size;
 }
 
 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
@@ -495,6 +499,7 @@
 			lru_list[blist] = NULL;
 		bh->b_next_free = bh->b_prev_free = NULL;
 		nr_buffers_type[blist]--;
+		size_buffers_type[blist] -= bh->b_size;
 	}
 }
 
@@ -813,6 +818,27 @@
 	return bh;
 }
 
+/* -1 -> no need to flush
+    0 -> async flush
+    1 -> sync flush (wait for I/O completion) */
+static int balance_dirty_state(kdev_t dev)
+{
+	unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
+
+	dirty = size_buffers_type[BUF_DIRTY];
+	tot = atomic_read(&buffermem) + (nr_free_pages << PAGE_SHIFT);
+	hard_dirty_limit = tot * bdf_prm.b_un.nfract / 100;
+	soft_dirty_limit = hard_dirty_limit >> 1;
+
+	if (dirty > soft_dirty_limit)
+	{
+		if (dirty > hard_dirty_limit)
+			return 1;
+		return 0;
+	}
+	return -1;
+}
+
 /*
  * if a new dirty buffer is created we need to balance bdflush.
  *
@@ -820,23 +846,13 @@
  * pressures on different devices - thus the (currently unused)
  * 'dev' parameter.
  */
-static int too_many_dirty_buffers;
-
 void balance_dirty(kdev_t dev)
 {
-	int dirty = nr_buffers_type[BUF_DIRTY];
-	int ndirty = bdf_prm.b_un.ndirty;
+	int state = balance_dirty_state(dev);
 
-	if (dirty > ndirty) {
-		if (dirty > 2*ndirty) {
-			too_many_dirty_buffers = 1;
-			wakeup_bdflush(1);
-			return;
-		}
-		wakeup_bdflush(0);
-	}
-	too_many_dirty_buffers = 0;
-	return;
+	if (state < 0)
+		return;
+	wakeup_bdflush(state);
 }
 
 static inline void __mark_dirty(struct buffer_head *bh, int flag)
@@ -1345,6 +1361,7 @@
 	unsigned long bbits, blocks, i, len;
 	struct buffer_head *bh, *head;
 	char * target_buf;
+	int need_balance_dirty;
 
 	target_buf = (char *)page_address(page) + offset;
 
@@ -1384,6 +1401,7 @@
 	i = 0;
 	bh = head;
 	partial = 0;
+	need_balance_dirty = 0;
 	do {
 		if (!bh)
 			BUG();
@@ -1453,8 +1471,7 @@
 		set_bit(BH_Uptodate, &bh->b_state);
 		if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
 			__mark_dirty(bh, 0);
-			if (too_many_dirty_buffers)
-				balance_dirty(bh->b_dev);
+			need_balance_dirty = 1;
 		}
 
 		if (err) {
@@ -1468,6 +1485,9 @@
 		bh = bh->b_this_page;
 	} while (bh != head);
 
+	if (need_balance_dirty)
+		balance_dirty(bh->b_dev);
+
 	/*
 	 * is this a partial write that happened to make all buffers
 	 * uptodate then we can optimize away a bogus readpage() for
@@ -1499,6 +1519,7 @@
 	struct buffer_head *bh, *head;
 	char * target_buf, *target_data;
 	unsigned long data_offset = offset;
+	int need_balance_dirty;
 
 	offset = page->offset-inode->i_size;
 	if (offset < 0)
@@ -1546,6 +1567,7 @@
 	i = 0;
 	bh = head;
 	partial = 0;
+	need_balance_dirty = 0;
 	do {
 		if (!bh)
 			BUG();
@@ -1623,8 +1645,7 @@
 		set_bit(BH_Uptodate, &bh->b_state);
 		if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
 			__mark_dirty(bh, 0);
-			if (too_many_dirty_buffers)
-				balance_dirty(bh->b_dev);
+			need_balance_dirty = 1;
 		}
 
 		if (err) {
@@ -1638,6 +1659,9 @@
 		bh = bh->b_this_page;
 	} while (bh != head);
 
+	if (need_balance_dirty)
+		balance_dirty(bh->b_dev);
+
 	/*
 	 * is this a partial write that happened to make all buffers
 	 * uptodate then we can optimize away a bogus readpage() for
@@ -2147,7 +2171,6 @@
 
 busy_buffer_page:
 	/* Uhhuh, start writeback so that we don't end up with all dirty pages */
-	too_many_dirty_buffers = 1;
 	wakeup_bdflush(0);
 	ret = 0;
 	goto out;
@@ -2225,21 +2248,92 @@
  * response to dirty buffers. Once this process is activated, we write back
  * a limited number of buffers to the disks and then go back to sleep again.
  */
-static DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
 struct task_struct *bdflush_tsk = 0;
 
-void wakeup_bdflush(int wait)
+void wakeup_bdflush(int block)
 {
+	DECLARE_WAITQUEUE(wait, current);
+
 	if (current == bdflush_tsk)
 		return;
-	if (wait)
-		run_task_queue(&tq_disk);
-	wake_up(&bdflush_wait);
-	if (wait)
-		sleep_on(&bdflush_done);
-}
+	if (!block)
+	{
+		wake_up_process(bdflush_tsk);
+		return;
+	}
+
+	/* kflushd can wake us up before we have a chance to
+	   go to sleep, so we must be smart in handling
+	   this wakeup event from kflushd to avoid deadlocking in SMP
+	   (we are not holding any lock anymore in these two paths). */
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	add_wait_queue(&bdflush_done, &wait);
+
+	wake_up_process(bdflush_tsk);
+	schedule();
+
+	remove_wait_queue(&bdflush_done, &wait);
+	__set_current_state(TASK_RUNNING);
+}
+
+/* This is the _only_ function that deals with flushing async writes
+   to disk.
+   NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
+   as all dirty buffers live _only_ in the DIRTY lru list.
+   As we never browse the LOCKED and CLEAN lru lists they are in fact
+   completely useless. */
+static void flush_dirty_buffers(int check_flushtime)
+{
+	struct buffer_head * bh, *next;
+	int flushed = 0, i;
+
+ restart:
+	spin_lock(&lru_list_lock);
+	bh = lru_list[BUF_DIRTY];
+	if (!bh)
+		goto out_unlock;
+	for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next)
+	{
+		next = bh->b_next_free;
+
+		if (!buffer_dirty(bh))
+		{
+			refile_buffer(bh);
+			continue;
+		}
+		if (buffer_locked(bh))
+			continue;
+
+		if (check_flushtime)
+		{
+			/* The dirty lru list is chronologically ordered so
+			   if the current bh is not yet timed out,
+			   then also all the following bhs
+			   will be too young. */
+			if (time_before(jiffies, bh->b_flushtime))
+				goto out_unlock;
+		}
+		else
+		{
+			if (++flushed > bdf_prm.b_un.ndirty)
+				goto out_unlock;
+		}
+
+		/* OK, now we are committed to write it out. */
+		atomic_inc(&bh->b_count);
+		spin_unlock(&lru_list_lock);
+		ll_rw_block(WRITE, 1, &bh);
+		atomic_dec(&bh->b_count);
+
+		if (current->need_resched)
+			schedule();
+		goto restart;
+	}
+ out_unlock:
+	spin_unlock(&lru_list_lock);
+}
 
 /*
  * Here we attempt to write back old buffers. We also try to flush inodes
@@ -2251,47 +2345,13 @@
 
 static int sync_old_buffers(void)
 {
-	int nlist;
-
 	lock_kernel();
 	sync_supers(0);
 	sync_inodes(0);
 	unlock_kernel();
 
-	for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
-		struct buffer_head *bh;
-	repeat:
-		spin_lock(&lru_list_lock);
-		bh = lru_list[nlist];
-		if(bh) {
-			struct buffer_head *next;
-			int i;
-			for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
-				next = bh->b_next_free;
-
-				/* If the buffer is not on the proper list,
-				 * then refile it.
-				 */
-				if ((nlist == BUF_DIRTY &&
-				     (!buffer_dirty(bh) && !buffer_locked(bh))) ||
-				    (nlist == BUF_LOCKED && !buffer_locked(bh))) {
-					__refile_buffer(bh);
-					continue;
-				}
-
-				if (buffer_locked(bh) || !buffer_dirty(bh))
-					continue;
-
-				/* OK, now we are committed to write it out. */
-				atomic_inc(&bh->b_count);
-				spin_unlock(&lru_list_lock);
-				ll_rw_block(WRITE, 1, &bh);
-				atomic_dec(&bh->b_count);
-				goto repeat;
-			}
-		}
-		spin_unlock(&lru_list_lock);
-	}
+	flush_dirty_buffers(1);
+	/* must really sync all the active I/O requests to disk here */
 	run_task_queue(&tq_disk);
 	return 0;
 }
@@ -2367,79 +2427,38 @@
 	sprintf(current->comm, "kflushd");
 	bdflush_tsk = current;
 
-	for (;;) {
-		int nlist;
+	/* avoid getting signals */
+	spin_lock_irq(&current->sigmask_lock);
+	flush_signals(current);
+	sigfillset(&current->blocked);
+	recalc_sigpending(current);
+	spin_unlock_irq(&current->sigmask_lock);
 
+	for (;;) {
 		CHECK_EMERGENCY_SYNC
 
-		for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
-			int nr, major, written = 0;
-			struct buffer_head *next;
-
-		repeat:
-			spin_lock(&lru_list_lock);
-			next = lru_list[nlist];
-			nr = nr_buffers_type[nlist];
-			while (nr-- > 0) {
-				struct buffer_head *bh = next;
-
-				next = next->b_next_free;
-
-				/* If the buffer is not on the correct list,
-				 * then refile it.
-				 */
-				if ((nlist == BUF_DIRTY &&
-				     (!buffer_dirty(bh) && !buffer_locked(bh))) ||
-				    (nlist == BUF_LOCKED && !buffer_locked(bh))) {
-					__refile_buffer(bh);
-					continue;
-				}
-
-				/* If we aren't in panic mode, don't write out too much
-				 * at a time. Also, don't write out buffers we don't
-				 * really have to write out yet..
-				 */
-				if (!too_many_dirty_buffers) {
-					if (written > bdf_prm.b_un.ndirty)
-						break;
-					if (time_before(jiffies, bh->b_flushtime))
-						continue;
-				}
-
-				if (buffer_locked(bh) || !buffer_dirty(bh))
-					continue;
-
-				major = MAJOR(bh->b_dev);
-				written++;
+		flush_dirty_buffers(0);
 
-				/*
-				 * For the loop major we can try to do asynchronous writes,
-				 * but we have to guarantee that we're making some progress..
-				 */
-				atomic_inc(&bh->b_count);
-				spin_unlock(&lru_list_lock);
-				ll_rw_block(WRITE, 1, &bh);
-				atomic_dec(&bh->b_count);
-				goto repeat;
-			}
-			spin_unlock(&lru_list_lock);
-		}
-		run_task_queue(&tq_disk);
+		/* If wakeup_bdflush wakes us up
+		   after our bdflush_done wakeup, then
+		   we must make sure not to sleep
+		   in schedule_timeout, otherwise
+		   wakeup_bdflush may wait for our
+		   bdflush_done wakeup that would never arrive
+		   (as we would be sleeping) and so it would
+		   deadlock in SMP. */
+		__set_current_state(TASK_INTERRUPTIBLE);
 		wake_up(&bdflush_done);
-
 		/*
 		 * If there are still a lot of dirty buffers around,
 		 * skip the sleep and flush some more. Otherwise, we
-		 * sleep for a while and mark us as not being in panic
-		 * mode..
+		 * sleep for a while.
 		 */
-		if (!too_many_dirty_buffers || nr_buffers_type[BUF_DIRTY] < bdf_prm.b_un.ndirty) {
-			too_many_dirty_buffers = 0;
-			spin_lock_irq(&current->sigmask_lock);
-			flush_signals(current);
-			spin_unlock_irq(&current->sigmask_lock);
-			interruptible_sleep_on_timeout(&bdflush_wait, 5*HZ);
-		}
+		if (balance_dirty_state(NODEV) < 0)
+			schedule_timeout(5*HZ);
+		/* Remember to mark ourselves as running, otherwise
+		   the next schedule will block. */
+		__set_current_state(TASK_RUNNING);
 	}
 }
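
For reference, the heart of the new behaviour is the two-threshold heuristic in balance_dirty_state(): writers are left alone while dirty buffers stay below half of nfract% of (buffer cache + free) memory, they trigger an asynchronous kflushd wakeup above that soft limit, and they are made to wait for the flush once the hard limit itself is exceeded. The snippet below is not part of the patch; it is a stand-alone userspace sketch of that decision logic with made-up byte counts and explicit parameters (the kernel code reads size_buffers_type[BUF_DIRTY], buffermem and nr_free_pages instead).

#include <stdio.h>

/* Mirrors the patch's return convention:
 *  -1 -> no need to flush
 *   0 -> async flush (wake kflushd, don't wait)
 *   1 -> sync flush (wake kflushd and wait for it)
 */
static int balance_dirty_state_sketch(unsigned long dirty,
				      unsigned long buffermem,
				      unsigned long freemem,
				      unsigned int nfract)
{
	unsigned long tot = buffermem + freemem;
	unsigned long hard_dirty_limit = tot * nfract / 100;
	unsigned long soft_dirty_limit = hard_dirty_limit >> 1;

	if (dirty > soft_dirty_limit) {
		if (dirty > hard_dirty_limit)
			return 1;	/* over the hard limit: flush and wait */
		return 0;		/* over the soft limit: flush asynchronously */
	}
	return -1;			/* below the soft limit: nothing to do */
}

int main(void)
{
	/* made-up example: 64MB of buffer cache + free memory, nfract = 60 */
	unsigned long mb = 1024UL * 1024UL;

	printf("%d\n", balance_dirty_state_sketch(10 * mb, 32 * mb, 32 * mb, 60)); /* -1 */
	printf("%d\n", balance_dirty_state_sketch(25 * mb, 32 * mb, 32 * mb, 60)); /*  0 */
	printf("%d\n", balance_dirty_state_sketch(45 * mb, 32 * mb, 32 * mb, 60)); /*  1 */
	return 0;
}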