diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/drivers/block/as-iosched.c linux-2.6.5-rc1-mm2/drivers/block/as-iosched.c --- /opt/kernel/linux-2.6.5-rc1-mm2/drivers/block/as-iosched.c 2004-03-11 03:55:20.000000000 +0100 +++ linux-2.6.5-rc1-mm2/drivers/block/as-iosched.c 2004-03-19 16:10:26.794665823 +0100 @@ -1498,20 +1498,13 @@ as_insert_request(request_queue_t *q, st struct as_data *ad = q->elevator.elevator_data; struct as_rq *arq = RQ_DATA(rq); - if (arq) { - if (arq->state != AS_RQ_PRESCHED) { - printk("arq->state: %d\n", arq->state); - WARN_ON(1); - } + if (arq) arq->state = AS_RQ_NEW; - } /* barriers must flush the reorder queue */ if (unlikely(rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER) - && where == ELEVATOR_INSERT_SORT)) { - WARN_ON(1); + && where == ELEVATOR_INSERT_SORT)) where = ELEVATOR_INSERT_BACK; - } switch (where) { case ELEVATOR_INSERT_BACK: @@ -1526,6 +1519,8 @@ as_insert_request(request_queue_t *q, st break; case ELEVATOR_INSERT_FRONT: list_add(&rq->queuelist, ad->dispatch); + if (blk_fs_request(rq)) + ad->nr_dispatched++; as_antic_stop(ad); break; case ELEVATOR_INSERT_SORT: diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/drivers/block/ll_rw_blk.c linux-2.6.5-rc1-mm2/drivers/block/ll_rw_blk.c --- /opt/kernel/linux-2.6.5-rc1-mm2/drivers/block/ll_rw_blk.c 2004-03-19 15:13:57.000000000 +0100 +++ linux-2.6.5-rc1-mm2/drivers/block/ll_rw_blk.c 2004-03-19 16:10:26.794665823 +0100 @@ -255,6 +255,28 @@ void blk_queue_make_request(request_queu EXPORT_SYMBOL(blk_queue_make_request); /** + * blk_queue_ordered - does this queue support ordered writes + * @q: the request queue + * @flag: see below + * + * Description: + * For journalled file systems, doing ordered writes on a commit + * block instead of explicitly doing wait_on_buffer (which is bad + * for performance) can be a big win. Block drivers supporting this + * feature should call this function and indicate so. + * + **/ +void blk_queue_ordered(request_queue_t *q, int flag) +{ + if (flag) + set_bit(QUEUE_FLAG_ORDERED, &q->queue_flags); + else + clear_bit(QUEUE_FLAG_ORDERED, &q->queue_flags); +} + +EXPORT_SYMBOL(blk_queue_ordered); + +/** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device * @dma_addr: bus address limit @@ -1895,6 +1917,43 @@ int blk_execute_rq(request_queue_t *q, s EXPORT_SYMBOL(blk_execute_rq); +/* + * the idea here is to insert a SYNC_CACHE scsi command, and let lower layers + * transform it if they have to. 
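(SYNC_CACHE is the SCSI SYNCHRONIZE CACHE command, opcode 0x35; that is what + * rq->cmd[0] is set to below, and what ide_transform_pc_req() in ide-io.c + * turns into an ATA cache flush.)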
two possible ways to fix this to work on + * dm/md: turns the last part of this into a queue ->issue_flush_fn() so + * drivers can implement, or + */ +int blkdev_issue_flush(struct block_device *bdev) +{ + request_queue_t *q; + struct request *rq; + int ret; + + if (bdev->bd_disk == NULL) + return -ENXIO; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + if (!q->request_fn) + return -EOPNOTSUPP; + + rq = blk_get_request(q, WRITE, __GFP_WAIT); + + memset(rq->cmd, 0, sizeof(rq->cmd)); + rq->cmd[0] = 0x35; + rq->flags |= REQ_BLOCK_PC | REQ_SOFTBARRIER; + rq->data = NULL; + rq->cmd_len = 12; + rq->timeout = 60 * HZ; + + ret = blk_execute_rq(q, bdev->bd_disk, rq); + blk_put_request(rq); + return ret; +} + +EXPORT_SYMBOL(blkdev_issue_flush); + void drive_stat_acct(struct request *rq, int nr_sectors, int new_io) { int rw = rq_data_dir(rq); @@ -1973,6 +2032,8 @@ void __blk_put_request(request_queue_t * if (unlikely(!q)) return; + + WARN_ON(!req->ref_count); if (unlikely(--req->ref_count)) return; @@ -2148,7 +2209,7 @@ EXPORT_SYMBOL(__blk_attempt_remerge); static int __make_request(request_queue_t *q, struct bio *bio) { struct request *req, *freereq = NULL; - int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra; + int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err; sector_t sector; sector = bio->bi_sector; @@ -2166,9 +2227,11 @@ static int __make_request(request_queue_ spin_lock_prefetch(q->queue_lock); - barrier = test_bit(BIO_RW_BARRIER, &bio->bi_rw); - - ra = bio->bi_rw & (1 << BIO_RW_AHEAD); + barrier = bio_barrier(bio); + if (barrier && !(q->queue_flags & (1 << QUEUE_FLAG_ORDERED))) { + err = -EOPNOTSUPP; + goto end_io; + } again: spin_lock_irq(q->queue_lock); @@ -2248,7 +2311,8 @@ get_rq: /* * READA bit set */ - if (ra) + err = -EWOULDBLOCK; + if (bio_rw_ahead(bio)) goto end_io; freereq = get_request_wait(q, rw); @@ -2259,10 +2323,9 @@ get_rq: req->flags |= REQ_CMD; /* - * inherit FAILFAST from bio and don't stack up - * retries for read ahead + * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) */ - if (ra || test_bit(BIO_RW_FAILFAST, &bio->bi_rw)) + if (bio_rw_ahead(bio) || bio_failfast(bio)) req->flags |= REQ_FAILFAST; /* @@ -2300,7 +2363,7 @@ out: return 0; end_io: - bio_endio(bio, nr_sectors << 9, -EWOULDBLOCK); + bio_endio(bio, nr_sectors << 9, err); return 0; } diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/drivers/ide/ide-disk.c linux-2.6.5-rc1-mm2/drivers/ide/ide-disk.c --- /opt/kernel/linux-2.6.5-rc1-mm2/drivers/ide/ide-disk.c 2004-03-19 15:13:51.000000000 +0100 +++ linux-2.6.5-rc1-mm2/drivers/ide/ide-disk.c 2004-03-19 16:10:26.987645047 +0100 @@ -1361,6 +1361,7 @@ static int set_nowerr(ide_drive_t *drive static int write_cache (ide_drive_t *drive, int arg) { ide_task_t args; + int err; if (!(drive->id->cfs_enable_2 & 0x3000)) return 1; @@ -1371,7 +1372,10 @@ static int write_cache (ide_drive_t *dri args.tfRegister[IDE_COMMAND_OFFSET] = WIN_SETFEATURES; args.command_type = IDE_DRIVE_TASK_NO_DATA; args.handler = &task_no_data_intr; - (void) ide_raw_taskfile(drive, &args, NULL); + + err = ide_raw_taskfile(drive, &args, NULL); + if (err) + return err; drive->wcache = arg; return 0; @@ -1680,6 +1684,8 @@ static void idedisk_setup (ide_drive_t * if (drive->id->cfs_enable_2 & 0x3000) write_cache(drive, (id->cfs_enable_2 & 0x3000)); + blk_queue_ordered(drive->queue, 1); + #ifdef CONFIG_BLK_DEV_IDE_TCQ_DEFAULT if (drive->using_dma) __ide_dma_queued_on(drive); @@ -1728,10 +1734,14 @@ static ide_driver_t idedisk_driver = { static int 
idedisk_open(struct inode *inode, struct file *filp) { ide_drive_t *drive = inode->i_bdev->bd_disk->private_data; + u8 cf; + drive->usage++; - if (drive->removable && drive->usage == 1) { + if (drive->usage != 1) + return 0; + + if (drive->removable) { ide_task_t args; - u8 cf; memset(&args, 0, sizeof(ide_task_t)); args.tfRegister[IDE_COMMAND_OFFSET] = WIN_DOORLOCK; args.command_type = IDE_DRIVE_TASK_NO_DATA; @@ -1744,18 +1754,19 @@ static int idedisk_open(struct inode *in */ if (drive->doorlocking && ide_raw_taskfile(drive, &args, NULL)) drive->doorlocking = 0; - drive->wcache = 0; - /* Cache enabled ? */ - if (drive->id->csfo & 1) - drive->wcache = 1; - /* Cache command set available ? */ - if (drive->id->cfs_enable_1 & (1<<5)) - drive->wcache = 1; - /* ATA6 cache extended commands */ - cf = drive->id->command_set_2 >> 24; - if((cf & 0xC0) == 0x40 && (cf & 0x30) != 0) - drive->wcache = 1; } + + drive->wcache = 0; + /* Cache enabled ? */ + if (drive->id->csfo & 1) + drive->wcache = 1; + /* Cache command set available ? */ + if (drive->id->cfs_enable_1 & (1<<5)) + drive->wcache = 1; + /* ATA6 cache extended commands */ + cf = drive->id->command_set_2 >> 24; + if((cf & 0xC0) == 0x40 && (cf & 0x30) != 0) + drive->wcache = 1; return 0; } diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/drivers/ide/ide-io.c linux-2.6.5-rc1-mm2/drivers/ide/ide-io.c --- /opt/kernel/linux-2.6.5-rc1-mm2/drivers/ide/ide-io.c 2004-03-19 15:13:51.000000000 +0100 +++ linux-2.6.5-rc1-mm2/drivers/ide/ide-io.c 2004-03-19 16:10:26.990644724 +0100 @@ -54,30 +54,84 @@ #include #include -/** - * ide_end_request - complete an IDE I/O - * @drive: IDE device for the I/O - * @uptodate: - * @nr_sectors: number of sectors completed - * - * This is our end_request wrapper function. We complete the I/O - * update random number input and dequeue the request, which if - * it was tagged may be out of order. 
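+/* + * build a cache flush taskfile in @rq: WIN_FLUSH_CACHE_EXT when the drive + * advertises the extended flush (id->cfs_enable_2 & 0x2400), plain + * WIN_FLUSH_CACHE otherwise + */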
+static void ide_fill_flush_cmd(ide_drive_t *drive, struct request *rq) +{ + memset(drive->special_buf, 0, sizeof(drive->special_buf)); + + rq->flags &= ~REQ_BLOCK_PC; + rq->flags |= REQ_DRIVE_TASK | REQ_STARTED; + rq->buffer = drive->special_buf; + rq->buffer[0] = WIN_FLUSH_CACHE; + + if (drive->id->cfs_enable_2 & 0x2400) + rq->buffer[0] = WIN_FLUSH_CACHE_EXT; +} + +static int ide_transform_pc_req(ide_drive_t *drive, struct request *rq) +{ + if (rq->cmd[0] != 0x35) { + ide_end_request(drive, 0, 0); + return 1; + } + + if (!drive->wcache) { + ide_end_request(drive, 1, 0); + return 1; + } + + ide_fill_flush_cmd(drive, rq); + return 0; +} + +/* + * preempt pending requests, and store this cache flush for immediate + * execution */ - -int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors) +static struct request *ide_queue_flush_cmd(ide_drive_t *drive, + struct request *rq, int post) { - struct request *rq; - unsigned long flags; - int ret = 1; + struct request *flush_rq = &HWGROUP(drive)->wrq; - spin_lock_irqsave(&ide_lock, flags); - rq = HWGROUP(drive)->rq; + /* + * write cache disabled, just return barrier write immediately + */ + if (!drive->wcache) + return rq; - BUG_ON(!(rq->flags & REQ_STARTED)); + /* + * if last rq issued was the post-flush, we can skip the pre-flush + */ +#if 0 + if (drive->last_rq_flush) { + rq->flags |= REQ_BAR_PREFLUSH; + return rq; + } +#endif - if (!nr_sectors) - nr_sectors = rq->hard_cur_sectors; + ide_init_drive_cmd(flush_rq); + ide_fill_flush_cmd(drive, flush_rq); + + flush_rq->special = rq; + flush_rq->nr_sectors = rq->nr_sectors; + + if (!post) { + drive->doing_barrier = 1; + flush_rq->flags |= REQ_BAR_PREFLUSH; + blkdev_dequeue_request(rq); + } else + flush_rq->flags |= REQ_BAR_POSTFLUSH; + + __elv_add_request(drive->queue, flush_rq, ELEVATOR_INSERT_FRONT, 0); + HWGROUP(drive)->rq = NULL; + return flush_rq; +} + +static int __ide_end_request(ide_drive_t *drive, struct request *rq, + int uptodate, int nr_sectors) +{ + int ret = 1; + + BUG_ON(!(rq->flags & REQ_STARTED)); /* * if failfast is set on a request, override number of sectors and @@ -86,6 +140,9 @@ int ide_end_request (ide_drive_t *drive, if (blk_noretry_request(rq) && !uptodate) nr_sectors = rq->hard_nr_sectors; + if (!blk_fs_request(rq) && !uptodate && !rq->errors) + rq->errors = -EIO; + /* * decide whether to reenable DMA -- 3 is a random magic for now, * if we DMA timeout more than 3 times, just stay in PIO @@ -97,14 +154,54 @@ int ide_end_request (ide_drive_t *drive, if (!end_that_request_first(rq, uptodate, nr_sectors)) { add_disk_randomness(rq->rq_disk); - if (!blk_rq_tagged(rq)) - blkdev_dequeue_request(rq); - else + + if (blk_rq_tagged(rq)) blk_queue_end_tag(drive->queue, rq); - HWGROUP(drive)->rq = NULL; + + blkdev_dequeue_request(rq); end_that_request_last(rq); + HWGROUP(drive)->rq = NULL; ret = 0; } + + return ret; +} + +/** + * ide_end_request - complete an IDE I/O + * @drive: IDE device for the I/O + * @uptodate: + * @nr_sectors: number of sectors completed + * + * This is our end_request wrapper function. We complete the I/O + * update random number input and dequeue the request, which if + * it was tagged may be out of order. 
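(@uptodate follows the usual convention here: non-zero means the request + * completed successfully, zero means it failed - compare ide_transform_pc_req() + * above, which ends the flush request with 1 or 0.)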
+ */ + +int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors) +{ + struct request *rq; + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&ide_lock, flags); + rq = HWGROUP(drive)->rq; + + if (!nr_sectors) + nr_sectors = rq->hard_cur_sectors; + + if (!blk_barrier_rq(rq)) + ret = __ide_end_request(drive, rq, uptodate, nr_sectors); + else { + struct request *flush_rq = &HWGROUP(drive)->wrq; + + flush_rq->nr_sectors -= nr_sectors; + if (!flush_rq->nr_sectors) { + ide_queue_flush_cmd(drive, rq, 1); + ret = 0; + } + } + spin_unlock_irqrestore(&ide_lock, flags); return ret; } @@ -140,6 +237,97 @@ static void ide_complete_pm_request (ide spin_unlock_irqrestore(&ide_lock, flags); } +/* + * FIXME: probably move this somewhere else, name is bad too :) + */ +static sector_t ide_get_error_location(ide_drive_t *drive, char *args) +{ + u32 high, low; + u8 hcyl, lcyl, sect; + sector_t sector; + + high = 0; + hcyl = args[5]; + lcyl = args[4]; + sect = args[3]; + + if (drive->id->cfs_enable_2 & 0x2400) { + low = (hcyl << 16) | (lcyl << 8) | sect; + HWIF(drive)->OUTB(drive->ctl|0x80, IDE_CONTROL_REG); + high = ide_read_24(drive); + } else { + u8 cur = HWIF(drive)->INB(IDE_SELECT_REG); + if (cur & 0x40) + low = (hcyl << 16) | (lcyl << 8) | sect; + else { + low = hcyl * drive->head * drive->sect; + low += lcyl * drive->sect; + low += sect - 1; + } + } + + sector = ((sector_t) high << 24) | low; + return sector; +} + +static void ide_complete_barrier(ide_drive_t *drive, struct request *rq, + int error) +{ + struct request *real_rq = rq->special; + int good_sectors, bad_sectors; + sector_t sector; + + if (!error) { + if (blk_barrier_postflush(rq)) { + /* + * this completes the barrier write + */ + __ide_end_request(drive, real_rq, 1, real_rq->hard_nr_sectors); + drive->doing_barrier = 0; + drive->last_rq_flush = 1; + } else { + /* + * just indicate that we did the pre flush + */ + real_rq->flags |= REQ_BAR_PREFLUSH; + __elv_add_request(drive->queue, real_rq, ELEVATOR_INSERT_FRONT, 0); + } + +#ifdef IDE_DUMP_FLUSH_TIMINGS + printk("%s: %sflush took %lu jiffies\n", drive->name, blk_barrier_postflush(rq) ? "post" : "pre", jiffies - rq->timeout); +#endif + + /* + * all is fine, return + */ + return; + } + + /* + * bummer, flush failed. if it was the pre-flush, fail the barrier. 
+ * if it was the post-flush, complete the successful part of the request + * and fail the rest + */ + good_sectors = 0; + if (blk_barrier_postflush(rq)) { + sector = ide_get_error_location(drive, rq->buffer); + + if ((sector >= real_rq->hard_sector) && + (sector < real_rq->hard_sector + real_rq->hard_nr_sectors)) + good_sectors = sector - real_rq->hard_sector; + } else + sector = real_rq->hard_sector; + + bad_sectors = real_rq->hard_nr_sectors - good_sectors; + if (good_sectors) + __ide_end_request(drive, real_rq, 1, good_sectors); + if (bad_sectors) + __ide_end_request(drive, real_rq, 0, bad_sectors); + + printk(KERN_ERR "%s: failed barrier write: sector=%Lx(good=%d/bad=%d)\n", drive->name, sector, good_sectors, bad_sectors); + blk_queue_ordered(drive->queue, 0); +} + /** * ide_end_drive_cmd - end an explicit drive command * @drive: command @@ -229,6 +417,10 @@ void ide_end_drive_cmd (ide_drive_t *dri spin_lock_irqsave(&ide_lock, flags); blkdev_dequeue_request(rq); + + if (blk_barrier_preflush(rq) || blk_barrier_postflush(rq)) + ide_complete_barrier(drive, rq, err); + HWGROUP(drive)->rq = NULL; end_that_request_last(rq); spin_unlock_irqrestore(&ide_lock, flags); @@ -610,6 +802,16 @@ ide_startstop_t start_request (ide_drive if (drive->suspend_reset) goto kill_rq; + /* + * basic transformation support for scsi -> ata commands + */ + if (blk_pc_request(rq)) { + if (drive->media != ide_disk) + goto kill_rq; + if (ide_transform_pc_req(drive, rq)) + return ide_stopped; + } + block = rq->sector; if (blk_fs_request(rq) && (drive->media == ide_disk || drive->media == ide_floppy)) { @@ -715,6 +917,15 @@ static inline ide_drive_t *choose_drive repeat: best = NULL; drive = hwgroup->drive; + + /* + * drive is doing pre-flush, ordered write, post-flush sequence. even + * though that is 3 requests, it must be seen as a single transaction. + * we must not preempt this drive until that is complete + */ + if (drive->doing_barrier) + return drive; + do { if ((!drive->sleep || time_after_eq(jiffies, drive->sleep)) && !elv_queue_empty(drive->queue)) { @@ -882,6 +1093,15 @@ queue_next: } /* + * if rq is a barrier write, issue pre cache flush if not + * already done + */ + if (blk_barrier_rq(rq) && !blk_barrier_preflush(rq)) + rq = ide_queue_flush_cmd(drive, rq, 0); + + drive->last_rq_flush = 0; + + /* * Sanity: don't accept a request that isn't a PM request * if we are currently power managed. This is very important as * blk_stop_queue() doesn't prevent the elv_next_request() @@ -900,6 +1120,10 @@ queue_next: break; } + + /* + * we can only queue read-write requests, so let the drive + * queue drain before continuing with this command.
+ */ if (!rq->bio && ata_pending_commands(drive)) break; @@ -1305,6 +1529,7 @@ void ide_init_drive_cmd (struct request { memset(rq, 0, sizeof(*rq)); rq->flags = REQ_DRIVE_CMD; + rq->ref_count = 1; } EXPORT_SYMBOL(ide_init_drive_cmd); diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/buffer.c linux-2.6.5-rc1-mm2/fs/buffer.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/buffer.c 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/buffer.c 2004-03-19 16:10:24.604901573 +0100 @@ -1589,6 +1589,7 @@ int try_to_release_page(struct page *pag return mapping->a_ops->releasepage(page, gfp_mask); return try_to_free_buffers(page); } +EXPORT_SYMBOL(try_to_release_page); /** * block_invalidatepage - invalidate part of all of a buffer-backed page @@ -2707,6 +2708,9 @@ int submit_bh(int rw, struct buffer_head if (rw == READ && buffer_dirty(bh)) buffer_error(); + if (buffer_ordered(bh) && (rw == WRITE)) + rw = WRITE_BARRIER; + /* Only clear out a write error when rewriting */ if (test_set_buffer_req(bh) && rw == WRITE) clear_buffer_write_io_error(bh); diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/ext3/super.c linux-2.6.5-rc1-mm2/fs/ext3/super.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/ext3/super.c 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/ext3/super.c 2004-03-19 16:10:24.212943771 +0100 @@ -536,7 +536,8 @@ enum { Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_noload, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_ignore, Opt_err, + Opt_ignore, Opt_barrier, + Opt_err, }; static match_table_t tokens = { @@ -575,6 +576,7 @@ static match_table_t tokens = { {Opt_ignore, "noquota"}, {Opt_ignore, "quota"}, {Opt_ignore, "usrquota"}, + {Opt_barrier, "barrier=%u"}, {Opt_err, NULL} }; @@ -762,6 +764,14 @@ static int parse_options (char * options case Opt_abort: set_opt(sbi->s_mount_opt, ABORT); break; + case Opt_barrier: + if (match_int(&args[0], &option)) + return 0; + if (option) + set_opt(sbi->s_mount_opt, BARRIER); + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; case Opt_ignore: break; default: @@ -1419,16 +1429,23 @@ out_fail: * initial mount, once the journal has been initialised but before we've * done any recovery; and again on any subsequent remount. */ -static void ext3_init_journal_params(struct ext3_sb_info *sbi, - journal_t *journal) +static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) { + struct ext3_sb_info *sbi = EXT3_SB(sb); + if (sbi->s_commit_interval) journal->j_commit_interval = sbi->s_commit_interval; /* We could also set up an ext3-specific default for the commit * interval here, but for now we'll just fall back to the jbd * default. 
*/ -} + spin_lock(&journal->j_state_lock); + if (test_opt(sb, BARRIER)) + journal->j_flags |= JFS_BARRIER; + else + journal->j_flags &= ~JFS_BARRIER; + spin_unlock(&journal->j_state_lock); +} static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum) { @@ -1465,7 +1482,7 @@ static journal_t *ext3_get_journal(struc iput(journal_inode); } journal->j_private = sb; - ext3_init_journal_params(EXT3_SB(sb), journal); + ext3_init_journal_params(sb, journal); return journal; } @@ -1550,7 +1567,7 @@ static journal_t *ext3_get_dev_journal(s goto out_journal; } EXT3_SB(sb)->journal_bdev = bdev; - ext3_init_journal_params(EXT3_SB(sb), journal); + ext3_init_journal_params(sb, journal); return journal; out_journal: journal_destroy(journal); @@ -1843,7 +1860,7 @@ int ext3_remount (struct super_block * s es = sbi->s_es; - ext3_init_journal_params(sbi, sbi->s_journal); + ext3_init_journal_params(sb, sbi->s_journal); if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/jbd/commit.c linux-2.6.5-rc1-mm2/fs/jbd/commit.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/jbd/commit.c 2004-03-11 03:55:44.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/jbd/commit.c 2004-03-19 16:10:24.488914060 +0100 @@ -636,6 +636,8 @@ wait_for_iobuf: { struct buffer_head *bh = jh2bh(descriptor); set_buffer_uptodate(bh); + if (journal->j_flags & JFS_BARRIER) + set_buffer_ordered(bh); sync_dirty_buffer(bh); if (unlikely(!buffer_uptodate(bh))) err = -EIO; diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/Kconfig linux-2.6.5-rc1-mm2/fs/Kconfig --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/Kconfig 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/Kconfig 2004-03-19 16:10:24.604901573 +0100 @@ -244,6 +244,40 @@ config REISERFS_PROC_INFO Almost everyone but ReiserFS developers and people fine-tuning reiserfs or tracing problems should say N. +config REISERFS_FS_XATTR + bool "ReiserFS extended attributes" + depends on REISERFS_FS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + +config REISERFS_FS_POSIX_ACL + bool "ReiserFS POSIX Access Control Lists" + depends on REISERFS_FS_XATTR + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config REISERFS_FS_SECURITY + bool "ReiserFS Security Labels" + depends on REISERFS_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ReiserFS filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + config JFS_FS tristate "JFS filesystem support" select NLS @@ -282,13 +316,13 @@ config JFS_STATISTICS to be made available to the user in the /proc/fs/jfs/ directory. config FS_POSIX_ACL -# Posix ACL utility routines (for now, only ext2/ext3/jfs) +# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs) # # NOTE: you can implement Posix ACLs without these helpers (XFS does). # Never use this symbol for ifdefs. 
# bool - depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL + depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL || REISERFS_FS_POSIX_ACL default y config XFS_FS diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/bitmap.c linux-2.6.5-rc1-mm2/fs/reiserfs/bitmap.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/bitmap.c 2004-03-11 03:55:35.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/bitmap.c 2004-03-19 16:10:24.015964978 +0100 @@ -12,6 +12,7 @@ #include #include #include +#include #define PREALLOCATION_SIZE 9 @@ -281,7 +282,8 @@ static int scan_bitmap (struct reiserfs_ } static void _reiserfs_free_block (struct reiserfs_transaction_handle *th, - b_blocknr_t block) + struct inode *inode, b_blocknr_t block, + int for_unformatted) { struct super_block * s = th->t_super; struct reiserfs_super_block * rs; @@ -323,11 +325,13 @@ static void _reiserfs_free_block (struct set_sb_free_blocks( rs, sb_free_blocks(rs) + 1 ); journal_mark_dirty (th, s, sbh); - s->s_dirt = 1; + if (for_unformatted) + DQUOT_FREE_BLOCK_NODIRTY(inode, 1); } void reiserfs_free_block (struct reiserfs_transaction_handle *th, - b_blocknr_t block) + struct inode *inode, b_blocknr_t block, + int for_unformatted) { struct super_block * s = th->t_super; @@ -335,42 +339,46 @@ void reiserfs_free_block (struct reiserf RFALSE(is_reusable (s, block, 1) == 0, "vs-4071: can not free such block"); /* mark it before we clear it, just in case */ journal_mark_freed(th, s, block) ; - _reiserfs_free_block(th, block) ; + _reiserfs_free_block(th, inode, block, for_unformatted) ; } /* preallocated blocks don't need to be run through journal_mark_freed */ void reiserfs_free_prealloc_block (struct reiserfs_transaction_handle *th, - b_blocknr_t block) { + struct inode *inode, b_blocknr_t block) { RFALSE(!th->t_super, "vs-4060: trying to free block on nonexistent device"); RFALSE(is_reusable (th->t_super, block, 1) == 0, "vs-4070: can not free such block"); - _reiserfs_free_block(th, block) ; + _reiserfs_free_block(th, inode, block, 1) ; } static void __discard_prealloc (struct reiserfs_transaction_handle * th, struct reiserfs_inode_info *ei) { unsigned long save = ei->i_prealloc_block ; + int dirty = 0; + struct inode *inode = &ei->vfs_inode; #ifdef CONFIG_REISERFS_CHECK if (ei->i_prealloc_count < 0) reiserfs_warning("zam-4001:%s: inode has negative prealloc blocks count.\n", __FUNCTION__ ); #endif while (ei->i_prealloc_count > 0) { - reiserfs_free_prealloc_block(th,ei->i_prealloc_block); + reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block); ei->i_prealloc_block++; ei->i_prealloc_count --; + dirty = 1; } + if (dirty) + reiserfs_update_sd(th, inode); ei->i_prealloc_block = save; list_del_init(&(ei->i_prealloc_list)); } /* FIXME: It should be inline function */ void reiserfs_discard_prealloc (struct reiserfs_transaction_handle *th, - struct inode * inode) + struct inode *inode) { struct reiserfs_inode_info *ei = REISERFS_I(inode); - if (ei->i_prealloc_count) { + if (ei->i_prealloc_count) __discard_prealloc(th, ei); - } } void reiserfs_discard_all_prealloc (struct reiserfs_transaction_handle *th) @@ -772,6 +780,24 @@ static inline int blocknrs_and_prealloc_ int nr_allocated = 0; determine_prealloc_size(hint); + if (!hint->formatted_node) { + int quota_ret; +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota: allocating %d blocks id=%u\n", amount_needed, hint->inode->i_uid); +#endif + quota_ret = DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed); + if (quota_ret) 
/* Quota exceeded? */ + return QUOTA_EXCEEDED; + if (hint->preallocate && hint->prealloc_size ) { +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota: allocating (prealloc) %d blocks id=%u\n", hint->prealloc_size, hint->inode->i_uid); +#endif + quota_ret = DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode, hint->prealloc_size); + if (quota_ret) + hint->preallocate=hint->prealloc_size=0; + } + } + while((nr_allocated += allocate_without_wrapping_disk(hint, new_blocknrs + nr_allocated, start, finish, amount_needed - nr_allocated, hint->prealloc_size)) @@ -779,8 +805,14 @@ static inline int blocknrs_and_prealloc_ /* not all blocks were successfully allocated yet*/ if (second_pass) { /* it was a second pass; we must free all blocks */ + if (!hint->formatted_node) { +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota: freeing (nospace) %d blocks id=%u\n", amount_needed + hint->prealloc_size - nr_allocated, hint->inode->i_uid); +#endif + DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); /* Free not allocated blocks */ + } while (nr_allocated --) - reiserfs_free_block(hint->th, new_blocknrs[nr_allocated]); + reiserfs_free_block(hint->th, hint->inode, new_blocknrs[nr_allocated], !hint->formatted_node); return NO_DISK_SPACE; } else { /* refine search parameters for next pass */ @@ -789,7 +821,19 @@ static inline int blocknrs_and_prealloc_ start = 0; continue; } - } + } + if ( !hint->formatted_node && + amount_needed + hint->prealloc_size > + nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) { + /* Some of preallocation blocks were not allocated */ +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota: freeing (failed prealloc) %d blocks id=%u\n", amount_needed + hint->prealloc_size - nr_allocated - INODE_INFO(hint->inode)->i_prealloc_count, hint->inode->i_uid); +#endif + DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + + hint->prealloc_size - nr_allocated - + REISERFS_I(hint->inode)->i_prealloc_count); + } + return CARRY_ON; } @@ -858,7 +902,7 @@ int reiserfs_allocate_blocknrs(reiserfs_ if (ret != CARRY_ON) { while (amount_needed ++ < initial_amount_needed) { - reiserfs_free_block(hint->th, *(--new_blocknrs)); + reiserfs_free_block(hint->th, hint->inode, *(--new_blocknrs), 1); } } return ret; diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/dir.c linux-2.6.5-rc1-mm2/fs/reiserfs/dir.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/dir.c 2004-03-19 15:13:52.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/dir.c 2004-03-19 16:10:24.009965624 +0100 @@ -115,6 +115,17 @@ static int reiserfs_readdir (struct file /* too big to send back to VFS */ continue ; } + + /* Ignore the .reiserfs_priv entry */ + if (reiserfs_xattrs (inode->i_sb) && + !old_format_only(inode->i_sb) && + filp->f_dentry == inode->i_sb->s_root && + REISERFS_SB(inode->i_sb)->priv_root && + REISERFS_SB(inode->i_sb)->priv_root->d_inode && + deh_objectid(deh) == le32_to_cpu (INODE_PKEY(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->k_objectid)) { + continue; + } + d_off = deh_offset (deh); filp->f_pos = d_off ; d_ino = deh_objectid (deh); diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/do_balan.c linux-2.6.5-rc1-mm2/fs/reiserfs/do_balan.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/do_balan.c 2004-03-11 03:55:23.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/do_balan.c 2004-03-19 16:10:24.008965731 +0100 @@ -30,34 +30,11 @@ struct tree_balance * cur_tb = NULL; /* is interrupting do_balance */ 
#endif -/* - * AKPM: The __mark_buffer_dirty() call here will not - * put the buffer on the dirty buffer LRU because we've just - * set BH_Dirty. That's a thinko in reiserfs. - * - * I'm reluctant to "fix" this bug because that would change - * behaviour. Using mark_buffer_dirty() here would make the - * buffer eligible for VM and periodic writeback, which may - * violate ordering constraints. I'll just leave the code - * as-is by removing the __mark_buffer_dirty call altogether. - * - * Chris says this code has "probably never been run" anyway. - * It is due to go away. - */ - inline void do_balance_mark_leaf_dirty (struct tree_balance * tb, struct buffer_head * bh, int flag) { - if (reiserfs_dont_log(tb->tb_sb)) { - if (!test_set_buffer_dirty(bh)) { -// __mark_buffer_dirty(bh) ; - tb->need_balance_dirty = 1; - } - } else { - int windex = push_journal_writer("do_balance") ; - journal_mark_dirty(tb->transaction_handle, tb->transaction_handle->t_super, bh) ; - pop_journal_writer(windex) ; - } + journal_mark_dirty(tb->transaction_handle, + tb->transaction_handle->t_super, bh) ; } #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty @@ -1257,7 +1234,7 @@ static void free_thrown(struct tree_bala if (buffer_dirty (tb->thrown[i])) printk ("free_thrown deals with dirty buffer %d\n", blocknr); brelse(tb->thrown[i]) ; /* incremented in store_thrown */ - reiserfs_free_block (tb->transaction_handle, blocknr); + reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0); } } } @@ -1270,10 +1247,6 @@ void reiserfs_invalidate_buffer (struct set_blkh_nr_item( blkh, 0 ); clear_buffer_dirty(bh); - /* reiserfs_free_block is no longer schedule safe - reiserfs_free_block (tb->transaction_handle, tb->tb_sb, bh->b_blocknr); - */ - store_thrown (tb, bh); } diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/file.c linux-2.6.5-rc1-mm2/fs/reiserfs/file.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/file.c 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/file.c 2004-03-19 16:30:24.726645382 +0100 @@ -5,10 +5,15 @@ #include #include +#include +#include #include #include #include #include +#include +#include +#include /* ** We pack the tails of files on file close, not at the time they are written. @@ -29,7 +34,6 @@ static int reiserfs_file_release (struct { struct reiserfs_transaction_handle th ; - int windex ; if (!S_ISREG (inode->i_mode)) BUG (); @@ -59,9 +63,7 @@ static int reiserfs_file_release (struct appended (we append by unformatted node only) or its direct item(s) had to be converted, then it may have to be indirect2direct converted */ - windex = push_journal_writer("file_release") ; reiserfs_truncate_file(inode, 0) ; - pop_journal_writer(windex) ; } up (&inode->i_sem); reiserfs_write_unlock(inode->i_sb); @@ -86,63 +88,19 @@ static int reiserfs_sync_file( ) { struct inode * p_s_inode = p_s_dentry->d_inode; int n_err; - - reiserfs_write_lock(p_s_inode->i_sb); + int barrier_done; if (!S_ISREG(p_s_inode->i_mode)) BUG (); - n_err = sync_mapping_buffers(p_s_inode->i_mapping) ; - reiserfs_commit_for_inode(p_s_inode) ; + reiserfs_write_lock(p_s_inode->i_sb); + barrier_done = reiserfs_commit_for_inode(p_s_inode); reiserfs_write_unlock(p_s_inode->i_sb); + if (barrier_done != 1) + blkdev_issue_flush(p_s_inode->i_sb->s_bdev); return ( n_err < 0 ) ? 
-EIO : 0; } -static int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode ; - int error ; - reiserfs_write_lock(inode->i_sb); - if (attr->ia_valid & ATTR_SIZE) { - /* version 2 items will be caught by the s_maxbytes check - ** done for us in vmtruncate - */ - if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && - attr->ia_size > MAX_NON_LFS) { - error = -EFBIG ; - goto out; - } - /* fill in hole pointers in the expanding truncate case. */ - if (attr->ia_size > inode->i_size) { - error = generic_cont_expand(inode, attr->ia_size) ; - if (REISERFS_I(inode)->i_prealloc_count > 0) { - struct reiserfs_transaction_handle th ; - /* we're changing at most 2 bitmaps, inode + super */ - journal_begin(&th, inode->i_sb, 4) ; - reiserfs_discard_prealloc (&th, inode); - journal_end(&th, inode->i_sb, 4) ; - } - if (error) - goto out; - } - } - - if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) || - ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) && - (get_inode_sd_version (inode) == STAT_DATA_V1)) { - /* stat data of format v3.5 has 16 bit uid and gid */ - error = -EINVAL; - goto out; - } - - error = inode_change_ok(inode, attr) ; - if (!error) - inode_setattr(inode, attr) ; - -out: - reiserfs_write_unlock(inode->i_sb); - return error ; -} - /* I really do not want to play with memory shortage right now, so to simplify the code, we are not going to write more than this much pages at a time. This still should considerably improve performance compared to 4k @@ -153,6 +111,7 @@ out: Maps all unmapped but prepared pages from the list. Updates metadata with newly allocated blocknumbers as needed */ int reiserfs_allocate_blocks_for_region( + struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode we work with */ loff_t pos, /* Writing position */ int num_pages, /* number of pages write going @@ -170,7 +129,6 @@ int reiserfs_allocate_blocks_for_region( struct cpu_key key; // cpu key of item that we are going to deal with struct item_head *ih; // pointer to item head that we are going to deal with struct buffer_head *bh; // Buffer head that contains items that we are going to deal with - struct reiserfs_transaction_handle th; // transaction handle for transaction we are going to create. __u32 * item; // pointer to item we are going to deal with INITIALIZE_PATH(path); // path to item, that we are going to deal with. b_blocknr_t allocated_blocks[blocks_to_allocate]; // Pointer to a place where allocated blocknumbers would be stored. Right now statically allocated, later that will change. @@ -197,7 +155,7 @@ int reiserfs_allocate_blocks_for_region( /* If we came here, it means we absolutely need to open a transaction, since we need to allocate some blocks */ reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that. - journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); // Wish I know if this number enough + journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); // Wish I know if this number enough reiserfs_update_inode_transaction(inode) ; /* Look for the in-tree position of our write, need path for block allocator */ @@ -209,14 +167,20 @@ int reiserfs_allocate_blocks_for_region( /* Allocate blocks */ /* First fill in "hint" structure for block allocator */ - hint.th = &th; // transaction handle. + hint.th = th; // transaction handle. hint.path = &path; // Path, so that block allocator can determine packing locality or whatever it needs to determine. 
hint.inode = inode; // Inode is needed by block allocator too. hint.search_start = 0; // We have no hint on where to search free blocks for block allocator. hint.key = key.on_disk_key; // on disk key of file. hint.block = inode->i_blocks>>(inode->i_sb->s_blocksize_bits-9); // Number of disk blocks this file occupies already. hint.formatted_node = 0; // We are allocating blocks for unformatted node. - hint.preallocate = 0; // We do not do any preallocation for now. + + /* only preallocate if this is a small write */ + if (blocks_to_allocate < + REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize) + hint.preallocate = 1; + else + hint.preallocate = 0; /* Call block allocator to allocate blocks */ res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate); @@ -225,7 +189,7 @@ int reiserfs_allocate_blocks_for_region( /* We flush the transaction in case of no space. This way some blocks might become free */ SB_JOURNAL(inode->i_sb)->j_must_wait = 1; - restart_transaction(&th, inode, &path); + restart_transaction(th, inode, &path); /* We might have scheduled, so search again */ res = search_for_position_by_key(inode->i_sb, &key, &path); @@ -280,7 +244,20 @@ int reiserfs_allocate_blocks_for_region( // position, and how many blocks it is going to cover (we need to // populate pointers to file blocks representing the hole with zeros) - hole_size = (pos + 1 - (le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key))+op_bytes_number(ih, inode->i_sb->s_blocksize))) >> inode->i_sb->s_blocksize_bits; + { + int item_offset = 1; + /* + * if ih is stat data, its offset is 0 and we don't want to + * add 1 to pos in the hole_size calculation + */ + if (is_statdata_le_ih(ih)) + item_offset = 0; + hole_size = (pos + item_offset - + (le_key_k_offset( get_inode_item_key_version(inode), + &(ih->ih_key)) + + op_bytes_number(ih, inode->i_sb->s_blocksize))) >> + inode->i_sb->s_blocksize_bits; + } if ( hole_size > 0 ) { int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE ); // How much data to insert first time. @@ -299,7 +276,7 @@ int reiserfs_allocate_blocks_for_region( /* Ok, there is existing indirect item already. Need to append it */ /* Calculate position past inserted item */ make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3); - res = reiserfs_paste_into_item( &th, &path, &key, (char *)zeros, UNFM_P_SIZE*to_paste); + res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)zeros, UNFM_P_SIZE*to_paste); if ( res ) { kfree(zeros); goto error_exit_free_blocks; @@ -329,7 +306,7 @@ int reiserfs_allocate_blocks_for_region( kfree(zeros); goto error_exit_free_blocks; } - res = reiserfs_insert_item( &th, &path, &key, &ins_ih, (char *)zeros); + res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)zeros); } else { reiserfs_panic(inode->i_sb, "green-9011: Unexpected key type %K\n", &key); } @@ -339,8 +316,8 @@ int reiserfs_allocate_blocks_for_region( } /* Now we want to check if transaction is too full, and if it is we restart it. This will also free the path. 
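(restart_transaction() quietly does nothing when the handle is nested; see the th->t_refcount check added in fs/reiserfs/inode.c below.)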
*/ - if (journal_transaction_should_end(&th, th.t_blocks_allocated)) - restart_transaction(&th, inode, &path); + if (journal_transaction_should_end(th, th->t_blocks_allocated)) + restart_transaction(th, inode, &path); /* Well, need to recalculate path and stuff */ set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + (to_paste << inode->i_blkbits)); @@ -371,7 +348,7 @@ retry: one. */ /* First if we are already modifying current item, log it */ if ( modifying_this_item ) { - journal_mark_dirty (&th, inode->i_sb, bh); + journal_mark_dirty (th, inode->i_sb, bh); modifying_this_item = 0; } /* Then set the key to look for a new indirect item (offset of old @@ -435,7 +412,7 @@ retry: if ( modifying_this_item ) { // We need to log last-accessed block, if it // was modified, but not logged yet. - journal_mark_dirty (&th, inode->i_sb, bh); + journal_mark_dirty (th, inode->i_sb, bh); } if ( curr_block < blocks_to_allocate ) { @@ -446,7 +423,7 @@ retry: // position. We do not need to recalculate path as it should // already point to correct place. make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3); - res = reiserfs_paste_into_item( &th, &path, &key, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block)); + res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block)); if ( res ) { goto error_exit_free_blocks; } @@ -477,29 +454,17 @@ retry: goto error_exit_free_blocks; } /* Insert item into the tree with the data as its body */ - res = reiserfs_insert_item( &th, &path, &key, &ins_ih, (char *)(allocated_blocks+curr_block)); + res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)(allocated_blocks+curr_block)); } else { reiserfs_panic(inode->i_sb, "green-9010: unexpected item type for key %K\n",&key); } } - /* Now the final thing, if we have grew the file, we must update it's size*/ - if ( pos + write_bytes > inode->i_size) { - inode->i_size = pos + write_bytes; // Set new size - /* If the file have grown so much that tail packing is no longer possible, reset - "need to pack" flag */ - if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) || - (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) ) - REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ; - } - - /* Amount of on-disk blocks used by file have changed, update it */ - inode->i_blocks += blocks_to_allocate << (inode->i_blkbits - 9); - reiserfs_update_sd(&th, inode); // And update on-disk metadata - // finish all journal stuff now, We are not going to play with metadata - // anymore. + // the caller is responsible for closing the transaction + // unless we return an error, they are also responsible for logging + // the inode. 
+ // pathrelse(&path); - journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); reiserfs_write_unlock(inode->i_sb); // go through all the pages/buffers and map the buffers to newly allocated @@ -530,6 +495,7 @@ retry: if ( !buffer_mapped(bh) ) { // Ok, unmapped buffer, need to map it map_bh( bh, inode->i_sb, le32_to_cpu(allocated_blocks[curr_block])); curr_block++; + set_buffer_new(bh); } } } @@ -543,10 +509,11 @@ error_exit_free_blocks: pathrelse(&path); // free blocks for( i = 0; i < blocks_to_allocate; i++ ) - reiserfs_free_block( &th, le32_to_cpu(allocated_blocks[i])); + reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]), 1); error_exit: - journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); + reiserfs_update_sd(th, inode); // update any changes we made to blk count + journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); reiserfs_write_unlock(inode->i_sb); return res; @@ -606,12 +573,63 @@ int reiserfs_copy_from_user_to_file_regi return page_fault?-EFAULT:0; } +/* taken fs/buffer.c:__block_commit_write */ +int reiserfs_commit_page(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT; + int new; + + blocksize = 1 << inode->i_blkbits; + + for(bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) + { + + new = buffer_new(bh); + clear_buffer_new(bh); + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_buffer_uptodate(bh); + if (!buffer_dirty(bh)) { + mark_buffer_dirty(bh); + /* do data=ordered on any page past the end + * of file and any buffer marked BH_New. + */ + if (reiserfs_data_ordered(inode->i_sb) && + (new || page->index >= i_size_index)) { + reiserfs_add_ordered_list(inode, bh); + } + } + } + } + + /* + * If this is a partial write which happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' whether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return 0; +} /* Submit pages for write. This was separated from actual file copying because we might want to allocate block numbers in-between. This function assumes that caller will adjust file size to correct value. */ int reiserfs_submit_file_region_for_write( + struct reiserfs_transaction_handle *th, + struct inode *inode, loff_t pos, /* Writing position offset */ int num_pages, /* Number of pages to write */ int write_bytes, /* number of bytes to write */ @@ -622,12 +640,14 @@ int reiserfs_submit_file_region_for_writ int retval = 0; // Return value we are going to return. int i; // loop counter int offset; // Writing offset in page. + int orig_write_bytes = write_bytes; + int sd_update = 0; for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) { int count = min_t(int,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page struct page *page=prepared_pages[i]; // Current page we process. - status = block_commit_write(page, offset, offset+count); + status = reiserfs_commit_page(inode, page, offset, offset+count); if ( status ) retval = status; // To not overcomplicate matters We are going to // submit all the pages even if there was error. 
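The hunks above and below rework the reiserfs write path so that a single transaction handle is threaded through allocation, copy and submission instead of each helper opening its own. Roughly, as a sketch of the new calling convention (simplified from reiserfs_file_write() below; error handling and the user-copy step omitted):

	struct reiserfs_transaction_handle th;
	th.t_trans_id = 0;	/* no transaction running yet */

	/* may journal_begin() and leave the transaction running */
	reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages,
			write_bytes, prepared_pages, blocks_to_allocate);

	/* marks the ordered/dirty buffers, updates i_size, then does
	 * journal_end() and clears th.t_trans_id */
	reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages,
			write_bytes, prepared_pages);

	if (th.t_trans_id) {	/* only true on error */
		journal_end(&th, th.t_super, th.t_blocks_allocated);
	}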
@@ -639,6 +659,41 @@ int reiserfs_submit_file_region_for_writ // to grab_cache_page page_cache_release(page); } + /* now that we've gotten all the ordered buffers marked dirty, + * we can safely update i_size and close any running transaction + */ + if ( pos + orig_write_bytes > inode->i_size) { + inode->i_size = pos + orig_write_bytes; // Set new size + /* If the file have grown so much that tail packing is no + * longer possible, reset "need to pack" flag */ + if ( (have_large_tails (inode->i_sb) && + inode->i_size > i_block_size (inode)*4) || + (have_small_tails (inode->i_sb) && + inode->i_size > i_block_size(inode)) ) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ; + else if ( (have_large_tails (inode->i_sb) && + inode->i_size < i_block_size (inode)*4) || + (have_small_tails (inode->i_sb) && + inode->i_size < i_block_size(inode)) ) + REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ; + + if (th->t_trans_id) { + reiserfs_write_lock(inode->i_sb); + reiserfs_update_sd(th, inode); // And update on-disk metadata + reiserfs_write_unlock(inode->i_sb); + } else + inode->i_sb->s_op->dirty_inode(inode); + + sd_update = 1; + } + if (th->t_trans_id) { + reiserfs_write_lock(inode->i_sb); + if (!sd_update) + reiserfs_update_sd(th, inode); + journal_end(th, th->t_super, th->t_blocks_allocated); + reiserfs_write_unlock(inode->i_sb); + } + th->t_trans_id = 0; return retval; } @@ -1006,19 +1061,18 @@ ssize_t reiserfs_file_write( struct file loff_t pos; // Current position in the file. size_t res; // return value of various functions that we call. struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to. - struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME]; /* To simplify coding at this time, we store locked pages in array for now */ - if ( count <= PAGE_CACHE_SIZE ) - return generic_file_write(file, buf, count, ppos); + struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME]; + struct reiserfs_transaction_handle th; + th.t_trans_id = 0; - if ( file->f_flags & O_DIRECT) { // Direct IO needs some special threating. + if ( file->f_flags & O_DIRECT) { // Direct IO needs treatment int result, after_file_end = 0; if ( (*ppos + count >= inode->i_size) || (file->f_flags & O_APPEND) ) { /* If we are appending a file, we need to put this savelink in here. If we will crash while doing direct io, finish_unfinished will cut the garbage from the file end. */ - struct reiserfs_transaction_handle th; reiserfs_write_lock(inode->i_sb); journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT ); reiserfs_update_inode_transaction(inode); @@ -1043,7 +1097,6 @@ ssize_t reiserfs_file_write( struct file return result; } - if ( unlikely((ssize_t) count < 0 )) return -EINVAL; @@ -1146,12 +1199,8 @@ ssize_t reiserfs_file_write( struct file if ( blocks_to_allocate > 0) {/*We only allocate blocks if we need to*/ /* Fill in all the possible holes and append the file if needed */ - res = reiserfs_allocate_blocks_for_region(inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate); - } else if ( pos + write_bytes > inode->i_size ) { - /* File might have grown even though no new blocks were added */ - inode->i_size = pos + write_bytes; - inode->i_sb->s_op->dirty_inode(inode); - } + res = reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate); + } /* well, we have allocated the blocks, so it is time to free the reservation we made earlier. 
*/ @@ -1173,7 +1222,8 @@ ssize_t reiserfs_file_write( struct file } /* Send the pages to disk and unlock them. */ - res = reiserfs_submit_file_region_for_write(pos, num_pages, write_bytes, prepared_pages); + res = reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages, + write_bytes,prepared_pages); if ( res ) break; @@ -1184,10 +1234,17 @@ ssize_t reiserfs_file_write( struct file balance_dirty_pages_ratelimited(inode->i_mapping); } + /* this is only true on error */ + if (th.t_trans_id) { + reiserfs_write_lock(inode->i_sb); + journal_end(&th, th.t_super, th.t_blocks_allocated); + reiserfs_write_unlock(inode->i_sb); + } if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA); up(&inode->i_sem); + reiserfs_async_progress_wait(inode->i_sb); return (already_written != 0)?already_written:res; out: @@ -1219,6 +1276,11 @@ struct file_operations reiserfs_file_ope struct inode_operations reiserfs_file_inode_operations = { .truncate = reiserfs_vfs_truncate_file, .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, }; diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/fix_node.c linux-2.6.5-rc1-mm2/fs/reiserfs/fix_node.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/fix_node.c 2004-03-11 03:55:24.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/fix_node.c 2004-03-19 16:10:24.008965731 +0100 @@ -795,8 +795,9 @@ static int get_empty_nodes( else /* If we have enough already then there is nothing to do. */ return CARRY_ON; - if ( reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs, - n_amount_needed) == NO_DISK_SPACE ) + /* No need to check quota - is not allocated for blocks used for formatted nodes */ + if (reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs, + n_amount_needed) == NO_DISK_SPACE) return NO_DISK_SPACE; /* for each blocknumber we just got, get a buffer and stick it on FEB */ @@ -2106,9 +2107,9 @@ static void tb_buffer_sanity_check (stru {;} #endif -static void clear_all_dirty_bits(struct super_block *s, +static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh) { - reiserfs_prepare_for_journal(s, bh, 0) ; + return reiserfs_prepare_for_journal(s, bh, 0) ; } static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb) @@ -2137,11 +2138,11 @@ static int wait_tb_buffers_until_unlocke p_s_tb->tb_path->path_length - i); } #endif - clear_all_dirty_bits(p_s_tb->tb_sb, - PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) ; - - if ( buffer_locked (PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, + PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i))) + { locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i); + } } } @@ -2151,22 +2152,19 @@ static int wait_tb_buffers_until_unlocke if ( p_s_tb->L[i] ) { tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i); - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]) ; - if ( buffer_locked (p_s_tb->L[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i])) locked = p_s_tb->L[i]; } if ( !locked && p_s_tb->FL[i] ) { tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i); - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]) ; - if ( buffer_locked (p_s_tb->FL[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i])) locked = p_s_tb->FL[i]; } if ( !locked && p_s_tb->CFL[i] ) { tb_buffer_sanity_check (p_s_tb->tb_sb, 
p_s_tb->CFL[i], "CFL", i); - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]) ; - if ( buffer_locked (p_s_tb->CFL[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i])) locked = p_s_tb->CFL[i]; } @@ -2176,23 +2174,20 @@ static int wait_tb_buffers_until_unlocke if ( p_s_tb->R[i] ) { tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i); - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]) ; - if ( buffer_locked (p_s_tb->R[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i])) locked = p_s_tb->R[i]; } if ( !locked && p_s_tb->FR[i] ) { tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i); - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]) ; - if ( buffer_locked (p_s_tb->FR[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i])) locked = p_s_tb->FR[i]; } if ( !locked && p_s_tb->CFR[i] ) { tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i); - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]) ; - if ( buffer_locked (p_s_tb->CFR[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i])) locked = p_s_tb->CFR[i]; } } @@ -2207,10 +2202,8 @@ static int wait_tb_buffers_until_unlocke */ for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) { if ( p_s_tb->FEB[i] ) { - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]) ; - if (buffer_locked(p_s_tb->FEB[i])) { + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i])) locked = p_s_tb->FEB[i] ; - } } } @@ -2280,7 +2273,6 @@ int fix_nodes (int n_op_mode, ** during wait_tb_buffers_run */ int wait_tb_buffers_run = 0 ; - int windex ; struct buffer_head * p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path); ++ REISERFS_SB(p_s_tb -> tb_sb) -> s_fix_nodes; @@ -2407,10 +2399,7 @@ int fix_nodes (int n_op_mode, p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1); } - - windex = push_journal_writer("fix_nodes") ; if ((n_ret_value = wait_tb_buffers_until_unlocked (p_s_tb)) == CARRY_ON) { - pop_journal_writer(windex) ; if (FILESYSTEM_CHANGED_TB(p_s_tb)) { wait_tb_buffers_run = 1 ; n_ret_value = REPEAT_SEARCH ; @@ -2420,7 +2409,6 @@ int fix_nodes (int n_op_mode, } } else { wait_tb_buffers_run = 1 ; - pop_journal_writer(windex) ; goto repeat; } @@ -2505,7 +2493,7 @@ void unfix_nodes (struct tree_balance * /* de-allocated block which was not used by balancing and bforget about buffer for it */ brelse (tb->FEB[i]); - reiserfs_free_block (tb->transaction_handle, blocknr); + reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0); } if (tb->used[i]) { /* release used as new nodes including a new root */ diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/ibalance.c linux-2.6.5-rc1-mm2/fs/reiserfs/ibalance.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/ibalance.c 2004-03-11 03:55:28.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/ibalance.c 2004-03-19 16:10:24.009965624 +0100 @@ -633,7 +633,6 @@ static void balance_internal_when_delete /* use check_internal if new root is an internal node */ check_internal (new_root); /*&&&&&&&&&&&&&&&&&&&&&&*/ - tb->tb_sb->s_dirt = 1; /* do what is needed for buffer thrown from tree */ reiserfs_invalidate_buffer(tb, tbSh); @@ -951,7 +950,6 @@ int balance_internal (struct tree_balanc PUT_SB_ROOT_BLOCK( tb->tb_sb, tbSh->b_blocknr ); PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1 ); do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); - tb->tb_sb->s_dirt = 1; } if ( tb->blknum[h] == 2 ) { diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/inode.c 
linux-2.6.5-rc1-mm2/fs/reiserfs/inode.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/inode.c 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/inode.c 2004-03-19 16:10:24.009965624 +0100 @@ -4,7 +4,10 @@ #include #include +#include #include +#include +#include #include #include #include @@ -13,6 +16,7 @@ #include #include #include +#include extern int reiserfs_default_io_size; /* default io size devuned in super.c */ @@ -22,29 +26,31 @@ extern int reiserfs_default_io_size; /* #define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */ #define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */ #define GET_BLOCK_NO_ISEM 8 /* i_sem is not held, don't preallocate */ +#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */ static int reiserfs_get_block (struct inode * inode, sector_t block, struct buffer_head * bh_result, int create); +static int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to); void reiserfs_delete_inode (struct inode * inode) { int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2; - int windex ; struct reiserfs_transaction_handle th ; - reiserfs_write_lock(inode->i_sb); + DQUOT_FREE_INODE(inode); /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ down (&inode->i_sem); + reiserfs_delete_xattrs (inode); + journal_begin(&th, inode->i_sb, jbegin_count) ; reiserfs_update_inode_transaction(inode) ; - windex = push_journal_writer("delete_inode") ; reiserfs_delete_object (&th, inode); - pop_journal_writer(windex) ; journal_end(&th, inode->i_sb, jbegin_count) ; @@ -107,12 +113,6 @@ inline void make_le_item_head (struct it put_ih_entry_count( ih, entry_count ); } -static void add_to_flushlist(struct inode *inode, struct buffer_head *bh) { - struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ; - - buffer_insert_list(&j->j_dirty_buffers_lock, bh, &j->j_dirty_buffers) ; -} - // // FIXME: we might cache recently accessed indirect item @@ -206,6 +206,10 @@ static int file_capable (struct inode * struct super_block *s = th->t_super ; int len = th->t_blocks_allocated ; + /* we cannot restart while nested */ + if (th->t_refcount > 1) { + return ; + } pathrelse(path) ; reiserfs_update_sd(th, inode) ; journal_end(th, s, len) ; @@ -437,7 +441,8 @@ static int reiserfs_get_blocks_direct_io reiserfs_get_block() */ bh_result->b_size = (1 << inode->i_blkbits); - ret = reiserfs_get_block(inode, iblock, bh_result, create) ; + ret = reiserfs_get_block(inode, iblock, bh_result, + create | GET_BLOCK_NO_DANGLE) ; /* don't allow direct io onto tail pages */ if (ret == 0 && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { @@ -510,15 +515,14 @@ static int convert_tail_for_hole(struct ** won't trigger a get_block in this case. 
*/ fix_tail_page_for_writing(tail_page) ; - retval = block_prepare_write(tail_page, tail_start, tail_end, - reiserfs_get_block) ; + retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end); if (retval) goto unlock ; /* tail conversion might change the data in the page */ flush_dcache_page(tail_page) ; - retval = generic_commit_write(NULL, tail_page, tail_start, tail_end) ; + retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end) ; unlock: if (tail_page != hole_page) { @@ -557,8 +561,7 @@ int reiserfs_get_block (struct inode * i __u32 * item; int done; int fs_gen; - int windex ; - struct reiserfs_transaction_handle th ; + struct reiserfs_transaction_handle *th = NULL; /* space reserved in transaction batch: . 3 balancings in direct->indirect conversion . 1 block involved into reiserfs_update_sd() @@ -566,12 +569,11 @@ int reiserfs_get_block (struct inode * i can incur (much) more that 3 balancings. */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1; int version; - int transaction_started = 0 ; + int dangle = 1; loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ; /* bad.... */ reiserfs_write_lock(inode->i_sb); - th.t_trans_id = 0 ; version = get_inode_item_key_version (inode); if (block < 0) { @@ -595,6 +597,13 @@ int reiserfs_get_block (struct inode * i reiserfs_write_unlock(inode->i_sb); return ret; } + /* + * if we're already in a transaction, make sure to close + * any new transactions we start in this func + */ + if ((create & GET_BLOCK_NO_DANGLE) || + reiserfs_transaction_running(inode->i_sb)) + dangle = 0; /* If file is of such a size, that it might have a tail and tails are enabled ** we should mark it as possibly needing tail packing on close @@ -603,15 +612,17 @@ int reiserfs_get_block (struct inode * i (have_small_tails (inode->i_sb) && inode->i_size < i_block_size(inode)) ) REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ; - windex = push_journal_writer("reiserfs_get_block") ; - /* set the key of the first byte in the 'block'-th block of file */ make_cpu_key (&key, inode, new_offset, TYPE_ANY, 3/*key length*/); if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { - journal_begin(&th, inode->i_sb, jbegin_count) ; +start_trans: + th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); + if (!th) { + retval = -ENOMEM; + goto failure; + } reiserfs_update_inode_transaction(inode) ; - transaction_started = 1 ; } research: @@ -631,28 +642,29 @@ int reiserfs_get_block (struct inode * i if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) { /* we have to allocate block for the unformatted node */ - if (!transaction_started) { + if (!th) { pathrelse(&path) ; - journal_begin(&th, inode->i_sb, jbegin_count) ; - reiserfs_update_inode_transaction(inode) ; - transaction_started = 1 ; - goto research ; + goto start_trans; } - repeat = _allocate_block(&th, block, inode, &allocated_block_nr, &path, create); + repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create); - if (repeat == NO_DISK_SPACE) { + if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { /* restart the transaction to give the journal a chance to free ** some blocks. 
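The handle logic in reiserfs_get_block() above is easier to follow in isolation: a nested "begin" just bumps the refcount on the handle that is already running, and only the outermost "end" really commits. A toy userspace model of that refcounting (invented names, not the reiserfs API; reiserfs_persistent_transaction() does journal work this sketch leaves out):

#include <assert.h>
#include <stdio.h>

struct toy_handle {
    int refcount;
    int open;
};

static struct toy_handle running;   /* stands in for current->journal_info */

static struct toy_handle *toy_begin(void)
{
    if (running.open) {
        /* nested call: join the transaction already running */
        running.refcount++;
        return &running;
    }
    running.open = 1;
    running.refcount = 1;
    return &running;
}

static void toy_end(struct toy_handle *th)
{
    assert(th->refcount > 0);
    if (--th->refcount == 0) {
        /* only the outermost end really commits */
        th->open = 0;
        printf("commit\n");
    }
}

int main(void)
{
    struct toy_handle *outer = toy_begin();
    struct toy_handle *inner = toy_begin();  /* nests, no second trans */
    toy_end(inner);                          /* no commit yet */
    toy_end(outer);                          /* prints "commit" */
    return 0;
}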
releases the path, so we have to go back to ** research if we succeed on the second try */ - restart_transaction(&th, inode, &path) ; - repeat = _allocate_block(&th, block, inode, &allocated_block_nr, NULL, create); + SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; + restart_transaction(th, inode, &path) ; + repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create); - if (repeat != NO_DISK_SPACE) { + if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) { goto research ; } - retval = -ENOSPC; + if (repeat == QUOTA_EXCEEDED) + retval = -EDQUOT; + else + retval = -ENOSPC; goto failure; } @@ -675,17 +687,17 @@ int reiserfs_get_block (struct inode * i goto research; } set_buffer_new(bh_result); + if (buffer_dirty(bh_result) && reiserfs_data_ordered(inode->i_sb)) + reiserfs_add_ordered_list(inode, bh_result); put_block_num(item, pos_in_item, allocated_block_nr) ; unfm_ptr = allocated_block_nr; - journal_mark_dirty (&th, inode->i_sb, bh); - inode->i_blocks += (inode->i_sb->s_blocksize / 512) ; - reiserfs_update_sd(&th, inode) ; + journal_mark_dirty (th, inode->i_sb, bh); + reiserfs_update_sd(th, inode) ; } set_block_dev_mapped(bh_result, unfm_ptr, inode); pathrelse (&path); - pop_journal_writer(windex) ; - if (transaction_started) - journal_end(&th, inode->i_sb, jbegin_count) ; + if (!dangle && th) + reiserfs_end_persistent_transaction(th); reiserfs_write_unlock(inode->i_sb); @@ -696,16 +708,9 @@ int reiserfs_get_block (struct inode * i return 0; } - if (!transaction_started) { - /* if we don't pathrelse, we could vs-3050 on the buffer if - ** someone is waiting for it (they can't finish until the buffer - ** is released, we can start a new transaction until they finish) - */ + if (!th) { pathrelse(&path) ; - journal_begin(&th, inode->i_sb, jbegin_count) ; - reiserfs_update_inode_transaction(inode) ; - transaction_started = 1 ; - goto research; + goto start_trans; } /* desired position is not found or is in the direct item. We have @@ -733,13 +738,11 @@ int reiserfs_get_block (struct inode * i set_cpu_key_k_offset (&tmp_key, 1); PATH_LAST_POSITION(&path) ++; - retval = reiserfs_insert_item (&th, &path, &tmp_key, &tmp_ih, (char *)&unp); + retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, inode, (char *)&unp); if (retval) { - reiserfs_free_block (&th, allocated_block_nr); - goto failure; // retval == -ENOSPC or -EIO or -EEXIST + reiserfs_free_block (th, inode, allocated_block_nr, 1); + goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST } - if (unp) - inode->i_blocks += inode->i_sb->s_blocksize / 512; //mark_tail_converted (inode); } else if (is_direct_le_ih (ih)) { /* direct item has to be converted */ @@ -759,8 +762,14 @@ int reiserfs_get_block (struct inode * i node. 
FIXME: this should also get into page cache */ pathrelse(&path) ; - journal_end(&th, inode->i_sb, jbegin_count) ; - transaction_started = 0 ; + /* + * ugly, but we can only end the transaction if + * we aren't nested + */ + if (th->t_refcount == 1) { + reiserfs_end_persistent_transaction(th); + th = NULL; + } retval = convert_tail_for_hole(inode, bh_result, tail_offset) ; if (retval) { @@ -768,18 +777,19 @@ int reiserfs_get_block (struct inode * i printk("clm-6004: convert tail failed inode %lu, error %d\n", inode->i_ino, retval) ; if (allocated_block_nr) { /* the bitmap, the super, and the stat data == 3 */ - journal_begin(&th, inode->i_sb, 3) ; - reiserfs_free_block (&th, allocated_block_nr); - transaction_started = 1 ; + if (!th) + th = reiserfs_persistent_transaction(inode->i_sb,3); + if (th) + reiserfs_free_block (th,inode,allocated_block_nr,1); } goto failure ; } goto research ; } - retval = direct2indirect (&th, inode, &path, unbh, tail_offset); + retval = direct2indirect (th, inode, &path, unbh, tail_offset); if (retval) { reiserfs_unmap_buffer(unbh); - reiserfs_free_block (&th, allocated_block_nr); + reiserfs_free_block (th, inode, allocated_block_nr, 1); goto failure; } /* it is important the set_buffer_uptodate is done after @@ -799,7 +809,7 @@ int reiserfs_get_block (struct inode * i /* we've converted the tail, so we must ** flush unbh before the transaction commits */ - add_to_flushlist(inode, unbh) ; + reiserfs_add_tail_list(inode, unbh) ; /* mark it dirty now to prevent commit_write from adding ** this buffer to the inode's dirty buffer list @@ -812,9 +822,6 @@ int reiserfs_get_block (struct inode * i */ mark_buffer_dirty(unbh) ; } - - //inode->i_blocks += inode->i_sb->s_blocksize / 512; - //mark_tail_converted (inode); } else { /* append indirect item with holes if needed, when appending pointer to 'block'-th block use block, which is already @@ -862,24 +869,21 @@ int reiserfs_get_block (struct inode * i only have space for one block */ blocks_needed=max_to_insert?max_to_insert:1; } - retval = reiserfs_paste_into_item (&th, &path, &tmp_key, (char *)un, UNFM_P_SIZE * blocks_needed); + retval = reiserfs_paste_into_item (th, &path, &tmp_key, inode, (char *)un, UNFM_P_SIZE * blocks_needed); if (blocks_needed != 1) kfree(un); if (retval) { - reiserfs_free_block (&th, allocated_block_nr); + reiserfs_free_block (th, inode, allocated_block_nr, 1); goto failure; } - if (done) { - inode->i_blocks += inode->i_sb->s_blocksize / 512; - } else { + if (!done) { /* We need to mark new file size in case this function will be interrupted/aborted later on. And we may do this only for holes. */ inode->i_size += inode->i_sb->s_blocksize * blocks_needed; } - //mark_tail_converted (inode); } if (done == 1) @@ -893,8 +897,8 @@ int reiserfs_get_block (struct inode * i ** release the path so that anybody waiting on the path before ** ending their transaction will be able to continue. */ - if (journal_transaction_should_end(&th, th.t_blocks_allocated)) { - restart_transaction(&th, inode, &path) ; + if (journal_transaction_should_end(th, th->t_blocks_allocated)) { + restart_transaction(th, inode, &path) ; } /* inserting indirect pointers for a hole can take a ** long time. 
reschedule if needed @@ -911,7 +915,7 @@ int reiserfs_get_block (struct inode * i "%K should not be found\n", &key); retval = -EEXIST; if (allocated_block_nr) - reiserfs_free_block (&th, allocated_block_nr); + reiserfs_free_block (th, inode, allocated_block_nr, 1); pathrelse(&path) ; goto failure; } @@ -925,11 +929,10 @@ int reiserfs_get_block (struct inode * i retval = 0; failure: - if (transaction_started) { - reiserfs_update_sd(&th, inode) ; - journal_end(&th, inode->i_sb, jbegin_count) ; + if (th && !dangle) { + reiserfs_update_sd(th, inode) ; + reiserfs_end_persistent_transaction(th); } - pop_journal_writer(windex) ; reiserfs_write_unlock(inode->i_sb); reiserfs_check_path(&path) ; return retval; @@ -942,6 +945,58 @@ reiserfs_readpages(struct file *file, st return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); } +/* Compute real number of used bytes by file + * Following three functions can go away when we'll have enough space in stat item + */ +static int real_space_diff(struct inode *inode, int sd_size) +{ + int bytes; + loff_t blocksize = inode->i_sb->s_blocksize ; + + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) + return sd_size ; + + /* End of file is also in full block with indirect reference, so round + ** up to the next block. + ** + ** there is just no way to know if the tail is actually packed + ** on the file, so we have to assume it isn't. When we pack the + ** tail, we add 4 bytes to pretend there really is an unformatted + ** node pointer + */ + bytes = ((inode->i_size + (blocksize-1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + sd_size; + return bytes ; +} + +static inline loff_t to_real_used_space(struct inode *inode, ulong blocks, + int sd_size) +{ + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { + return inode->i_size + (loff_t)(real_space_diff(inode, sd_size)) ; + } + return ((loff_t)real_space_diff(inode, sd_size)) + (((loff_t)blocks) << 9); +} + +/* Compute number of blocks used by file in ReiserFS counting */ +static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) +{ + loff_t bytes = inode_get_bytes(inode) ; + loff_t real_space = real_space_diff(inode, sd_size) ; + + /* keeps fsck and non-quota versions of reiserfs happy */ + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { + bytes += (loff_t)511 ; + } + + /* files from before the quota patch might i_blocks such that + ** bytes < real_space. Deal with that here to prevent it from + ** going negative. + */ + if (bytes < real_space) + return 0 ; + return (bytes - real_space) >> 9; +} + // // BAD: new directories have stat data of new type and all other items // of old type. Version stored in the inode says about body items, so @@ -969,7 +1024,10 @@ static void init_inode (struct inode * i REISERFS_I(inode)->i_prealloc_block = 0; REISERFS_I(inode)->i_prealloc_count = 0; REISERFS_I(inode)->i_trans_id = 0; - REISERFS_I(inode)->i_trans_index = 0; + REISERFS_I(inode)->i_jl = NULL; + REISERFS_I(inode)->i_acl_access = NULL; + REISERFS_I(inode)->i_acl_default = NULL; + init_rwsem (&REISERFS_I(inode)->xattr_sem); if (stat_data_v1 (ih)) { struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih); @@ -1004,6 +1062,14 @@ static void init_inode (struct inode * i rdev = sd_v1_rdev(sd); REISERFS_I(inode)->i_first_direct_byte = sd_v1_first_direct_byte(sd); + /* an early bug in the quota code can give us an odd number for the + ** block count. This is incorrect, fix it here. 
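The arithmetic in the three helpers above (real_space_diff, to_real_used_space, to_fake_used_blocks) is easier to check with concrete numbers. A standalone rerun of the same formulas for a 3-block regular file, 4k blocksize, v2 stat data (constants inlined for illustration only; the real values come from the reiserfs headers):

#include <stdio.h>

#define BLOCKSIZE   4096LL
#define UNFM_P_SIZE    4LL   /* bytes per unformatted node pointer */
#define SD_V2_SIZE    44LL   /* assumed v2 stat data size */

int main(void)
{
    long long i_size  = 10000;   /* needs 3 data blocks */
    long long nblocks = (i_size + BLOCKSIZE - 1) / BLOCKSIZE;

    /* overhead charged per file: one pointer per block + the stat data */
    long long diff = nblocks * UNFM_P_SIZE + SD_V2_SIZE;

    /* "real" used bytes = data blocks plus that overhead */
    long long bytes = nblocks * BLOCKSIZE + diff;

    /* the fake count stored in the stat data inverts it, in 512b units */
    printf("sd blocks: %lld\n", (bytes - diff) / 512);   /* prints 24 */
    return 0;
}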
+ */ + if (inode->i_blocks & 1) { + inode->i_blocks++ ; + } + inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks, + SD_V1_SIZE)); /* nopack is initially zero for v1 objects. For v2 objects, nopack is initialised from sd_attrs */ REISERFS_I(inode)->i_flags &= ~i_nopack_mask; @@ -1036,6 +1102,8 @@ static void init_inode (struct inode * i set_inode_item_key_version (inode, KEY_FORMAT_3_6); REISERFS_I(inode)->i_first_direct_byte = 0; set_inode_sd_version (inode, STAT_DATA_V2); + inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks, + SD_V2_SIZE)); /* read persistent inode attributes from sd and initalise generic inode flags from them */ REISERFS_I(inode)->i_attrs = sd_v2_attrs( sd ); @@ -1051,7 +1119,7 @@ static void init_inode (struct inode * i inode->i_op = &reiserfs_dir_inode_operations; inode->i_fop = &reiserfs_dir_operations; } else if (S_ISLNK (inode->i_mode)) { - inode->i_op = &page_symlink_inode_operations; + inode->i_op = &reiserfs_symlink_inode_operations; inode->i_mapping->a_ops = &reiserfs_address_space_operations; } else { inode->i_blocks = 0; @@ -1061,7 +1129,7 @@ static void init_inode (struct inode * i // update new stat data with inode fields -static void inode2sd (void * sd, struct inode * inode) +static void inode2sd (void * sd, struct inode * inode, loff_t size) { struct stat_data * sd_v2 = (struct stat_data *)sd; __u16 flags; @@ -1069,12 +1137,12 @@ static void inode2sd (void * sd, struct set_sd_v2_mode(sd_v2, inode->i_mode ); set_sd_v2_nlink(sd_v2, inode->i_nlink ); set_sd_v2_uid(sd_v2, inode->i_uid ); - set_sd_v2_size(sd_v2, inode->i_size ); + set_sd_v2_size(sd_v2, size ); set_sd_v2_gid(sd_v2, inode->i_gid ); set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec ); set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec ); set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec ); - set_sd_v2_blocks(sd_v2, inode->i_blocks ); + set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); else @@ -1086,7 +1154,7 @@ static void inode2sd (void * sd, struct // used to copy inode's fields to old stat data -static void inode2sd_v1 (void * sd, struct inode * inode) +static void inode2sd_v1 (void * sd, struct inode * inode, loff_t size) { struct stat_data_v1 * sd_v1 = (struct stat_data_v1 *)sd; @@ -1094,7 +1162,7 @@ static void inode2sd_v1 (void * sd, stru set_sd_v1_uid(sd_v1, inode->i_uid ); set_sd_v1_gid(sd_v1, inode->i_gid ); set_sd_v1_nlink(sd_v1, inode->i_nlink ); - set_sd_v1_size(sd_v1, inode->i_size ); + set_sd_v1_size(sd_v1, size ); set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec ); set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec ); set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec ); @@ -1102,7 +1170,7 @@ static void inode2sd_v1 (void * sd, stru if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev)); else - set_sd_v1_blocks(sd_v1, inode->i_blocks ); + set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); // Sigh. 
i_first_direct_byte is back set_sd_v1_first_direct_byte(sd_v1, REISERFS_I(inode)->i_first_direct_byte); @@ -1112,7 +1180,8 @@ static void inode2sd_v1 (void * sd, stru /* NOTE, you must prepare the buffer head before sending it here, ** and then log it after the call */ -static void update_stat_data (struct path * path, struct inode * inode) +static void update_stat_data (struct path * path, struct inode * inode, + loff_t size) { struct buffer_head * bh; struct item_head * ih; @@ -1126,17 +1195,17 @@ static void update_stat_data (struct pat if (stat_data_v1 (ih)) { // path points to old stat data - inode2sd_v1 (B_I_PITEM (bh, ih), inode); + inode2sd_v1 (B_I_PITEM (bh, ih), inode, size); } else { - inode2sd (B_I_PITEM (bh, ih), inode); + inode2sd (B_I_PITEM (bh, ih), inode, size); } return; } -void reiserfs_update_sd (struct reiserfs_transaction_handle *th, - struct inode * inode) +void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th, + struct inode * inode, loff_t size) { struct cpu_key key; INITIALIZE_PATH(path); @@ -1186,7 +1255,7 @@ void reiserfs_update_sd (struct reiserfs } break; } - update_stat_data (&path, inode); + update_stat_data (&path, inode, size); journal_mark_dirty(th, th->t_super, bh) ; pathrelse (&path); return; @@ -1469,6 +1538,7 @@ int reiserfs_sync_inode (struct reiserfs /* stat data of new object is inserted already, this inserts the item containing "." and ".." entries */ static int reiserfs_new_directory (struct reiserfs_transaction_handle *th, + struct inode *inode, struct item_head * ih, struct path * path, struct inode * dir) { @@ -1513,13 +1583,14 @@ static int reiserfs_new_directory (struc } /* insert item, that is empty directory item */ - return reiserfs_insert_item (th, path, &key, ih, body); + return reiserfs_insert_item (th, path, &key, ih, inode, body); } /* stat data of object has been inserted, this inserts the item containing the body of symlink */ static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th, + struct inode *inode, /* Inode of symlink */ struct item_head * ih, struct path * path, const char * symname, int item_len) { @@ -1549,7 +1620,7 @@ static int reiserfs_new_symlink (struct } /* insert item, that is body of symlink */ - return reiserfs_insert_item (th, path, &key, ih, symname); + return reiserfs_insert_item (th, path, &key, ih, inode, symname); } @@ -1617,7 +1688,8 @@ int reiserfs_new_inode (struct reiserfs_ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_size = i_size; - inode->i_blocks = (inode->i_size + 511) >> 9; + inode->i_blocks = 0; + inode->i_bytes = 0; REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 
1 : U32_MAX/*NO_BYTES_IN_DIRECT_ITEM*/; @@ -1626,10 +1698,13 @@ int reiserfs_new_inode (struct reiserfs_ REISERFS_I(inode)->i_prealloc_block = 0; REISERFS_I(inode)->i_prealloc_count = 0; REISERFS_I(inode)->i_trans_id = 0; - REISERFS_I(inode)->i_trans_index = 0; + REISERFS_I(inode)->i_jl = 0; REISERFS_I(inode)->i_attrs = REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; sd_attrs_to_i_attrs( REISERFS_I(inode) -> i_attrs, inode ); + REISERFS_I(inode)->i_acl_access = NULL; + REISERFS_I(inode)->i_acl_default = NULL; + init_rwsem (&REISERFS_I(inode)->xattr_sem); if (old_format_only (sb)) make_le_item_head (&ih, 0, KEY_FORMAT_3_5, SD_OFFSET, TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); @@ -1659,9 +1734,9 @@ int reiserfs_new_inode (struct reiserfs_ err = -EINVAL; goto out_bad_inode; } - inode2sd_v1 (&sd, inode); + inode2sd_v1 (&sd, inode, inode->i_size); } else { - inode2sd (&sd, inode); + inode2sd (&sd, inode, inode->i_size); } // these do not go to on-disk stat data inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid); @@ -1685,7 +1760,7 @@ int reiserfs_new_inode (struct reiserfs_ if (REISERFS_I(dir)->new_packing_locality) th->displace_new_blocks = 1; #endif - retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, (char *)(&sd)); + retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, inode, (char *)(&sd)); if (retval) { err = retval; reiserfs_check_path(&path_to_key) ; @@ -1698,14 +1773,14 @@ int reiserfs_new_inode (struct reiserfs_ #endif if (S_ISDIR(mode)) { /* insert item with "." and ".." */ - retval = reiserfs_new_directory (th, &ih, &path_to_key, dir); + retval = reiserfs_new_directory (th, inode, &ih, &path_to_key, dir); } if (S_ISLNK(mode)) { /* insert body of symlink */ if (!old_format_only (sb)) i_size = ROUND_UP(i_size); - retval = reiserfs_new_symlink (th, &ih, &path_to_key, symname, i_size); + retval = reiserfs_new_symlink (th, inode, &ih, &path_to_key, symname, i_size); } if (retval) { err = retval; @@ -1714,6 +1789,19 @@ int reiserfs_new_inode (struct reiserfs_ goto out_inserted_sd; } + /* XXX CHECK THIS */ + if (reiserfs_posixacl (inode->i_sb)) { + retval = reiserfs_inherit_default_acl (dir, dentry, inode); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key) ; + journal_end(th, th->t_super, th->t_blocks_allocated); + goto out_inserted_sd; + } + } else if (inode->i_sb->s_flags & MS_POSIXACL) { + reiserfs_warning ("ACLs aren't enabled in the fs, but vfs thinks they are!\n"); + } + insert_inode_hash (inode); reiserfs_update_sd(th, inode); reiserfs_check_path(&path_to_key) ; @@ -1730,6 +1818,9 @@ out_bad_inode: /* dquot_drop must be done outside a transaction */ journal_end(th, th->t_super, th->t_blocks_allocated) ; + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + inode->i_flags |= S_NOQUOTA; make_bad_inode(inode); out_inserted_sd: @@ -1832,7 +1923,6 @@ unlock: */ void reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) { struct reiserfs_transaction_handle th ; - int windex ; /* we want the offset for the first byte after the end of the file */ unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1) ; unsigned blocksize = p_s_inode->i_sb->s_blocksize ; @@ -1867,14 +1957,12 @@ void reiserfs_truncate_file(struct inode cut_from_item. 
1 is for update_sd */ journal_begin(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1 ) ; reiserfs_update_inode_transaction(p_s_inode) ; - windex = push_journal_writer("reiserfs_vfs_truncate_file") ; if (update_timestamps) /* we are doing real truncate: if the system crashes before the last transaction of truncating gets committed - on reboot the file either appears truncated properly or not truncated at all */ add_save_link (&th, p_s_inode, 1); reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ; - pop_journal_writer(windex) ; journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1 ) ; if (update_timestamps) @@ -2015,7 +2103,8 @@ out: /* this is where we fill in holes in the file. */ if (use_get_block) { retval = reiserfs_get_block(inode, block, bh_result, - GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM) ; + GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM | + GET_BLOCK_NO_DANGLE); if (!retval) { if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) { /* get_block failed to find a mapped unformatted node. */ @@ -2037,32 +2126,6 @@ out: return retval ; } -/* - * does the right thing for deciding when to lock a buffer and - * mark it for io during a writepage. make sure the buffer is - * dirty before sending it here though. - */ -static void lock_buffer_for_writepage(struct page *page, - struct writeback_control *wbc, - struct buffer_head *bh) -{ - if (wbc->sync_mode != WB_SYNC_NONE) { - lock_buffer(bh); - } else { - if (test_set_buffer_locked(bh)) { - __set_page_dirty_nobuffers(page); - return; - } - } - if (test_clear_buffer_dirty(bh)) { - if (!buffer_uptodate(bh)) - buffer_error(); - mark_buffer_async_write(bh); - } else { - unlock_buffer(bh); - } -} - /* * mason@suse.com: updated in 2.5.54 to follow the same general io * start/recovery path as __block_write_full_page, along with special @@ -2110,29 +2173,52 @@ static int reiserfs_write_full_page(stru } bh = head ; block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits) ; + /* first map all the buffers, logging any direct items we find */ do { - get_bh(bh); - if (buffer_dirty(bh)) { - if (buffer_mapped(bh) && bh->b_blocknr != 0) { - /* buffer mapped to an unformatted node */ - lock_buffer_for_writepage(page, wbc, bh); - } else { - /* not mapped yet, or it points to a direct item, search - * the btree for the mapping info, and log any direct - * items found - */ - if ((error = map_block_for_writepage(inode, bh, block))) { - goto fail ; - } - if (buffer_mapped(bh) && bh->b_blocknr != 0) { - lock_buffer_for_writepage(page, wbc, bh); - } + if (buffer_dirty(bh) && (!buffer_mapped(bh) || + (buffer_mapped(bh) && bh->b_blocknr == 0))) { + /* not mapped yet, or it points to a direct item, search + * the btree for the mapping info, and log any direct + * items found + */ + if ((error = map_block_for_writepage(inode, bh, block))) { + goto fail ; } } bh = bh->b_this_page; block++; } while(bh != head) ; + /* now go through and lock any dirty buffers on the page */ + do { + get_bh(bh); + if (!buffer_mapped(bh)) + continue; + if (buffer_mapped(bh) && bh->b_blocknr == 0) + continue; + + /* from this point on, we know the buffer is mapped to a + * real block and not a direct item + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + lock_buffer(bh); + } else { + if (test_set_buffer_locked(bh)) { + __set_page_dirty_nobuffers(page); + continue; + } + } + if (test_clear_buffer_dirty(bh)) { + if (!buffer_uptodate(bh)) + buffer_error(); + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + + bh = bh->b_this_page; + } 
while((bh = bh->b_this_page) != head); + BUG_ON(PageWriteback(page)); set_page_writeback(page); unlock_page(page); @@ -2227,13 +2313,43 @@ static int reiserfs_writepage (struct pa return reiserfs_write_full_page(page, wbc) ; } - int reiserfs_prepare_write(struct file *f, struct page *page, unsigned from, unsigned to) { struct inode *inode = page->mapping->host ; + int ret; + int old_ref = 0; + reiserfs_wait_on_write_block(inode->i_sb) ; fix_tail_page_for_writing(page) ; - return block_prepare_write(page, from, to, reiserfs_get_block) ; + if (reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th; + th = (struct reiserfs_transaction_handle *)current->journal_info; + old_ref = th->t_refcount; + th->t_refcount++; + } + + ret = block_prepare_write(page, from, to, reiserfs_get_block) ; + if (ret && reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th = current->journal_info; + /* this gets a little ugly. If reiserfs_get_block returned an + * error and left a transacstion running, we've got to close it, + * and we've got to free handle if it was a persistent transaction. + * + * But, if we had nested into an existing transaction, we need + * to just drop the ref count on the handle. + * + * If old_ref == 0, the transaction is from reiserfs_get_block, + * and it was a persistent trans. Otherwise, it was nested above. + */ + if (th->t_refcount > old_ref) { + if (old_ref) + th->t_refcount--; + else + reiserfs_end_persistent_transaction(th); + } + } + return ret; + } @@ -2245,16 +2361,22 @@ static int reiserfs_commit_write(struct unsigned from, unsigned to) { struct inode *inode = page->mapping->host ; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - int ret ; + int ret = 0; + int update_sd = 0; + struct reiserfs_transaction_handle *th = NULL; reiserfs_wait_on_write_block(inode->i_sb) ; + if (reiserfs_transaction_running(inode->i_sb)) { + th = current->journal_info; + } + reiserfs_commit_page(inode, page, from, to); /* generic_commit_write does this for us, but does not update the ** transaction tracking stuff when the size changes. So, we have ** to do the i_size updates here. 
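The cleanup rule in the reiserfs_prepare_write() error path above boils down to one comparison. A self-contained restatement of that decision (toy handle type, invented names; it only mirrors the refcount bookkeeping, not the journal):

#include <stdio.h>

struct toy_th {
    int refcount;
};

/* same decision the error path makes: if get_block grew the refcount,
 * either drop our nested reference or end the persistent handle
 */
static void cleanup(struct toy_th *th, int old_ref)
{
    if (th->refcount > old_ref) {
        if (old_ref)
            th->refcount--;     /* nested: just drop the ref */
        else
            printf("end persistent transaction\n");
    }
}

int main(void)
{
    struct toy_th nested = { .refcount = 2 };
    struct toy_th fresh  = { .refcount = 1 };

    cleanup(&nested, 1);   /* refcount back to 1, nothing ended */
    cleanup(&fresh, 0);    /* prints the end-transaction case */
    return 0;
}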
     if (pos > inode->i_size) {
-	struct reiserfs_transaction_handle th ;
+	struct reiserfs_transaction_handle myth ;
 	reiserfs_write_lock(inode->i_sb);
 	/* If the file has grown beyond the border where it
 	   can have a tail, unmark it as needing a tail
@@ -2263,16 +2385,22 @@ static int reiserfs_commit_write(struct
 	    (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
 	    REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;

-	journal_begin(&th, inode->i_sb, 1) ;
+	journal_begin(&myth, inode->i_sb, 1) ;
 	reiserfs_update_inode_transaction(inode) ;
 	inode->i_size = pos ;
-	reiserfs_update_sd(&th, inode) ;
-	journal_end(&th, inode->i_sb, 1) ;
+	reiserfs_update_sd(&myth, inode) ;
+	update_sd = 1;
+	journal_end(&myth, inode->i_sb, 1) ;
+	reiserfs_write_unlock(inode->i_sb);
+    }
+    if (th) {
+	reiserfs_write_lock(inode->i_sb);
+	if (!update_sd)
+	    reiserfs_update_sd(th, inode) ;
+	reiserfs_end_persistent_transaction(th);
 	reiserfs_write_unlock(inode->i_sb);
     }

-    ret = generic_commit_write(f, page, from, to) ;
-
     /* we test for O_SYNC here so we can commit the transaction
     ** for any packed tails the file might have had
     */
@@ -2332,16 +2460,110 @@ void i_attrs_to_sd_attrs( struct inode *
 	}
     }

+/* decide if this buffer needs to stay around for data logging or ordered
+** write purposes
+*/
+static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
+{
+    int ret = 1 ;
+    struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
+
+    spin_lock(&j->j_dirty_buffers_lock) ;
+    if (!buffer_mapped(bh)) {
+	goto free_jh;
+    }
+    /* the page is locked, and the only places that log a data buffer
+     * also lock the page.
+     */
+#if 0
+    if (reiserfs_file_data_log(inode)) {
+	/* very conservative, leave the buffer pinned if anyone might need it.
+	** this should be changed to drop the buffer if it is only in the
+	** current transaction
+	*/
+	if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
+	    ret = 0 ;
+	}
+    } else
+#endif
+    if (buffer_dirty(bh) || buffer_locked(bh)) {
+	struct reiserfs_journal_list *jl;
+	struct reiserfs_jh *jh = bh->b_private;
+
+	/* why is this safe?
+	 * reiserfs_setattr updates i_size in the on disk
+	 * stat data before allowing vmtruncate to be called.
+	 *
+	 * If buffer was put onto the ordered list for this
+	 * transaction, we know for sure either this transaction
+	 * or an older one already has updated i_size on disk,
+	 * and this ordered data won't be referenced in the file
+	 * if we crash.
+	 *
+	 * if the buffer was put onto the ordered list for an older
+	 * transaction, we need to leave it around
+	 */
+	if (jh && (jl = jh->jl) && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
+	    ret = 0;
+    }
+free_jh:
+    if (ret && bh->b_private) {
+	reiserfs_free_jh(bh);
+    }
+    spin_unlock(&j->j_dirty_buffers_lock) ;
+    return ret ;
+}
+
+/* clm -- taken from fs/buffer.c:block_invalidate_page */
+static int reiserfs_invalidatepage(struct page *page, unsigned long offset)
+{
+    struct buffer_head *head, *bh, *next;
+    struct inode *inode = page->mapping->host;
+    unsigned int curr_off = 0;
+    int ret = 1;
+
+    BUG_ON(!PageLocked(page));
+    if (!page_has_buffers(page))
+	goto out;
+
+    head = page_buffers(page);
+    bh = head;
+    do {
+	unsigned int next_off = curr_off + bh->b_size;
+	next = bh->b_this_page;
+
+	/*
+	 * is this block fully invalidated?
+ */ + if (offset <= curr_off) { + if (invalidatepage_can_drop(inode, bh)) + reiserfs_unmap_buffer(bh); + else + ret = 0; + } + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * We release buffers only if the entire page is being invalidated. + * The get_block cached value has been unconditionally invalidated, + * so real IO is not possible anymore. + */ + if (!offset && ret) + ret = try_to_release_page(page, 0); +out: + return ret; +} + /* * Returns 1 if the page's buffers were dropped. The page is locked. * * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads * in the buffers at page_buffers(page). * - * FIXME: Chris says the buffer list is not used with `mount -o notail', - * so in that case the fs can avoid the extra locking. Create a second - * address_space_operations with a NULL ->releasepage and install that - * into new address_spaces. + * even in -o notail mode, we can't be sure an old mount without -o notail + * didn't create files with tails. */ static int reiserfs_releasepage(struct page *page, int unused_gfp_flags) { @@ -2355,11 +2577,13 @@ static int reiserfs_releasepage(struct p head = page_buffers(page) ; bh = head ; do { - if (!buffer_dirty(bh) && !buffer_locked(bh)) { - list_del_init(&bh->b_assoc_buffers) ; - } else { + if (bh->b_private) { + if (!buffer_dirty(bh) && !buffer_locked(bh)) { + reiserfs_free_jh(bh); + } else { ret = 0 ; break ; + } } bh = bh->b_this_page ; } while (bh != head) ; @@ -2381,12 +2605,75 @@ static int reiserfs_direct_IO(int rw, st offset, nr_segs, reiserfs_get_blocks_direct_io, NULL); } +int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { + struct inode *inode = dentry->d_inode ; + int error ; + unsigned int ia_valid = attr->ia_valid; + reiserfs_write_lock(inode->i_sb); + if (attr->ia_valid & ATTR_SIZE) { + /* version 2 items will be caught by the s_maxbytes check + ** done for us in vmtruncate + */ + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && + attr->ia_size > MAX_NON_LFS) { + error = -EFBIG ; + goto out; + } + /* fill in hole pointers in the expanding truncate case. */ + if (attr->ia_size > inode->i_size) { + error = generic_cont_expand(inode, attr->ia_size) ; + if (REISERFS_I(inode)->i_prealloc_count > 0) { + struct reiserfs_transaction_handle th ; + /* we're changing at most 2 bitmaps, inode + super */ + journal_begin(&th, inode->i_sb, 4) ; + reiserfs_discard_prealloc (&th, inode); + journal_end(&th, inode->i_sb, 4) ; + } + if (error) + goto out; + } + } + + if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) || + ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) && + (get_inode_sd_version (inode) == STAT_DATA_V1)) { + /* stat data of format v3.5 has 16 bit uid and gid */ + error = -EINVAL; + goto out; + } + + error = inode_change_ok(inode, attr) ; + if (!error) { + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + error = reiserfs_chown_xattrs (inode, attr); + + if (!error) + error = DQUOT_TRANSFER(inode, attr) ? 
-EDQUOT : 0; + } + if (!error) + inode_setattr(inode, attr) ; + } + + + if (!error && reiserfs_posixacl (inode->i_sb)) { + if (attr->ia_valid & ATTR_MODE) + error = reiserfs_acl_chmod (inode); + } + +out: + reiserfs_write_unlock(inode->i_sb); + return error ; +} + + struct address_space_operations reiserfs_address_space_operations = { .writepage = reiserfs_writepage, .readpage = reiserfs_readpage, .readpages = reiserfs_readpages, .releasepage = reiserfs_releasepage, + .invalidatepage = reiserfs_invalidatepage, .sync_page = block_sync_page, .prepare_write = reiserfs_prepare_write, .commit_write = reiserfs_commit_write, diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/ioctl.c linux-2.6.5-rc1-mm2/fs/reiserfs/ioctl.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/ioctl.c 2004-03-11 03:55:26.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/ioctl.c 2004-03-19 16:10:24.009965624 +0100 @@ -92,6 +92,7 @@ int reiserfs_unpack (struct inode * inod int retval = 0; int index ; struct page *page ; + struct address_space *mapping ; unsigned long write_from ; unsigned long blocksize = inode->i_sb->s_blocksize ; @@ -122,17 +123,19 @@ int reiserfs_unpack (struct inode * inod ** reiserfs_get_block to unpack the tail for us. */ index = inode->i_size >> PAGE_CACHE_SHIFT ; - page = grab_cache_page(inode->i_mapping, index) ; + mapping = inode->i_mapping ; + page = grab_cache_page(mapping, index) ; retval = -ENOMEM; if (!page) { goto out ; } - retval = reiserfs_prepare_write(NULL, page, write_from, blocksize) ; + retval = mapping->a_ops->prepare_write(NULL, page, write_from, write_from) ; if (retval) goto out_unlock ; /* conversion can change page contents, must flush */ flush_dcache_page(page) ; + retval = mapping->a_ops->commit_write(NULL, page, write_from, write_from) ; REISERFS_I(inode)->i_flags |= i_nopack_mask; out_unlock: diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/journal.c linux-2.6.5-rc1-mm2/fs/reiserfs/journal.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/journal.c 2004-03-11 03:55:34.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/journal.c 2004-03-19 16:31:05.280277171 +0100 @@ -32,13 +32,6 @@ ** around too long. ** -- Note, if you call this as an immediate flush from ** from within kupdate, it will ignore the immediate flag -** -** The commit thread -- a writer process for async commits. It allows a -** a process to request a log flush on a task queue. -** the commit will happen once the commit thread wakes up. -** The benefit here is the writer (with whatever -** related locks it has) doesn't have to wait for the -** log blocks to hit disk if it doesn't want to. */ #include @@ -60,6 +53,15 @@ #include #include #include +#include +#include + + +/* gets a struct reiserfs_journal_list * from a list head */ +#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ + j_list)) +#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ + j_working_list)) /* the number of mounted filesystems. 
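JOURNAL_LIST_ENTRY/JOURNAL_WORK_ENTRY above are the stock list_entry() (container_of) trick: recover the containing struct from a pointer to its embedded list_head by subtracting the member offset. A self-contained version with a toy struct (not the real reiserfs_journal_list):

#include <stddef.h>
#include <stdio.h>

struct list_head {
    struct list_head *next, *prev;
};

struct toy_jl {
    unsigned long j_trans_id;
    struct list_head j_list;
};

/* same shape as JOURNAL_LIST_ENTRY: subtract the member offset */
#define TOY_LIST_ENTRY(h) \
    ((struct toy_jl *)((char *)(h) - offsetof(struct toy_jl, j_list)))

int main(void)
{
    struct toy_jl jl = { .j_trans_id = 42 };
    struct list_head *pos = &jl.j_list;

    printf("trans id %lu\n", TOY_LIST_ENTRY(pos)->j_trans_id);   /* 42 */
    return 0;
}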
This is used to decide when to ** start and kill the commit workqueue @@ -78,6 +80,12 @@ static struct workqueue_struct *commit_w #define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */ #define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */ +#define BLOCK_DIRTIED 5 + + +/* journal list state bits */ +#define LIST_TOUCHED 1 +#define LIST_DIRTY 2 /* flags for do_journal_end */ #define FLUSH_ALL 1 /* flush commit and real blocks */ @@ -86,6 +94,9 @@ static struct workqueue_struct *commit_w /* state bits for the journal */ #define WRITERS_BLOCKED 1 /* set when new writers not allowed */ +#define WRITERS_QUEUED 2 /* set when log is full due to too many + * writers + */ static int do_journal_end(struct reiserfs_transaction_handle *,struct super_block *,unsigned long nblocks,int flags) ; static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ; @@ -94,6 +105,9 @@ static int can_dirty(struct reiserfs_jou static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks); static int release_journal_dev( struct super_block *super, struct reiserfs_journal *journal ); +static int dirty_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl); +static void flush_async_commits(void *p); static void init_journal_hash(struct super_block *p_s_sb) { memset(SB_JOURNAL(p_s_sb)->j_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ; @@ -105,11 +119,19 @@ static void init_journal_hash(struct sup ** more details. */ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) { - if (bh) + if (bh) { clear_buffer_dirty(bh); + clear_bit(BH_JTest, &bh->b_state); + } return 0 ; } +static void disable_barrier(struct super_block *s) +{ + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH); + printk("reiserfs: disabling flush barriers on %s\n", reiserfs_bdevname(s)); +} + static struct reiserfs_bitmap_node * allocate_bitmap_node(struct super_block *p_s_sb) { struct reiserfs_bitmap_node *bn ; @@ -367,6 +389,7 @@ static void free_cnode(struct super_bloc static int clear_prepared_bits(struct buffer_head *bh) { clear_bit(BH_JPrepared, &bh->b_state) ; + clear_bit(BH_JRestore_dirty, &bh->b_state) ; return 0 ; } @@ -408,7 +431,6 @@ void reiserfs_check_lock_depth(char *cal #ifdef CONFIG_SMP if (current->lock_depth < 0) { printk("%s called without kernel lock held\n", caller) ; - show_reiserfs_locks() ; BUG() ; } #else @@ -444,52 +466,6 @@ static inline struct reiserfs_journal_cn return cn ; } -/* once upon a time, the journal would deadlock. a lot. Now, when -** CONFIG_REISERFS_CHECK is defined, anytime someone enters a -** transaction, it pushes itself into this ugly static list, and pops -** itself off before calling journal_end. I made a SysRq key to dump -** the list, and tell me what the writers are when I'm deadlocked. */ - - /* are you depending on the compiler - to optimize this function away - everywhere it is called? It is not - obvious how this works, but I - suppose debugging code need not be - clear. 
-Hans */ -static char *journal_writers[512] ; -int push_journal_writer(char *s) { -#ifdef CONFIG_REISERFS_CHECK - int i ; - for (i = 0 ; i < 512 ; i++) { - if (!journal_writers[i]) { - journal_writers[i] = s ; - return i ; - } - } - return -1 ; -#else - return 0 ; -#endif -} -int pop_journal_writer(int index) { -#ifdef CONFIG_REISERFS_CHECK - if (index >= 0) { - journal_writers[index] = NULL ; - } -#endif - return 0 ; -} - -int dump_journal_writers(void) { - int i ; - for (i = 0 ; i < 512 ; i++) { - if (journal_writers[i]) { - printk("%d: %s\n", i, journal_writers[i]) ; - } - } - return 0 ; -} - /* ** this actually means 'can this block be reallocated yet?'. If you set search_all, a block can only be allocated ** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever @@ -518,11 +494,6 @@ int reiserfs_in_journal(struct super_blo *next_zero_bit = 0 ; /* always start this at zero. */ - /* we aren't logging all blocks are safe for reuse */ - if (reiserfs_dont_log(p_s_sb)) { - return 0 ; - } - PROC_INFO_INC( p_s_sb, journal.in_journal ); /* If we aren't doing a search_all, this is a metablock, and it will be logged before use. ** if we crash before the transaction that freed it commits, this transaction won't @@ -550,6 +521,7 @@ int reiserfs_in_journal(struct super_blo /* is it in the current transaction. This should never happen */ if ((cn = get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_hash_table, bl))) { + BUG(); return 1; } @@ -574,18 +546,30 @@ inline void insert_journal_hash(struct r /* lock the current transaction */ inline static void lock_journal(struct super_block *p_s_sb) { - PROC_INFO_INC( p_s_sb, journal.lock_journal ); - while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) { - PROC_INFO_INC( p_s_sb, journal.lock_journal_wait ); - sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ; - } - atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 1) ; + PROC_INFO_INC( p_s_sb, journal.lock_journal ); + down(&SB_JOURNAL(p_s_sb)->j_lock); } /* unlock the current transaction */ inline static void unlock_journal(struct super_block *p_s_sb) { - atomic_dec(&(SB_JOURNAL(p_s_sb)->j_wlock)) ; - wake_up(&(SB_JOURNAL(p_s_sb)->j_wait)) ; + up(&SB_JOURNAL(p_s_sb)->j_lock); +} + +static inline void get_journal_list(struct reiserfs_journal_list *jl) +{ + jl->j_refcount++; +} + +static inline void put_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + if (jl->j_refcount < 1) { + printk("trans id %lu, refcount at %d\n", jl->j_trans_id, + jl->j_refcount); + BUG(); + } + if (--jl->j_refcount == 0) + reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s); } /* @@ -603,6 +587,341 @@ static void cleanup_freed_for_journal_li jl->j_list_bitmap = NULL ; } +static int journal_list_still_alive(struct super_block *s, + unsigned long trans_id) +{ + struct list_head *entry = &SB_JOURNAL(s)->j_journal_list; + struct reiserfs_journal_list *jl; + + if (!list_empty(entry)) { + jl = JOURNAL_LIST_ENTRY(entry->next); + if (jl->j_trans_id <= trans_id) { + return 1; + } + } + return 0; +} + +static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { + char b[BDEVNAME_SIZE]; + + if (buffer_journaled(bh)) { + reiserfs_warning("clm-2084: pinned buffer %lu:%s sent to disk\n", + bh->b_blocknr, bdevname(bh->b_bdev, b)) ; + } + if (uptodate) + set_buffer_uptodate(bh) ; + else + clear_buffer_uptodate(bh) ; + unlock_buffer(bh) ; + put_bh(bh) ; +} + +static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) { + if (uptodate) + 
set_buffer_uptodate(bh) ; + else + clear_buffer_uptodate(bh) ; + unlock_buffer(bh) ; + put_bh(bh) ; +} + +static void submit_logged_buffer(struct buffer_head *bh) { + get_bh(bh) ; + bh->b_end_io = reiserfs_end_buffer_io_sync ; + mark_buffer_notjournal_new(bh) ; + clear_buffer_dirty(bh) ; + if (!test_and_clear_bit(BH_JTest, &bh->b_state)) + BUG(); + if (!buffer_uptodate(bh)) + BUG(); + submit_bh(WRITE, bh) ; +} + +static void submit_ordered_buffer(struct buffer_head *bh) { + get_bh(bh) ; + bh->b_end_io = reiserfs_end_ordered_io; + clear_buffer_dirty(bh) ; + if (!buffer_uptodate(bh)) + BUG(); + submit_bh(WRITE, bh) ; +} + +static int submit_barrier_buffer(struct buffer_head *bh) { + get_bh(bh) ; + bh->b_end_io = reiserfs_end_ordered_io; + clear_buffer_dirty(bh) ; + if (!buffer_uptodate(bh)) + BUG(); + return submit_bh(WRITE_BARRIER, bh) ; +} + +#define CHUNK_SIZE 32 +struct buffer_chunk { + struct buffer_head *bh[CHUNK_SIZE]; + int nr; +}; + +static void write_chunk(struct buffer_chunk *chunk) { + int i; + for (i = 0; i < chunk->nr ; i++) { + submit_logged_buffer(chunk->bh[i]) ; + } + chunk->nr = 0; +} + +static void write_ordered_chunk(struct buffer_chunk *chunk) { + int i; + for (i = 0; i < chunk->nr ; i++) { + submit_ordered_buffer(chunk->bh[i]) ; + } + chunk->nr = 0; +} + +static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, + spinlock_t *lock, + void (fn)(struct buffer_chunk *)) +{ + int ret = 0; + if (chunk->nr >= CHUNK_SIZE) + BUG(); + chunk->bh[chunk->nr++] = bh; + if (chunk->nr >= CHUNK_SIZE) { + ret = 1; + if (lock) + spin_unlock(lock); + fn(chunk); + if (lock) + spin_lock(lock); + } + return ret; +} + + +atomic_t nr_reiserfs_jh = ATOMIC_INIT(0); +static struct reiserfs_jh *alloc_jh(void) { + struct reiserfs_jh *jh; + while(1) { + jh = kmalloc(sizeof(*jh), GFP_NOFS); + if (jh) { + atomic_inc(&nr_reiserfs_jh); + return jh; + } + yield(); + } +} + +/* + * we want to free the jh when the buffer has been written + * and waited on + */ +void reiserfs_free_jh(struct buffer_head *bh) { + struct reiserfs_jh *jh; + + jh = bh->b_private; + if (jh) { + bh->b_private = NULL; + jh->bh = NULL; + list_del_init(&jh->list); + kfree(jh); + if (atomic_read(&nr_reiserfs_jh) <= 0) + BUG(); + atomic_dec(&nr_reiserfs_jh); + put_bh(bh); + } +} + +static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh, + int tail) +{ + struct reiserfs_jh *jh; + + if (bh->b_private) { + spin_lock(&j->j_dirty_buffers_lock); + if (!bh->b_private) { + spin_unlock(&j->j_dirty_buffers_lock); + goto no_jh; + } + jh = bh->b_private; + list_del_init(&jh->list); + } else { +no_jh: + get_bh(bh); + jh = alloc_jh(); + spin_lock(&j->j_dirty_buffers_lock); + /* buffer must be locked for __add_jh, should be able to have + * two adds at the same time + */ + if (bh->b_private) + BUG(); + jh->bh = bh; + bh->b_private = jh; + } + jh->jl = j->j_current_jl; + if (tail) + list_add_tail(&jh->list, &jh->jl->j_tail_bh_list); + else { + list_add_tail(&jh->list, &jh->jl->j_bh_list); + } + spin_unlock(&j->j_dirty_buffers_lock); + return 0; +} + +int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) { + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1); +} +int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) { + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0); +} + +#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list) +static int write_ordered_buffers(spinlock_t *lock, + struct reiserfs_journal *j, + struct reiserfs_journal_list *jl, + struct list_head *list) 
+{
+    struct buffer_head *bh;
+    struct reiserfs_jh *jh;
+    int ret = 0;
+    struct buffer_chunk chunk;
+    struct list_head tmp;
+    INIT_LIST_HEAD(&tmp);
+
+    chunk.nr = 0;
+    spin_lock(lock);
+    while(!list_empty(list)) {
+	jh = JH_ENTRY(list->next);
+	bh = jh->bh;
+	get_bh(bh);
+	if (test_set_buffer_locked(bh)) {
+	    if (!buffer_dirty(bh)) {
+		list_del_init(&jh->list);
+		list_add(&jh->list, &tmp);
+		goto loop_next;
+	    }
+	    spin_unlock(lock);
+	    if (chunk.nr)
+		write_ordered_chunk(&chunk);
+	    wait_on_buffer(bh);
+	    if (need_resched())
+		schedule();
+	    spin_lock(lock);
+	    goto loop_next;
+	}
+	if (buffer_dirty(bh)) {
+	    list_del_init(&jh->list);
+	    list_add(&jh->list, &tmp);
+	    add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
+	} else {
+	    reiserfs_free_jh(bh);
+	    unlock_buffer(bh);
+	}
+loop_next:
+	put_bh(bh);
+	if (chunk.nr == 0 && need_resched()) {
+	    spin_unlock(lock);
+	    schedule();
+	    spin_lock(lock);
+	}
+    }
+    if (chunk.nr) {
+	spin_unlock(lock);
+	write_ordered_chunk(&chunk);
+	spin_lock(lock);
+    }
+    while(!list_empty(&tmp)) {
+	jh = JH_ENTRY(tmp.prev);
+	bh = jh->bh;
+	get_bh(bh);
+	reiserfs_free_jh(bh);
+
+	if (buffer_locked(bh)) {
+	    spin_unlock(lock);
+	    wait_on_buffer(bh);
+	    spin_lock(lock);
+	}
+	if (!buffer_uptodate(bh))
+	    ret = -EIO;
+	put_bh(bh);
+	if (need_resched()) {
+	    spin_unlock(lock);
+	    schedule();
+	    spin_lock(lock);
+	}
+    }
+    spin_unlock(lock);
+    return ret;
+}
+
+static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) {
+    struct reiserfs_journal_list *other_jl;
+    struct reiserfs_journal_list *first_jl;
+    struct list_head *entry;
+    unsigned long trans_id = jl->j_trans_id;
+    unsigned long other_trans_id;
+    unsigned long first_trans_id;
+
+find_first:
+    /*
+     * first we walk backwards to find the oldest uncommitted transaction
+     */
+    first_jl = jl;
+    entry = jl->j_list.prev;
+    while(1) {
+	other_jl = JOURNAL_LIST_ENTRY(entry);
+	if (entry == &SB_JOURNAL(s)->j_journal_list ||
+	    atomic_read(&other_jl->j_older_commits_done))
+	    break;
+
+	first_jl = other_jl;
+	entry = other_jl->j_list.prev;
+    }
+
+    /* if we didn't find any older uncommitted transactions, return now */
+    if (first_jl == jl) {
+	return 0;
+    }
+
+    first_trans_id = first_jl->j_trans_id;
+
+    entry = &first_jl->j_list;
+    while(1) {
+	other_jl = JOURNAL_LIST_ENTRY(entry);
+	other_trans_id = other_jl->j_trans_id;
+
+	if (other_trans_id < trans_id) {
+	    if (atomic_read(&other_jl->j_commit_left) != 0) {
+		flush_commit_list(s, other_jl, 0);
+
+		/* list we were called with is gone, return */
+		if (!journal_list_still_alive(s, trans_id))
+		    return 1;
+
+		/* the one we just flushed is gone, this means all
+		 * older lists are also gone, so first_jl is no longer
+		 * valid either.  Go back to the beginning.
+		 */
+		if (!journal_list_still_alive(s, other_trans_id)) {
+		    goto find_first;
+		}
+	    }
+	    entry = entry->next;
+	    if (entry == &SB_JOURNAL(s)->j_journal_list)
+		return 0;
+	} else {
+	    return 0;
+	}
+    }
+    return 0;
+}
+int reiserfs_async_progress_wait(struct super_block *s) {
+    DEFINE_WAIT(wait);
+    struct reiserfs_journal *j = SB_JOURNAL(s);
+    if (atomic_read(&j->j_async_throttle))
+	blk_congestion_wait(WRITE, HZ/10);
+    return 0;
+}
+
 /*
 ** if this journal list still has commit blocks unflushed, send them to disk.
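The buffer_chunk pattern used by write_ordered_buffers() above batches up to CHUNK_SIZE submissions so the spinlock isn't held across each individual submit. A stripped-down model of just the batching (ints instead of buffer_heads, no locking; illustration only):

#include <stdio.h>

#define CHUNK_SIZE 32

struct chunk {
    int nr;
    int bufs[CHUNK_SIZE];
};

static void write_chunk(struct chunk *c)
{
    /* one batched submit instead of CHUNK_SIZE single ones */
    printf("submitting %d buffers\n", c->nr);
    c->nr = 0;
}

static void add_to_chunk(struct chunk *c, int buf)
{
    c->bufs[c->nr++] = buf;
    if (c->nr == CHUNK_SIZE)      /* flush once the batch is full */
        write_chunk(c);
}

int main(void)
{
    struct chunk c = { .nr = 0 };
    int i;

    for (i = 0; i < 70; i++)
        add_to_chunk(&c, i);
    if (c.nr)                     /* flush the partial final batch */
        write_chunk(&c);          /* prints 32, 32, then 6 */
    return 0;
}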
** @@ -611,13 +930,11 @@ static void cleanup_freed_for_journal_li ** */ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { - int i, count ; - int index = 0 ; + int i; int bn ; - int retry_count = 0 ; - int orig_commit_left = 0 ; struct buffer_head *tbh = NULL ; - struct reiserfs_journal_list *other_jl ; + unsigned long trans_id = jl->j_trans_id; + int barrier = 0; reiserfs_check_lock_depth("flush_commit_list") ; @@ -628,133 +945,129 @@ static int flush_commit_list(struct supe /* before we can put our commit blocks on disk, we have to make sure everyone older than ** us is on disk too */ - if (jl->j_len <= 0) { - return 0 ; - } + if (jl->j_len <= 0) + BUG(); + if (trans_id == SB_JOURNAL(s)->j_trans_id) + BUG(); + + get_journal_list(jl); if (flushall) { - /* we _must_ make sure the transactions are committed in order. Start with the - ** index after this one, wrap all the way around - */ - index = (jl - SB_JOURNAL_LIST(s)) + 1 ; - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) { - other_jl = SB_JOURNAL_LIST(s) + ( (index + i) % JOURNAL_LIST_COUNT) ; - if (other_jl && other_jl != jl && other_jl->j_len > 0 && other_jl->j_trans_id > 0 && - other_jl->j_trans_id <= jl->j_trans_id && (atomic_read(&(jl->j_older_commits_done)) == 0)) { - flush_commit_list(s, other_jl, 0) ; - } + if (flush_older_commits(s, jl) == 1) { + /* list disappeared during flush_older_commits. return */ + goto put_jl; } } - count = 0 ; - /* don't flush the commit list for the current transactoin */ - if (jl == ((SB_JOURNAL_LIST(s) + SB_JOURNAL_LIST_INDEX(s)))) { - return 0 ; - } - /* make sure nobody is trying to flush this one at the same time */ - if (atomic_read(&(jl->j_commit_flushing))) { - sleep_on(&(jl->j_commit_wait)) ; - if (flushall) { - atomic_set(&(jl->j_older_commits_done), 1) ; - } - return 0 ; - } - + down(&jl->j_commit_lock); + if (!journal_list_still_alive(s, trans_id)) { + up(&jl->j_commit_lock); + goto put_jl; + } + if (jl->j_trans_id == 0) + BUG(); + /* this commit is done, exit */ if (atomic_read(&(jl->j_commit_left)) <= 0) { if (flushall) { atomic_set(&(jl->j_older_commits_done), 1) ; } - return 0 ; + up(&jl->j_commit_lock); + goto put_jl; } - /* keeps others from flushing while we are flushing */ - atomic_set(&(jl->j_commit_flushing), 1) ; - - if (jl->j_len > SB_JOURNAL_TRANS_MAX(s)) { - reiserfs_panic(s, "journal-512: flush_commit_list: length is %lu, list number %d\n", jl->j_len, jl - SB_JOURNAL_LIST(s)) ; - return 0 ; - } - - orig_commit_left = atomic_read(&(jl->j_commit_left)) ; - - /* start by checking all the commit blocks in this transaction. - ** Add anyone not on disk into tbh. 
Stop checking once commit_left <= 1, because that means we - ** only have the commit block left - */ -retry: - count = 0 ; - for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && i < (jl->j_len + 1) ; i++) { /* everything but commit_bh */ - bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) % SB_ONDISK_JOURNAL_SIZE(s); + if (!list_empty(&jl->j_bh_list)) { + unlock_kernel(); + write_ordered_buffers(&SB_JOURNAL(s)->j_dirty_buffers_lock, + SB_JOURNAL(s), jl, &jl->j_bh_list); + lock_kernel(); + } + if (!list_empty(&jl->j_bh_list)) + BUG(); + /* + * for the description block and all the log blocks, submit any buffers + * that haven't already reached the disk + */ + atomic_inc(&SB_JOURNAL(s)->j_async_throttle); + for (i = 0 ; i < (jl->j_len + 1) ; i++) { + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) % + SB_ONDISK_JOURNAL_SIZE(s); tbh = journal_find_get_block(s, bn) ; - -/* kill this sanity check */ -if (count > (orig_commit_left + 2)) { -reiserfs_panic(s, "journal-539: flush_commit_list: BAD count(%d) > orig_commit_left(%d)!\n", count, orig_commit_left) ; -} - if (tbh) { - if (buffer_locked(tbh)) { /* wait on it, redo it just to make sure */ - wait_on_buffer(tbh) ; - if (!buffer_uptodate(tbh)) { - reiserfs_panic(s, "journal-584, buffer write failed\n") ; - } - } - if (buffer_dirty(tbh)) { - printk("journal-569: flush_commit_list, block already dirty!\n") ; - } else { - mark_buffer_dirty(tbh) ; - } - ll_rw_block(WRITE, 1, &tbh) ; - count++ ; - put_bh(tbh) ; /* once for our get_hash */ - } + if (buffer_dirty(tbh)) + ll_rw_block(WRITE, 1, &tbh) ; + put_bh(tbh) ; } + atomic_dec(&SB_JOURNAL(s)->j_async_throttle); - /* wait on everyone in tbh before writing commit block*/ - if (count > 0) { - for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && - i < (jl->j_len + 1) ; i++) { /* everything but commit_bh */ - bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ; - tbh = journal_find_get_block(s, bn) ; - - wait_on_buffer(tbh) ; - if (!buffer_uptodate(tbh)) { - reiserfs_panic(s, "journal-601, buffer write failed\n") ; + /* wait on everything written so far before writing the commit + * if we are in barrier mode, send the commit down now + */ + barrier = reiserfs_barrier_flush(s); + if (barrier) { + int ret; + lock_buffer(jl->j_commit_bh); + ret = submit_barrier_buffer(jl->j_commit_bh); + if (ret == -EOPNOTSUPP) { + set_buffer_uptodate(jl->j_commit_bh); + disable_barrier(s); + barrier = 0; } - put_bh(tbh) ; /* once for our get_hash */ - bforget(tbh) ; /* once due to original getblk in do_journal_end */ - atomic_dec(&(jl->j_commit_left)) ; - } - } - - if (atomic_read(&(jl->j_commit_left)) != 1) { /* just the commit_bh left, flush it without calling getblk for everyone */ - if (retry_count < 2) { - printk("journal-582: flush_commit_list, not all log blocks on disk yet, trying again\n") ; - retry_count++ ; - goto retry; - } - reiserfs_panic(s, "journal-563: flush_commit_list: BAD, j_commit_left is %u, should be 1\n", - atomic_read(&(jl->j_commit_left))); } + for (i = 0 ; i < (jl->j_len + 1) ; i++) { + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ; + tbh = journal_find_get_block(s, bn) ; + wait_on_buffer(tbh) ; + // since we're using ll_rw_blk above, it might have skipped over + // a locked buffer. 
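The -EOPNOTSUPP handling above is a try-then-disable fallback for devices without flush/barrier support: attempt the barrier write once, and if the lower layers reject it, clear the feature and redo the write as an ordinary one, like disable_barrier() does. The same control flow in miniature (stub device, invented names):

#include <errno.h>
#include <stdio.h>

static int barriers_enabled = 1;

static int submit_barrier(int blocknr)
{
    return -EOPNOTSUPP;           /* stub: device has no barrier support */
}

static int submit_plain(int blocknr)
{
    printf("plain write of block %d\n", blocknr);
    return 0;
}

static int write_commit_block(int blocknr)
{
    if (barriers_enabled) {
        int ret = submit_barrier(blocknr);
        if (ret != -EOPNOTSUPP)
            return ret;
        barriers_enabled = 0;     /* one-shot disable, then fall back */
    }
    return submit_plain(blocknr);
}

int main(void)
{
    write_commit_block(123);      /* falls back, disables barriers */
    write_commit_block(124);      /* goes straight to the plain path */
    return 0;
}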
Double check here + // + if (buffer_dirty(tbh)) + sync_dirty_buffer(tbh); + if (!buffer_uptodate(tbh)) { + reiserfs_panic(s, "journal-601, buffer write failed\n") ; + } + put_bh(tbh) ; /* once for journal_find_get_block */ + put_bh(tbh) ; /* once due to original getblk in do_journal_end */ + atomic_dec(&(jl->j_commit_left)) ; + } + + if (atomic_read(&(jl->j_commit_left)) != 1) + BUG(); + + if (!barrier) { + if (buffer_dirty(jl->j_commit_bh)) + BUG(); + mark_buffer_dirty(jl->j_commit_bh) ; + sync_dirty_buffer(jl->j_commit_bh) ; + } else + wait_on_buffer(jl->j_commit_bh); - mark_buffer_dirty(jl->j_commit_bh) ; - sync_dirty_buffer(jl->j_commit_bh) ; if (!buffer_uptodate(jl->j_commit_bh)) { reiserfs_panic(s, "journal-615: buffer write failed\n") ; } - atomic_dec(&(jl->j_commit_left)) ; bforget(jl->j_commit_bh) ; + if (SB_JOURNAL(s)->j_last_commit_id != 0 && + (jl->j_trans_id - SB_JOURNAL(s)->j_last_commit_id) != 1) { + reiserfs_warning("clm-2200: last commit %lu, current %lu\n", + SB_JOURNAL(s)->j_last_commit_id, + jl->j_trans_id); + } + SB_JOURNAL(s)->j_last_commit_id = jl->j_trans_id; /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */ cleanup_freed_for_journal_list(s, jl) ; + /* mark the metadata dirty */ + dirty_one_transaction(s, jl); + atomic_dec(&(jl->j_commit_left)) ; + if (flushall) { atomic_set(&(jl->j_older_commits_done), 1) ; } - atomic_set(&(jl->j_commit_flushing), 0) ; - wake_up(&(jl->j_commit_wait)) ; - - s->s_dirt = 1 ; + up(&jl->j_commit_lock); +put_jl: + put_journal_list(s, jl); + return 0 ; } @@ -829,8 +1142,22 @@ static int _update_journal_header_block( jh->j_last_flush_trans_id = cpu_to_le32(trans_id) ; jh->j_first_unflushed_offset = cpu_to_le32(offset) ; jh->j_mount_id = cpu_to_le32(SB_JOURNAL(p_s_sb)->j_mount_id) ; - set_buffer_dirty(SB_JOURNAL(p_s_sb)->j_header_bh) ; - sync_dirty_buffer(SB_JOURNAL(p_s_sb)->j_header_bh) ; + + if (reiserfs_barrier_flush(p_s_sb)) { + int ret; + lock_buffer(SB_JOURNAL(p_s_sb)->j_header_bh); + ret = submit_barrier_buffer(SB_JOURNAL(p_s_sb)->j_header_bh); + if (ret == -EOPNOTSUPP) { + set_buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh); + disable_barrier(p_s_sb); + goto sync; + } + wait_on_buffer(SB_JOURNAL(p_s_sb)->j_header_bh); + } else { +sync: + set_buffer_dirty(SB_JOURNAL(p_s_sb)->j_header_bh) ; + sync_dirty_buffer(SB_JOURNAL(p_s_sb)->j_header_bh) ; + } if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) { printk( "reiserfs: journal-837: IO error during journal replay\n" ); return -EIO ; @@ -851,45 +1178,35 @@ static int update_journal_header_block(s ** flush any and all journal lists older than you are ** can only be called from flush_journal_list */ -static int flush_older_journal_lists(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, unsigned long trans_id) { - int i, index ; - struct reiserfs_journal_list *other_jl ; - - index = jl - SB_JOURNAL_LIST(p_s_sb) ; - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) { - other_jl = SB_JOURNAL_LIST(p_s_sb) + ((index + i) % JOURNAL_LIST_COUNT) ; - if (other_jl && other_jl->j_len > 0 && - other_jl->j_trans_id > 0 && - other_jl->j_trans_id < trans_id && - other_jl != jl) { - /* do not flush all */ - flush_journal_list(p_s_sb, other_jl, 0) ; +static int flush_older_journal_lists(struct super_block *p_s_sb, + struct reiserfs_journal_list *jl) +{ + struct list_head *entry; + struct reiserfs_journal_list *other_jl ; + unsigned long trans_id = jl->j_trans_id; + + /* we know we are the only ones flushing things, no extra 
race + * protection is required. + */ +restart: + entry = SB_JOURNAL(p_s_sb)->j_journal_list.next; + other_jl = JOURNAL_LIST_ENTRY(entry); + if (other_jl->j_trans_id < trans_id) { + /* do not flush all */ + flush_journal_list(p_s_sb, other_jl, 0) ; + + /* other_jl is now deleted from the list */ + goto restart; } - } - return 0 ; + return 0 ; } -static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; - - if (buffer_journaled(bh)) { - reiserfs_warning("clm-2084: pinned buffer %lu:%s sent to disk\n", - bh->b_blocknr, bdevname(bh->b_bdev, b)) ; +static void del_from_work_list(struct super_block *s, + struct reiserfs_journal_list *jl) { + if (!list_empty(&jl->j_working_list)) { + list_del_init(&jl->j_working_list); + SB_JOURNAL(s)->j_num_work_lists--; } - if (uptodate) - set_buffer_uptodate(bh) ; - else - clear_buffer_uptodate(bh) ; - unlock_buffer(bh) ; - put_bh(bh) ; -} -static void submit_logged_buffer(struct buffer_head *bh) { - lock_buffer(bh) ; - get_bh(bh) ; - bh->b_end_io = reiserfs_end_buffer_io_sync ; - mark_buffer_notjournal_new(bh) ; - clear_buffer_dirty(bh) ; - submit_bh(WRITE, bh) ; } /* flush a journal list, both commit and real blocks @@ -912,29 +1229,26 @@ static int flush_journal_list(struct sup unsigned long j_len_saved = jl->j_len ; if (j_len_saved <= 0) { - return 0 ; + BUG(); } if (atomic_read(&SB_JOURNAL(s)->j_wcount) != 0) { reiserfs_warning("clm-2048: flush_journal_list called with wcount %d\n", atomic_read(&SB_JOURNAL(s)->j_wcount)) ; } - /* if someone is getting the commit list, we must wait for them */ - while (atomic_read(&(jl->j_commit_flushing))) { - sleep_on(&(jl->j_commit_wait)) ; - } - /* if someone is flushing this list, we must wait for them */ - while (atomic_read(&(jl->j_flushing))) { - sleep_on(&(jl->j_flush_wait)) ; - } + if (jl->j_trans_id == 0) + BUG(); - /* this list is now ours, we can change anything we want */ - atomic_set(&(jl->j_flushing), 1) ; + /* if flushall == 0, the lock is already held */ + if (flushall) { + down(&SB_JOURNAL(s)->j_flush_sem); + } else if (!down_trylock(&SB_JOURNAL(s)->j_flush_sem)) { + BUG(); + } count = 0 ; if (j_len_saved > SB_JOURNAL_TRANS_MAX(s)) { - reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, list number %d\n", j_len_saved, jl - SB_JOURNAL_LIST(s)) ; - atomic_dec(&(jl->j_flushing)) ; + reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, trans id %lu\n", j_len_saved, jl->j_trans_id); return 0 ; } @@ -949,6 +1263,9 @@ static int flush_journal_list(struct sup */ flush_commit_list(s, jl, 1) ; + if (!(jl->j_state & LIST_DIRTY)) + BUG(); + /* are we done now? */ if (atomic_read(&(jl->j_nonzerolen)) <= 0 && atomic_read(&(jl->j_commit_left)) <= 0) { @@ -984,13 +1301,13 @@ static int flush_journal_list(struct sup get_bh(saved_bh) ; if (buffer_journal_dirty(saved_bh)) { + if (!can_dirty(cn)) + BUG(); was_jwait = 1 ; - mark_buffer_notjournal_dirty(saved_bh) ; - /* undo the inc from journal_mark_dirty */ - put_bh(saved_bh) ; - } - if (can_dirty(cn)) { was_dirty = 1 ; + } else if (can_dirty(cn)) { + /* everything with !pjl && jwait should be writable */ + BUG(); } } @@ -998,7 +1315,8 @@ static int flush_journal_list(struct sup ** sure they are commited, and don't try writing it to disk */ if (pjl) { - flush_commit_list(s, pjl, 1) ; + if (atomic_read(&pjl->j_commit_left)) + flush_commit_list(s, pjl, 1) ; goto free_cnode ; } @@ -1017,22 +1335,17 @@ static int flush_journal_list(struct sup printk("journal-813: BAD! 
buffer %llu %cdirty %cjwait, not in a newer tranasction\n", (unsigned long long)saved_bh->b_blocknr, was_dirty ? ' ' : '!', was_jwait ? ' ' : '!') ; } - /* kupdate_one_transaction waits on the buffers it is writing, so we - ** should never see locked buffers here - */ - if (buffer_locked(saved_bh)) { - printk("clm-2083: locked buffer %llu in flush_journal_list\n", - (unsigned long long)saved_bh->b_blocknr) ; - wait_on_buffer(saved_bh) ; - if (!buffer_uptodate(saved_bh)) { - reiserfs_panic(s, "journal-923: buffer write failed\n") ; - } - } if (was_dirty) { /* we inc again because saved_bh gets decremented at free_cnode */ get_bh(saved_bh) ; set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ; - submit_logged_buffer(saved_bh) ; + lock_buffer(saved_bh); + if (cn->blocknr != saved_bh->b_blocknr) + BUG(); + if (buffer_dirty(saved_bh)) + submit_logged_buffer(saved_bh) ; + else + unlock_buffer(saved_bh); count++ ; } else { printk("clm-2082: Unable to flush buffer %llu in flush_journal_list\n", @@ -1063,6 +1376,14 @@ free_cnode: if (!buffer_uptodate(cn->bh)) { reiserfs_panic(s, "journal-949: buffer write failed\n") ; } + /* note, we must clear the JDirty_wait bit after the up to date + ** check, otherwise we race against our flushpage routine + */ + if (!test_and_clear_bit(BH_JDirty_wait, &cn->bh->b_state)) + BUG(); + + /* undo the inc from journal_mark_dirty */ + put_bh(cn->bh) ; brelse(cn->bh) ; } cn = cn->next ; @@ -1076,7 +1397,7 @@ flush_older_and_return: ** replayed after a crash */ if (flushall) { - flush_older_journal_lists(s, jl, jl->j_trans_id) ; + flush_older_journal_lists(s, jl); } /* before we can remove everything from the hash tables for this @@ -1091,181 +1412,224 @@ flush_older_and_return: update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(s), jl->j_trans_id) ; } remove_all_from_journal_list(s, jl, 0) ; + list_del(&jl->j_list); + SB_JOURNAL(s)->j_num_lists--; + del_from_work_list(s, jl); + + if (SB_JOURNAL(s)->j_last_flush_id != 0 && + (jl->j_trans_id - SB_JOURNAL(s)->j_last_flush_id) != 1) { + reiserfs_warning("clm-2201: last flush %lu, current %lu\n", + SB_JOURNAL(s)->j_last_flush_id, + jl->j_trans_id); + } + SB_JOURNAL(s)->j_last_flush_id = jl->j_trans_id; + + /* not strictly required since we are freeing the list, but it should + * help find code using dead lists later on + */ jl->j_len = 0 ; atomic_set(&(jl->j_nonzerolen), 0) ; jl->j_start = 0 ; jl->j_realblock = NULL ; jl->j_commit_bh = NULL ; jl->j_trans_id = 0 ; - atomic_dec(&(jl->j_flushing)) ; - wake_up(&(jl->j_flush_wait)) ; + jl->j_state = 0; + put_journal_list(s, jl); + if (flushall) + up(&SB_JOURNAL(s)->j_flush_sem); return 0 ; } - -static int kupdate_one_transaction(struct super_block *s, - struct reiserfs_journal_list *jl) +static int write_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl, + struct buffer_chunk *chunk) { - struct reiserfs_journal_list *pjl ; /* previous list for this cn */ - struct reiserfs_journal_cnode *cn, *walk_cn ; - b_blocknr_t blocknr ; - int run = 0 ; - int orig_trans_id = jl->j_trans_id ; - struct buffer_head *saved_bh ; + struct reiserfs_journal_cnode *cn; int ret = 0 ; - /* if someone is getting the commit list, we must wait for them */ - while (atomic_read(&(jl->j_commit_flushing))) { - sleep_on(&(jl->j_commit_wait)) ; - } - /* if someone is flushing this list, we must wait for them */ - while (atomic_read(&(jl->j_flushing))) { - sleep_on(&(jl->j_flush_wait)) ; + jl->j_state |= LIST_TOUCHED; + del_from_work_list(s, jl); + if (jl->j_len == 
0 || atomic_read(&jl->j_nonzerolen) == 0) { + return 0; } - /* was it flushed while we slept? */ - if (jl->j_len <= 0 || jl->j_trans_id != orig_trans_id) { - return 0 ; - } - - /* this list is now ours, we can change anything we want */ - atomic_set(&(jl->j_flushing), 1) ; -loop_start: cn = jl->j_realblock ; while(cn) { - saved_bh = NULL ; /* if the blocknr == 0, this has been cleared from the hash, ** skip it */ if (cn->blocknr == 0) { goto next ; } + if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) { + struct buffer_head *tmp_bh; + /* we can race against journal_mark_freed when we try + * to lock_buffer(cn->bh), so we have to inc the buffer + * count, and recheck things after locking + */ + tmp_bh = cn->bh; + get_bh(tmp_bh); + lock_buffer(tmp_bh); + if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) { + if (!buffer_journal_dirty(tmp_bh) || + reiserfs_buffer_prepared(tmp_bh)) + BUG(); + add_to_chunk(chunk, tmp_bh, NULL, write_chunk); + ret++; + } else { + /* note, cn->bh might be null now */ + unlock_buffer(tmp_bh); + } + put_bh(tmp_bh); + } +next: + cn = cn->next ; + cond_resched(); + } + return ret ; +} + +/* used by flush_commit_list */ +static int dirty_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal_list *pjl; + int ret = 0 ; + + jl->j_state |= LIST_DIRTY; + cn = jl->j_realblock ; + while(cn) { /* look for a more recent transaction that logged this ** buffer. Only the most recent transaction with a buffer in ** it is allowed to send that buffer to disk */ - pjl = find_newer_jl_for_cn(cn) ; - if (run == 0 && !pjl && cn->bh && buffer_journal_dirty(cn->bh) && - can_dirty(cn)) - { - if (!test_bit(BH_JPrepared, &cn->bh->b_state)) { - set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ; - submit_logged_buffer(cn->bh) ; - } else { - /* someone else is using this buffer. We can't - ** send it to disk right now because they might - ** be changing/logging it. - */ - ret = 1 ; - } - } else if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { - clear_bit(BLOCK_NEEDS_FLUSH, &cn->state) ; - if (!pjl && cn->bh) { - wait_on_buffer(cn->bh) ; - } - /* check again, someone could have logged while we scheduled */ - pjl = find_newer_jl_for_cn(cn) ; - - /* before the JDirty_wait bit is set, the - ** buffer is added to the hash list. So, if we are - ** run in the middle of a do_journal_end, we will notice - ** if this buffer was logged and added from the latest - ** transaction. In this case, we don't want to decrement - ** b_count - */ - if (!pjl && cn->bh && buffer_journal_dirty(cn->bh)) { - blocknr = cn->blocknr ; - walk_cn = cn ; - saved_bh= cn->bh ; - /* update all older transactions to show this block - ** was flushed - */ - mark_buffer_notjournal_dirty(cn->bh) ; - while(walk_cn) { - if (walk_cn->bh && walk_cn->blocknr == blocknr && - walk_cn->sb == cn->sb) { - if (walk_cn->jlist) { - atomic_dec(&(walk_cn->jlist->j_nonzerolen)) ; - } - walk_cn->bh = NULL ; - } - walk_cn = walk_cn->hnext ; - } - if (atomic_read(&saved_bh->b_count) < 1) { - reiserfs_warning("clm-2081: bad count on %lu\n", - saved_bh->b_blocknr) ; - } - brelse(saved_bh) ; - } - } - /* - ** if the more recent transaction is committed to the log, - ** this buffer can be considered flushed. Decrement our - ** counters to reflect one less buffer that needs writing. - ** - ** note, this relies on all of the above code being - ** schedule free once pjl comes back non-null. 
- */ - if (pjl && cn->bh && atomic_read(&pjl->j_commit_left) == 0) { - atomic_dec(&cn->jlist->j_nonzerolen) ; - cn->bh = NULL ; + pjl = find_newer_jl_for_cn(cn) ; + if (!pjl && cn->blocknr && cn->bh && buffer_journal_dirty(cn->bh)) + { + if (!can_dirty(cn)) + BUG(); + /* if the buffer is prepared, it will either be logged + * or restored. If restored, we need to make sure + * it actually gets marked dirty + */ + mark_buffer_notjournal_new(cn->bh) ; + if (test_bit(BH_JPrepared, &cn->bh->b_state)) { + set_bit(BH_JRestore_dirty, &cn->bh->b_state); + } else { + set_bit(BH_JTest, &cn->bh->b_state); + mark_buffer_dirty(cn->bh); + } } -next: cn = cn->next ; } - /* the first run through the loop sends all the dirty buffers to - ** ll_rw_block. - ** the second run through the loop does all the accounting - */ - if (run++ == 0) { - goto loop_start ; - } - - atomic_set(&(jl->j_flushing), 0) ; - wake_up(&(jl->j_flush_wait)) ; return ret ; } -/* since we never give dirty buffers to bdflush/kupdate, we have to -** flush them ourselves. This runs through the journal lists, finds -** old metadata in need of flushing and sends it to disk. -** this does not end transactions, commit anything, or free -** cnodes. -** -** returns the highest transaction id that was flushed last time -*/ -static unsigned long reiserfs_journal_kupdate(struct super_block *s) { - struct reiserfs_journal_list *jl ; - int i ; - int start ; - time_t age ; - int ret = 0 ; - start = SB_JOURNAL_LIST_INDEX(s) ; +static int kupdate_transactions(struct super_block *s, + struct reiserfs_journal_list *jl, + struct reiserfs_journal_list **next_jl, + unsigned long *next_trans_id, + int num_blocks, + int num_trans) { + int ret = 0; + int written = 0 ; + int transactions_flushed = 0; + unsigned long orig_trans_id = jl->j_trans_id; + struct buffer_chunk chunk; + struct list_head *entry; + chunk.nr = 0; + + down(&SB_JOURNAL(s)->j_flush_sem); + if (!journal_list_still_alive(s, orig_trans_id)) { + goto done; + } + + /* we've got j_flush_sem held, nobody is going to delete any + * of these lists out from underneath us + */ + while((num_trans && transactions_flushed < num_trans) || + (!num_trans && written < num_blocks)) { + + if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) || + atomic_read(&jl->j_commit_left)) + { + del_from_work_list(s, jl); + break; + } + ret = write_one_transaction(s, jl, &chunk); - /* safety check to prevent flush attempts during a mount */ - if (start < 0) { - return 0 ; - } - i = (start + 1) % JOURNAL_LIST_COUNT ; - while(i != start) { - jl = SB_JOURNAL_LIST(s) + i ; - age = get_seconds() - jl->j_timestamp ; - if (jl->j_len > 0 && // age >= (JOURNAL_MAX_COMMIT_AGE * 2) && - atomic_read(&(jl->j_nonzerolen)) > 0 && - atomic_read(&(jl->j_commit_left)) == 0) { - - if (jl->j_trans_id == SB_JOURNAL(s)->j_trans_id) { - break ; - } - /* if ret was already 1, we want to preserve that */ - ret |= kupdate_one_transaction(s, jl) ; - } - if (atomic_read(&(jl->j_nonzerolen)) > 0) { - ret |= 1 ; + if (ret < 0) + goto done; + transactions_flushed++; + written += ret; + entry = jl->j_list.next; + + /* did we wrap? 
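** (reaching the list head again means every journal list has been scanned and there is nothing newer left to write back)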
*/
+    if (entry == &SB_JOURNAL(s)->j_journal_list) {
+      break;
     }
+    jl = JOURNAL_LIST_ENTRY(entry);
+
+    /* don't bother with older transactions */
+    if (jl->j_trans_id <= orig_trans_id)
+      break;
   }
+  if (chunk.nr) {
+    write_chunk(&chunk);
+  }
+
+done:
+  up(&SB_JOURNAL(s)->j_flush_sem);
+  return ret;
+}
+
+/* for o_sync and fsync heavy applications, they tend to use
+** all the journal list slots with tiny transactions.  These
+** trigger lots and lots of calls to update the header block, which
+** adds seeks and slows things down.
+**
+** This function tries to clear out a large chunk of the journal lists
+** at once, which makes everything faster since only the newest journal
+** list updates the header block
+*/
+static int flush_used_journal_lists(struct super_block *s,
+                                    struct reiserfs_journal_list *jl) {
+  unsigned long len = 0;
+  unsigned long cur_len;
+  int ret;
+  int i;
+  struct reiserfs_journal_list *tjl;
+  struct reiserfs_journal_list *flush_jl;
+  unsigned long trans_id;
+
+  flush_jl = tjl = jl;
+
+  /* flush for 256 transactions or 256 blocks, whichever comes first */
+  for(i = 0 ; i < 256 && len < 256 ; i++) {
+    if (atomic_read(&tjl->j_commit_left) ||
+        tjl->j_trans_id < jl->j_trans_id) {
+      break;
+    }
+    cur_len = atomic_read(&tjl->j_nonzerolen);
+    if (cur_len > 0) {
+      tjl->j_state &= ~LIST_TOUCHED;
+    }
+    len += cur_len;
+    flush_jl = tjl;
+    if (tjl->j_list.next == &SB_JOURNAL(s)->j_journal_list)
+      break;
+    tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
+  }
+  /* try to find a group of blocks we can flush across all the
+  ** transactions, but only bother if we've actually spanned
+  ** across multiple lists
+  */
+  if (flush_jl != jl) {
+    ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
+  }
+  flush_journal_list(s, flush_jl, 1);
+  return 0;
 }
 
 /*
@@ -1309,6 +1673,10 @@ void remove_journal_hash(struct super_bl
 }
 
 static void free_journal_ram(struct super_block *p_s_sb) {
+  reiserfs_kfree(SB_JOURNAL(p_s_sb)->j_current_jl,
+                 sizeof(struct reiserfs_journal_list), p_s_sb);
+  SB_JOURNAL(p_s_sb)->j_num_lists--;
+
   vfree(SB_JOURNAL(p_s_sb)->j_cnode_free_orig) ;
   free_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap) ;
   free_bitmap_nodes(p_s_sb) ; /* must be after free_list_bitmaps */
@@ -1439,7 +1807,7 @@ static int journal_transaction_is_valid(
   }
   brelse(c_bh) ;
   reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1006: found valid "
-                 "transaction start offset %lu, len %d id %d\n",
+                 "transaction start offset %llu, len %d id %d\n",
                  d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
                  get_desc_trans_len(desc), get_desc_trans_id(desc)) ;
   return 1 ;
@@ -1479,7 +1847,7 @@ static int journal_read_transaction(stru
   desc = (struct reiserfs_journal_desc *)d_bh->b_data ;
   trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
   reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: "
-                 "journal_read_transaction, offset %lu, len %d mount_id %d\n",
+                 "journal_read_transaction, offset %llu, len %d mount_id %d\n",
                  d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
                  get_desc_trans_len(desc), get_desc_mount_id(desc)) ;
   if (get_desc_trans_id(desc) < oldest_trans_id) {
@@ -1507,7 +1875,7 @@ static int journal_read_transaction(stru
   commit = (struct reiserfs_journal_commit *)c_bh->b_data ;
   if (journal_compare_desc_commit(p_s_sb, desc, commit)) {
     reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal_read_transaction, "
-                   "commit offset %ld had bad time %d or length %d\n",
+                   "commit offset %llu had bad time %d or length %d\n",
c_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), get_commit_trans_id(commit), get_commit_trans_len(commit)); brelse(c_bh) ; @@ -1675,7 +2043,7 @@ static int journal_read(struct super_blo printk("reiserfs: checking transaction log (%s) for (%s)\n", bdevname(SB_JOURNAL(p_s_sb)->j_dev_bd, b), reiserfs_bdevname(p_s_sb)); - start = get_seconds() ; + start = get_seconds(); /* step 1, read in the journal header block. Check the transaction it says ** is the first unflushed, and if that transaction is not valid, @@ -1735,7 +2103,7 @@ static int journal_read(struct super_blo oldest_start = d_bh->b_blocknr ; newest_mount_id = get_desc_mount_id(desc) ; reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1179: Setting " - "oldest_start to offset %lu, trans_id %lu\n", + "oldest_start to offset %llu, trans_id %lu\n", oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), oldest_trans_id) ; } else if (oldest_trans_id > get_desc_trans_id(desc)) { @@ -1763,7 +2131,7 @@ start_log_replay: cur_dblock = oldest_start ; if (oldest_trans_id) { reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1206: Starting replay " - "from offset %lu, trans_id %lu\n", + "from offset %llu, trans_id %lu\n", cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), oldest_trans_id) ; @@ -1817,70 +2185,28 @@ start_log_replay: return 0 ; } - -struct reiserfs_journal_commit_task { - struct super_block *p_s_sb ; - int jindex ; - int wake_on_finish ; /* if this is one, we wake the task_done queue, if it - ** is zero, we free the whole struct on finish - */ - struct reiserfs_journal_commit_task *self ; - struct work_struct work; -} ; - -static void reiserfs_journal_commit_task_func(void *__ct) { - struct reiserfs_journal_commit_task *ct = __ct; - struct reiserfs_journal_list *jl ; - - reiserfs_write_lock(ct->p_s_sb); - - jl = SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex ; - - flush_commit_list(ct->p_s_sb, SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex, 1) ; - - if (jl->j_len > 0 && atomic_read(&(jl->j_nonzerolen)) > 0 && - atomic_read(&(jl->j_commit_left)) == 0) { - kupdate_one_transaction(ct->p_s_sb, jl) ; - } - reiserfs_kfree(ct->self, sizeof(struct reiserfs_journal_commit_task), ct->p_s_sb) ; - reiserfs_write_unlock(ct->p_s_sb); -} - -static void setup_commit_task_arg(struct reiserfs_journal_commit_task *ct, - struct super_block *p_s_sb, - int jindex) { - if (!ct) { - reiserfs_panic(NULL, "journal-1360: setup_commit_task_arg called with NULL struct\n") ; - } - ct->p_s_sb = p_s_sb ; - ct->jindex = jindex ; - INIT_WORK(&ct->work, reiserfs_journal_commit_task_func, ct); - ct->self = ct ; -} - -static void commit_flush_async(struct super_block *p_s_sb, int jindex) { - struct reiserfs_journal_commit_task *ct ; - /* using GFP_NOFS, GFP_KERNEL could try to flush inodes, which will try - ** to start/join a transaction, which will deadlock - */ - ct = reiserfs_kmalloc(sizeof(struct reiserfs_journal_commit_task), GFP_NOFS, p_s_sb) ; - if (ct) { - setup_commit_task_arg(ct, p_s_sb, jindex) ; - queue_work(commit_wq, &ct->work) ; - } else { -#ifdef CONFIG_REISERFS_CHECK - reiserfs_warning("journal-1540: kmalloc failed, doing sync commit\n") ; -#endif - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ; - } +static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) +{ + struct reiserfs_journal_list *jl; +retry: + jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, s); + if (!jl) { + yield(); + goto retry; + } + memset(jl, 0, sizeof(*jl)); + INIT_LIST_HEAD(&jl->j_list); + INIT_LIST_HEAD(&jl->j_working_list); 
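+  /* the two bh lists initialized below carry ordered-mode data buffers
+  ** tied to this transaction: flush_commit_list drains j_bh_list through
+  ** write_ordered_buffers() before any log block is written, and
+  ** j_tail_bh_list appears to serve tail conversion the same way.
+  */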
+ INIT_LIST_HEAD(&jl->j_tail_bh_list); + INIT_LIST_HEAD(&jl->j_bh_list); + sema_init(&jl->j_commit_lock, 1); + SB_JOURNAL(s)->j_num_lists++; + get_journal_list(jl); + return jl; } static void journal_list_init(struct super_block *p_s_sb) { - int i ; - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) { - init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_commit_wait)) ; - init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_flush_wait)) ; - } + SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb); } static int release_journal_dev( struct super_block *super, @@ -1971,6 +2297,7 @@ int journal_init(struct super_block *p_s struct reiserfs_super_block * rs; struct reiserfs_journal_header *jh; struct reiserfs_journal *journal; + struct reiserfs_journal_list *jl; char b[BDEVNAME_SIZE]; journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof (struct reiserfs_journal)) ; @@ -1981,6 +2308,8 @@ int journal_init(struct super_block *p_s memset(journal, 0, sizeof(struct reiserfs_journal)) ; INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_bitmap_nodes) ; INIT_LIST_HEAD (&SB_JOURNAL(p_s_sb)->j_prealloc_list); + INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_working_list); + INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_journal_list); reiserfs_allocate_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap, SB_BMAP_NR(p_s_sb)) ; allocate_bitmap_nodes(p_s_sb) ; @@ -2088,14 +2417,9 @@ int journal_init(struct super_block *p_s brelse (bhjh); SB_JOURNAL(p_s_sb)->j_list_bitmap_index = 0 ; - SB_JOURNAL_LIST_INDEX(p_s_sb) = -10000 ; /* make sure flush_old_commits does not try to flush a list while replay is on */ - - /* clear out the journal list array */ - memset(SB_JOURNAL_LIST(p_s_sb), 0, sizeof(struct reiserfs_journal_list) * JOURNAL_LIST_COUNT) ; journal_list_init(p_s_sb) ; memset(SB_JOURNAL(p_s_sb)->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ; - memset(journal_writers, 0, sizeof(char *) * 512) ; /* debug code */ INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_dirty_buffers) ; spin_lock_init(&SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock) ; @@ -2104,18 +2428,19 @@ int journal_init(struct super_block *p_s SB_JOURNAL(p_s_sb)->j_len = 0 ; SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ; atomic_set(&(SB_JOURNAL(p_s_sb)->j_wcount), 0) ; + atomic_set(&(SB_JOURNAL(p_s_sb)->j_async_throttle), 0) ; SB_JOURNAL(p_s_sb)->j_bcount = 0 ; SB_JOURNAL(p_s_sb)->j_trans_start_time = 0 ; SB_JOURNAL(p_s_sb)->j_last = NULL ; SB_JOURNAL(p_s_sb)->j_first = NULL ; init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; - init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_wait)) ; + sema_init(&SB_JOURNAL(p_s_sb)->j_lock, 1); + sema_init(&SB_JOURNAL(p_s_sb)->j_flush_sem, 1); SB_JOURNAL(p_s_sb)->j_trans_id = 10 ; SB_JOURNAL(p_s_sb)->j_mount_id = 10 ; SB_JOURNAL(p_s_sb)->j_state = 0 ; atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ; - atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 0) ; SB_JOURNAL(p_s_sb)->j_cnode_free_list = allocate_cnodes(num_cnodes) ; SB_JOURNAL(p_s_sb)->j_cnode_free_orig = SB_JOURNAL(p_s_sb)->j_cnode_free_list ; SB_JOURNAL(p_s_sb)->j_cnode_free = SB_JOURNAL(p_s_sb)->j_cnode_free_list ? 
num_cnodes : 0 ; @@ -2123,8 +2448,9 @@ int journal_init(struct super_block *p_s SB_JOURNAL(p_s_sb)->j_must_wait = 0 ; init_journal_hash(p_s_sb) ; - SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb)) ; - if (!(SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap)) { + jl = SB_JOURNAL(p_s_sb)->j_current_jl; + jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl); + if (!jl->j_list_bitmap) { reiserfs_warning("journal-2005, get_list_bitmap failed for journal list 0\n") ; goto free_and_return; } @@ -2132,16 +2458,12 @@ int journal_init(struct super_block *p_s reiserfs_warning("Replay Failure, unable to mount\n") ; goto free_and_return; } - SB_JOURNAL_LIST_INDEX(p_s_sb) = 0 ; /* once the read is done, we can set this - where it belongs */ - - if (reiserfs_dont_log (p_s_sb)) - return 0; reiserfs_mounted_fs_count++ ; if (reiserfs_mounted_fs_count <= 1) commit_wq = create_workqueue("reiserfs"); + INIT_WORK(&journal->j_work, flush_async_commits, p_s_sb); return 0 ; free_and_return: free_journal_ram(p_s_sb); @@ -2155,7 +2477,8 @@ free_and_return: */ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) { time_t now = get_seconds() ; - if (reiserfs_dont_log(th->t_super)) + /* cannot restart while nested */ + if (th->t_refcount > 1) return 0 ; if ( SB_JOURNAL(th->t_super)->j_must_wait > 0 || (SB_JOURNAL(th->t_super)->j_len_alloc + new_alloc) >= SB_JOURNAL_MAX_BATCH(th->t_super) || @@ -2193,6 +2516,35 @@ void reiserfs_wait_on_write_block(struct !test_bit(WRITERS_BLOCKED, &SB_JOURNAL(s)->j_state)) ; } +static void queue_log_writer(struct super_block *s) { + set_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state); + sleep_on(&SB_JOURNAL(s)->j_join_wait); +} + +static void wake_queued_writers(struct super_block *s) { + if (test_and_clear_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state)) + wake_up(&SB_JOURNAL(s)->j_join_wait); +} + +static void let_transaction_grow(struct super_block *sb, + unsigned long trans_id) +{ + unsigned long bcount = SB_JOURNAL(sb)->j_bcount; + while(1) { + yield(); + while ((atomic_read(&SB_JOURNAL(sb)->j_wcount) > 0 || + atomic_read(&SB_JOURNAL(sb)->j_jlock)) && + SB_JOURNAL(sb)->j_trans_id == trans_id) { + queue_log_writer(sb); + } + if (SB_JOURNAL(sb)->j_trans_id != trans_id) + break; + if (bcount == SB_JOURNAL(sb)->j_bcount) + break; + bcount = SB_JOURNAL(sb)->j_bcount; + } +} + /* join == true if you must join an existing transaction. 
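** (journal_join passes join == 1, journal_begin passes join == 0.  For
** code that may already be running inside a transaction, the new
** reiserfs_persistent_transaction() below wraps this up; roughly:
**
**   th = reiserfs_persistent_transaction(s, nblocks) ;
**   ...mark buffers dirty in the usual way...
**   reiserfs_end_persistent_transaction(th) ;
**
** handles are refcounted, so a nested begin reuses the running handle
** instead of starting a second transaction.)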
** join == false if you can deal with waiting for others to finish ** @@ -2202,92 +2554,170 @@ void reiserfs_wait_on_write_block(struct static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,unsigned long nblocks,int join) { time_t now = get_seconds() ; int old_trans_id ; + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_transaction_handle myth; + int sched_count = 0; reiserfs_check_lock_depth("journal_begin") ; RFALSE( p_s_sb->s_flags & MS_RDONLY, "clm-2078: calling journal_begin on readonly FS") ; - if (reiserfs_dont_log(p_s_sb)) { - th->t_super = p_s_sb ; /* others will check this for the don't log flag */ - return 0 ; - } PROC_INFO_INC( p_s_sb, journal.journal_being ); + /* set here for journal_join */ + th->t_refcount = 1; + th->t_super = p_s_sb ; relock: lock_journal(p_s_sb) ; + journal->j_bcount++; - if (test_bit(WRITERS_BLOCKED, &SB_JOURNAL(p_s_sb)->j_state)) { + if (test_bit(WRITERS_BLOCKED, &journal->j_state)) { unlock_journal(p_s_sb) ; reiserfs_wait_on_write_block(p_s_sb) ; PROC_INFO_INC( p_s_sb, journal.journal_relock_writers ); goto relock ; } + now = get_seconds(); /* if there is no room in the journal OR ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning ** we don't sleep if there aren't other writers */ - if ( (!join && SB_JOURNAL(p_s_sb)->j_must_wait > 0) || - ( !join && (SB_JOURNAL(p_s_sb)->j_len_alloc + nblocks + 2) >= SB_JOURNAL_MAX_BATCH(p_s_sb)) || - (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0 && SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && - (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) || - (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) ) || - (!join && SB_JOURNAL(p_s_sb)->j_cnode_free < (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3))) { + if ( (!join && journal->j_must_wait > 0) || + ( !join && (journal->j_len_alloc + nblocks + 2) >= SB_JOURNAL_MAX_BATCH(p_s_sb)) || + (!join && atomic_read(&journal->j_wcount) > 0 && journal->j_trans_start_time > 0 && + (now - journal->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) || + (!join && atomic_read(&journal->j_jlock)) || + (!join && journal->j_cnode_free < (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3))) { + old_trans_id = journal->j_trans_id; unlock_journal(p_s_sb) ; /* allow others to finish this transaction */ - - /* if writer count is 0, we can just force this transaction to end, and start - ** a new one afterwards. 
- */ - if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) { - struct reiserfs_transaction_handle myth ; - journal_join(&myth, p_s_sb, 1) ; - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; - journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; - do_journal_end(&myth, p_s_sb,1,COMMIT_NOW) ; + + if (!join && (journal->j_len_alloc + nblocks + 2) >= + SB_JOURNAL_MAX_BATCH(p_s_sb) && + ((journal->j_len + nblocks + 2) * 100) < (journal->j_len_alloc * 75)) + { + if (atomic_read(&journal->j_wcount) > 10) { + sched_count++; + queue_log_writer(p_s_sb); + goto relock; + } + } + /* don't mess with joining the transaction if all we have to do is + * wait for someone else to do a commit + */ + if (atomic_read(&journal->j_jlock)) { + while (journal->j_trans_id == old_trans_id && + atomic_read(&journal->j_jlock)) { + queue_log_writer(p_s_sb); + } + goto relock; + } + journal_join(&myth, p_s_sb, 1) ; + + /* someone might have ended the transaction while we joined */ + if (old_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) { + do_journal_end(&myth, p_s_sb, 1, 0) ; } else { - /* but if the writer count isn't zero, we have to wait for the current writers to finish. - ** They won't batch on transaction end once we set j_jlock - */ - atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ; - old_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ; - while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) && - SB_JOURNAL(p_s_sb)->j_trans_id == old_trans_id) { - sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; - } + do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW) ; } + PROC_INFO_INC( p_s_sb, journal.journal_relock_wcount ); goto relock ; } - - if (SB_JOURNAL(p_s_sb)->j_trans_start_time == 0) { /* we are the first writer, set trans_id */ - SB_JOURNAL(p_s_sb)->j_trans_start_time = now ; + /* we are the first writer, set trans_id */ + if (journal->j_trans_start_time == 0) { + journal->j_trans_start_time = get_seconds(); } - atomic_inc(&(SB_JOURNAL(p_s_sb)->j_wcount)) ; - SB_JOURNAL(p_s_sb)->j_len_alloc += nblocks ; + atomic_inc(&(journal->j_wcount)) ; + journal->j_len_alloc += nblocks ; th->t_blocks_logged = 0 ; th->t_blocks_allocated = nblocks ; - th->t_super = p_s_sb ; - th->t_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ; - th->t_caller = "Unknown" ; + th->t_trans_id = journal->j_trans_id ; unlock_journal(p_s_sb) ; - p_s_sb->s_dirt = 1; return 0 ; } +struct reiserfs_transaction_handle * +reiserfs_persistent_transaction(struct super_block *s, int nblocks) { + int ret ; + struct reiserfs_transaction_handle *th ; + + /* if we're nesting into an existing transaction. 
It will be
+    ** persistent on its own
+    */
+    if (reiserfs_transaction_running(s)) {
+      th = current->journal_info ;
+      th->t_refcount++ ;
+      if (th->t_refcount < 2) {
+        BUG() ;
+      }
+      return th ;
+    }
+    th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS, s) ;
+    if (!th)
+      return NULL;
+    ret = journal_begin(th, s, nblocks) ;
+    if (ret) {
+      reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
+      return NULL;
+    }
+    return th ;
+}
+
+int
+reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) {
+  struct super_block *s = th->t_super;
+  int ret;
+  ret = journal_end(th, th->t_super, th->t_blocks_allocated);
+  if (th->t_refcount == 0)
+    reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
+  return ret;
+}
 
 static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
+  struct reiserfs_transaction_handle *cur_th = current->journal_info;
+
+  /* this keeps do_journal_end from NULLing out the current->journal_info
+  ** pointer
+  */
+  th->t_handle_save = cur_th ;
+  if (cur_th && cur_th->t_refcount > 1) {
+    BUG() ;
+  }
   return do_journal_begin_r(th, p_s_sb, nblocks, 1) ;
 }
 
 int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) {
-  return do_journal_begin_r(th, p_s_sb, nblocks, 0) ;
-}
+  struct reiserfs_transaction_handle *cur_th = current->journal_info ;
+  int ret ;
 
-/* not used at all */
-int journal_prepare(struct super_block * p_s_sb, struct buffer_head *bh) {
-  return 0 ;
+  th->t_handle_save = NULL ;
+  if (cur_th) {
+    /* we are nesting into the current transaction */
+    if (cur_th->t_super == p_s_sb) {
+      cur_th->t_refcount++ ;
+      memcpy(th, cur_th, sizeof(*th));
+      if (th->t_refcount <= 1)
+        printk("BAD: refcount <= 1, but journal_info != 0\n");
+      return 0;
+    } else {
+      /* we've ended up with a handle from a different filesystem.
+      ** save it and restore on journal_end.  This should never
+      ** really happen...
+      */
+      reiserfs_warning("clm-2100: nesting into a different FS\n") ;
+      th->t_handle_save = current->journal_info ;
+      current->journal_info = th;
+    }
+  } else {
+    current->journal_info = th;
+  }
+  ret = do_journal_begin_r(th, p_s_sb, nblocks, 0) ;
+  if (current->journal_info != th)
+    BUG() ;
+  return ret ;
 }
 
 /*
@@ -2305,18 +2735,14 @@ int journal_mark_dirty(struct reiserfs_t
   int prepared = 0 ;
 
   PROC_INFO_INC( p_s_sb, journal.mark_dirty );
-  if (reiserfs_dont_log(th->t_super)) {
-    mark_buffer_dirty(bh) ;
-    return 0 ;
-  }
-
   if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
     reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n",
                    th->t_trans_id, SB_JOURNAL(p_s_sb)->j_trans_id);
   }
-  p_s_sb->s_dirt = 1 ;
+  p_s_sb->s_dirt = 1;
 
   prepared = test_and_clear_bit(BH_JPrepared, &bh->b_state) ;
+  clear_bit(BH_JRestore_dirty, &bh->b_state);
   /* already in this transaction, we are done */
   if (buffer_journaled(bh)) {
     PROC_INFO_INC( p_s_sb, journal.mark_dirty_already );
@@ -2327,14 +2753,12 @@ int journal_mark_dirty(struct reiserfs_t
   ** a dirty or journal_dirty or locked buffer to be logged, as some changes
   ** could get to disk too early.  NOT GOOD.
   */
-  if (!prepared || buffer_locked(bh)) {
+  if (!prepared || buffer_locked(bh) || buffer_dirty(bh)) {
     printk("journal-1777: buffer %llu bad state %cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT\n",
            (unsigned long long)bh->b_blocknr, prepared ? ' ' : '!',
            buffer_locked(bh) ? ' ' : '!', buffer_dirty(bh) ? ' ' : '!',
           buffer_journal_dirty(bh) ?
' ' : '!') ; - show_reiserfs_locks() ; } - count_already_incd = clear_prepared_bits(bh) ; if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) { printk("journal-1409: journal_mark_dirty returning because j_wcount was %d\n", atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount))) ; @@ -2353,14 +2777,6 @@ int journal_mark_dirty(struct reiserfs_t mark_buffer_notjournal_dirty(bh) ; } - if (buffer_dirty(bh)) { - clear_buffer_dirty(bh) ; - } - - if (buffer_journaled(bh)) { /* must double check after getting lock */ - goto done ; - } - if (SB_JOURNAL(p_s_sb)->j_len > SB_JOURNAL(p_s_sb)->j_len_alloc) { SB_JOURNAL(p_s_sb)->j_len_alloc = SB_JOURNAL(p_s_sb)->j_len + JOURNAL_PER_BALANCE_CNT ; } @@ -2400,29 +2816,31 @@ int journal_mark_dirty(struct reiserfs_t SB_JOURNAL(p_s_sb)->j_first = cn ; SB_JOURNAL(p_s_sb)->j_last = cn ; } -done: - return 0 ; -} - -/* -** if buffer already in current transaction, do a journal_mark_dirty -** otherwise, just mark it dirty and move on. Used for writes to meta blocks -** that don't need journaling -*/ -int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) { - if (reiserfs_dont_log(th->t_super) || buffer_journaled(bh) || - buffer_journal_dirty(bh)) { - return journal_mark_dirty(th, p_s_sb, bh) ; - } - if (get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_hash_table, bh->b_blocknr)) { - return journal_mark_dirty(th, p_s_sb, bh) ; - } - mark_buffer_dirty(bh) ; return 0 ; } int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { - return do_journal_end(th, p_s_sb, nblocks, 0) ; + if (!current->journal_info && th->t_refcount > 1) + printk("REISER-NESTING: th NULL, refcount %d\n", th->t_refcount); + + th->t_refcount--; + if (th->t_refcount > 0) { + struct reiserfs_transaction_handle *cur_th = current->journal_info ; + + /* we aren't allowed to close a nested transaction on a different + ** filesystem from the one in the task struct + */ + if (cur_th->t_super != th->t_super) + BUG() ; + + if (th != cur_th) { + memcpy(current->journal_info, th, sizeof(*th)); + th->t_trans_id = 0; + } + return 0; + } else { + return do_journal_end(th, p_s_sb, nblocks, 0) ; + } } /* removes from the current transaction, relsing and descrementing any counters. @@ -2464,7 +2882,6 @@ static int remove_from_transaction(struc if (atomic_read(&(bh->b_count)) < 0) { printk("journal-1752: remove from trans, b_count < 0\n") ; } - if (!buffer_locked(bh)) reiserfs_clean_and_file_buffer(bh) ; ret = 1 ; } SB_JOURNAL(p_s_sb)->j_len-- ; @@ -2490,7 +2907,7 @@ static int can_dirty(struct reiserfs_jou int can_dirty = 1 ; /* first test hprev. These are all newer than cn, so any node here - ** with the name block number and dev means this node can't be sent + ** with the same block number and dev means this node can't be sent ** to disk right now. 
*/ while(cur && can_dirty) { @@ -2520,6 +2937,10 @@ static int can_dirty(struct reiserfs_jou */ int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + /* you can sync while nested, very, very bad */ + if (th->t_refcount > 1) { + BUG() ; + } if (SB_JOURNAL(p_s_sb)->j_len == 0) { reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; @@ -2527,88 +2948,62 @@ int journal_end_sync(struct reiserfs_tra return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT) ; } -int show_reiserfs_locks(void) { - - dump_journal_writers() ; - return 0 ; -} - /* -** used to get memory back from async commits that are floating around -** and to reclaim any blocks deleted but unusable because their commits -** haven't hit disk yet. called from bitmap.c -** -** if it starts flushing things, it ors SCHEDULE_OCCURRED into repeat. -** note, this is just if schedule has a chance of occurring. I need to -** change flush_commit_lists to have a repeat parameter too. -** +** writeback the pending async commits to disk */ -void flush_async_commits(struct super_block *p_s_sb) { - int i ; - - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) { - if (i != SB_JOURNAL_LIST_INDEX(p_s_sb)) { - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ; - } - } +static void flush_async_commits(void *p) { + struct super_block *p_s_sb = p; + struct reiserfs_journal_list *jl; + struct list_head *entry; + + lock_kernel(); + if (!list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) { + /* last entry is the youngest, commit it and you get everything */ + entry = SB_JOURNAL(p_s_sb)->j_journal_list.prev; + jl = JOURNAL_LIST_ENTRY(entry); + flush_commit_list(p_s_sb, jl, 1); + } + unlock_kernel(); + atomic_inc(&SB_JOURNAL(p_s_sb)->j_async_throttle); + filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping); + atomic_dec(&SB_JOURNAL(p_s_sb)->j_async_throttle); } /* ** flushes any old transactions to disk ** ends the current transaction if it is too old -** -** also calls flush_journal_list with old_only == 1, which allows me to reclaim -** memory and such from the journal lists whose real blocks are all on disk. -** -** called by sync_dev_journal from buffer.c */ -int flush_old_commits(struct super_block *p_s_sb, int immediate) { - int i ; - int count = 0; - int start ; - time_t now ; - struct reiserfs_transaction_handle th ; - - start = SB_JOURNAL_LIST_INDEX(p_s_sb) ; - now = get_seconds() ; - - /* safety check so we don't flush while we are replaying the log during mount */ - if (SB_JOURNAL_LIST_INDEX(p_s_sb) < 0) { - return 0 ; - } - /* starting with oldest, loop until we get to the start */ - i = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ; - while(i != start) { - if (SB_JOURNAL_LIST(p_s_sb)[i].j_len > 0 && ((now - SB_JOURNAL_LIST(p_s_sb)[i].j_timestamp) > SB_JOURNAL_MAX_COMMIT_AGE(p_s_sb) || - immediate)) { - /* we have to check again to be sure the current transaction did not change */ - if (i != SB_JOURNAL_LIST_INDEX(p_s_sb)) { - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ; - } - } - i = (i + 1) % JOURNAL_LIST_COUNT ; - count++ ; - } - /* now, check the current transaction. 
If there are no writers, and it is too old, finish it, and - ** force the commit blocks to disk - */ - if (!immediate && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 && - SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && - SB_JOURNAL(p_s_sb)->j_len > 0 && - (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) { - journal_join(&th, p_s_sb, 1) ; - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; - journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; - do_journal_end(&th, p_s_sb,1, COMMIT_NOW) ; - } else if (immediate) { /* belongs above, but I wanted this to be very explicit as a special case. If they say to - flush, we must be sure old transactions hit the disk too. */ - journal_join(&th, p_s_sb, 1) ; - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; - journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; - do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ; - } - reiserfs_journal_kupdate(p_s_sb) ; - return 0 ; +int reiserfs_flush_old_commits(struct super_block *p_s_sb) { + time_t now ; + struct reiserfs_transaction_handle th ; + + now = get_seconds(); + /* safety check so we don't flush while we are replaying the log during + * mount + */ + if (list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) { + return 0 ; + } + + /* check the current transaction. If there are no writers, and it is + * too old, finish it, and force the commit blocks to disk + */ + if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 && + SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && + SB_JOURNAL(p_s_sb)->j_len > 0 && + (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > + SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) + { + journal_join(&th, p_s_sb, 1) ; + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; + journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; + + /* we're only being called from kreiserfsd, it makes no sense to do + ** an async commit so that kreiserfsd can do it later + */ + do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ; + } + return p_s_sb->s_dirt; } /* @@ -2629,6 +3024,7 @@ static int check_journal_end(struct reis int flush = flags & FLUSH_ALL ; int commit_now = flags & COMMIT_NOW ; int wait_on_commit = flags & WAIT ; + struct reiserfs_journal_list *jl; if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) { reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", @@ -2645,13 +3041,7 @@ static int check_journal_end(struct reis ** care of in this trans */ if (SB_JOURNAL(p_s_sb)->j_len == 0) { - int wcount = atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) ; - unlock_journal(p_s_sb) ; - if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) > 0 && wcount <= 0) { - atomic_dec(&(SB_JOURNAL(p_s_sb)->j_jlock)) ; - wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; - } - return 0 ; + BUG(); } /* if wcount > 0, and we are called to with flush or commit_now, ** we wait on j_join_wait. 
We will wake up when the last writer has @@ -2661,24 +3051,37 @@ static int check_journal_end(struct reis */ if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0) { if (flush || commit_now) { - int orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ; + unsigned trans_id ; + + jl = SB_JOURNAL(p_s_sb)->j_current_jl; + trans_id = jl->j_trans_id; + atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ; if (flush) { SB_JOURNAL(p_s_sb)->j_next_full_flush = 1 ; } unlock_journal(p_s_sb) ; + /* sleep while the current transaction is still j_jlocked */ - while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) && - SB_JOURNAL(p_s_sb)->j_trans_id == th->t_trans_id) { - sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; - } - if (commit_now) { - if (wait_on_commit) { - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ; - } else { - commit_flush_async(p_s_sb, orig_jindex) ; + while(SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) { + if (atomic_read(&SB_JOURNAL(p_s_sb)->j_jlock)) { + queue_log_writer(p_s_sb); + } else { + lock_journal(p_s_sb); + if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) { + atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ; + } + unlock_journal(p_s_sb); } } + if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) { + BUG(); + } + if (commit_now && journal_list_still_alive(p_s_sb, trans_id) && + wait_on_commit) + { + flush_commit_list(p_s_sb, jl, 1) ; + } return 0 ; } unlock_journal(p_s_sb) ; @@ -2686,7 +3089,7 @@ static int check_journal_end(struct reis } /* deal with old transactions where we are the last writers */ - now = get_seconds() ; + now = get_seconds(); if ((now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) { commit_now = 1 ; SB_JOURNAL(p_s_sb)->j_next_async_flush = 1 ; @@ -2726,25 +3129,21 @@ int journal_mark_freed(struct reiserfs_t struct buffer_head *bh = NULL ; struct reiserfs_list_bitmap *jb = NULL ; int cleaned = 0 ; - - if (reiserfs_dont_log(th->t_super)) { - bh = sb_find_get_block(p_s_sb, blocknr) ; - if (bh && buffer_dirty (bh)) { - printk ("journal_mark_freed(dont_log): dirty buffer on hash list: %lx %d\n", bh->b_state, blocknr); - BUG (); - } - brelse (bh); - return 0 ; + + cn = get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_hash_table, blocknr); + if (cn && cn->bh) { + bh = cn->bh ; + get_bh(bh) ; } - bh = sb_find_get_block(p_s_sb, blocknr) ; /* if it is journal new, we just remove it from this transaction */ if (bh && buffer_journal_new(bh)) { mark_buffer_notjournal_new(bh) ; clear_prepared_bits(bh) ; + reiserfs_clean_and_file_buffer(bh) ; cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ; } else { /* set the bit for this block in the journal bitmap for this transaction */ - jb = SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap ; + jb = SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap; if (!jb) { reiserfs_panic(p_s_sb, "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n") ; } @@ -2754,6 +3153,7 @@ int journal_mark_freed(struct reiserfs_t if (bh) { clear_prepared_bits(bh) ; + reiserfs_clean_and_file_buffer(bh) ; } cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ; @@ -2785,7 +3185,6 @@ int journal_mark_freed(struct reiserfs_t } if (bh) { - reiserfs_clean_and_file_buffer(bh) ; put_bh(bh) ; /* get_hash grabs the buffer */ if (atomic_read(&(bh->b_count)) < 0) { printk("journal-2165: bh->b_count < 0\n") ; @@ -2795,50 +3194,98 @@ int journal_mark_freed(struct reiserfs_t } void reiserfs_update_inode_transaction(struct inode *inode) { - - REISERFS_I(inode)->i_trans_index = 
SB_JOURNAL_LIST_INDEX(inode->i_sb); - + REISERFS_I(inode)->i_jl = SB_JOURNAL(inode->i_sb)->j_current_jl; REISERFS_I(inode)->i_trans_id = SB_JOURNAL(inode->i_sb)->j_trans_id ; } -static int reiserfs_inode_in_this_transaction(struct inode *inode) { - if (REISERFS_I(inode)->i_trans_id == SB_JOURNAL(inode->i_sb)->j_trans_id || - REISERFS_I(inode)->i_trans_id == 0) { - return 1; - } - return 0 ; +/* + * returns -1 on error, 0 if no commits/barriers were done and 1 + * if a transaction was actually committed and the barrier was done + */ +static int __commit_trans_jl(struct inode *inode, unsigned long id, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_transaction_handle th ; + struct super_block *sb = inode->i_sb ; + int ret = 0; + + /* is it from the current transaction, or from an unknown transaction? */ + if (id == SB_JOURNAL(sb)->j_trans_id) { + jl = SB_JOURNAL(sb)->j_current_jl; + /* try to let other writers come in and grow this transaction */ + let_transaction_grow(sb, id); + if (SB_JOURNAL(sb)->j_trans_id != id) { + goto flush_commit_only; + } + + journal_begin(&th, sb, 1) ; + + /* someone might have ended this transaction while we joined */ + if (SB_JOURNAL(sb)->j_trans_id != id) { + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1) ; + journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)) ; + journal_end(&th, sb, 1) ; + goto flush_commit_only; + } + + journal_end_sync(&th, sb, 1) ; + ret = 1; + + } else { + /* this gets tricky, we have to make sure the journal list in + * the inode still exists. We know the list is still around + * if we've got a larger transaction id than the oldest list + */ +flush_commit_only: + if (journal_list_still_alive(inode->i_sb, id)) { + /* + * we only set ret to 1 when we know for sure + * the barrier hasn't been started yet on the commit + * block. + */ + if (atomic_read(&jl->j_commit_left) > 1) + ret = 1; + flush_commit_list(sb, jl, 1) ; + } + } + /* otherwise the list is gone, and long since committed */ + return ret; } -void reiserfs_commit_for_inode(struct inode *inode) { - struct reiserfs_journal_list *jl ; - struct reiserfs_transaction_handle th ; - struct super_block *sb = inode->i_sb ; - - jl = SB_JOURNAL_LIST(sb) + REISERFS_I(inode)->i_trans_index ; - - /* is it from the current transaction, or from an unknown transaction? */ - if (reiserfs_inode_in_this_transaction(inode)) { - journal_join(&th, sb, 1) ; - reiserfs_update_inode_transaction(inode) ; - journal_end_sync(&th, sb, 1) ; - } else if (jl->j_trans_id == REISERFS_I(inode)->i_trans_id) { - flush_commit_list(sb, jl, 1) ; - } - /* if the transaction id does not match, this list is long since flushed - ** and we don't have to do anything here - */ +int reiserfs_commit_for_inode(struct inode *inode) { + unsigned long id = REISERFS_I(inode)->i_trans_id; + struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; + + /* for the whole inode, assume unset id means it was + * changed in the current transaction. 
More conservative + */ + if (!id || !jl) { + reiserfs_update_inode_transaction(inode) ; + id = REISERFS_I(inode)->i_trans_id; + /* jl will be updated in __commit_trans_jl */ + } + + return __commit_trans_jl(inode, id, jl); } void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, struct buffer_head *bh) { - PROC_INFO_INC( p_s_sb, journal.restore_prepared ); - if (reiserfs_dont_log (p_s_sb)) - return; - - if (!bh) { - return ; - } - clear_bit(BH_JPrepared, &bh->b_state) ; + PROC_INFO_INC( p_s_sb, journal.restore_prepared ); + if (!bh) { + return ; + } + if (test_and_clear_bit(BH_JRestore_dirty, &bh->b_state) && + buffer_journal_dirty(bh)) { + struct reiserfs_journal_cnode *cn; + cn = get_journal_hash_dev(p_s_sb, + SB_JOURNAL(p_s_sb)->j_list_hash_table, + bh->b_blocknr); + if (cn && can_dirty(cn)) { + set_bit(BH_JTest, &bh->b_state); + mark_buffer_dirty(bh); + } + } + clear_bit(BH_JPrepared, &bh->b_state) ; } extern struct tree_balance *cur_tb ; @@ -2849,29 +3296,39 @@ extern struct tree_balance *cur_tb ; ** wait on it. ** */ -void reiserfs_prepare_for_journal(struct super_block *p_s_sb, +int reiserfs_prepare_for_journal(struct super_block *p_s_sb, struct buffer_head *bh, int wait) { - int retry_count = 0 ; - PROC_INFO_INC( p_s_sb, journal.prepare ); - if (reiserfs_dont_log (p_s_sb)) - return; - while(!test_bit(BH_JPrepared, &bh->b_state) || - (wait && buffer_locked(bh))) { - if (buffer_journaled(bh)) { - set_bit(BH_JPrepared, &bh->b_state) ; - return ; - } - set_bit(BH_JPrepared, &bh->b_state) ; - if (wait) { - RFALSE( buffer_locked(bh) && cur_tb != NULL, - "waiting while do_balance was running\n") ; - wait_on_buffer(bh) ; + if (test_set_buffer_locked(bh)) { + if (!wait) + return 0; + lock_buffer(bh); + } + set_bit(BH_JPrepared, &bh->b_state); + if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) { + clear_bit(BH_JTest, &bh->b_state); + set_bit(BH_JRestore_dirty, &bh->b_state); + } + unlock_buffer(bh); + return 1; +} + +static void flush_old_journal_lists(struct super_block *s) { + struct reiserfs_journal_list *jl; + struct list_head *entry; + time_t now = get_seconds(); + + while(!list_empty(&SB_JOURNAL(s)->j_journal_list)) { + entry = SB_JOURNAL(s)->j_journal_list.next; + jl = JOURNAL_LIST_ENTRY(entry); + /* this check should always be run, to send old lists to disk */ + if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) { + flush_used_journal_lists(s, jl); + } else { + break; + } } - PROC_INFO_INC( p_s_sb, journal.prepare_retry ); - retry_count++ ; - } } /* @@ -2890,19 +3347,24 @@ static int do_journal_end(struct reiserf struct buffer_head *c_bh ; /* commit bh */ struct buffer_head *d_bh ; /* desc bh */ int cur_write_start = 0 ; /* start index of current log write */ - int cur_blocks_left = 0 ; /* number of journal blocks left to write */ int old_start ; int i ; - int jindex ; - int orig_jindex ; int flush = flags & FLUSH_ALL ; - int commit_now = flags & COMMIT_NOW ; int wait_on_commit = flags & WAIT ; - struct reiserfs_super_block *rs ; - int trans_half ; + struct reiserfs_journal_list *jl, *temp_jl; + struct list_head *entry, *safe; + unsigned long jindex; + unsigned long commit_trans_id; + int trans_half; - if (reiserfs_dont_log(th->t_super)) { - return 0 ; + if (th->t_refcount > 1) + BUG() ; + + current->journal_info = th->t_handle_save; + reiserfs_check_lock_depth("journal end"); + if (SB_JOURNAL(p_s_sb)->j_len == 0) { + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; + journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; } 
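+  /* note: an empty transaction would trip the BUG() in check_journal_end,
+  ** so the super block buffer is logged above as filler; every
+  ** journal_end then has at least one block to commit.
+  */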
lock_journal(p_s_sb) ; @@ -2911,24 +3373,25 @@ static int do_journal_end(struct reiserf flush = 1 ; } if (SB_JOURNAL(p_s_sb)->j_next_async_flush) { - flags |= COMMIT_NOW ; - commit_now = 1 ; + flags |= COMMIT_NOW | WAIT; + wait_on_commit = 1; } /* check_journal_end locks the journal, and unlocks if it does not return 1 ** it tells us if we should continue with the journal_end, or just return */ if (!check_journal_end(th, p_s_sb, nblocks, flags)) { - return 0 ; + p_s_sb->s_dirt = 1; + wake_queued_writers(p_s_sb); + reiserfs_async_progress_wait(p_s_sb); + goto out ; } /* check_journal_end might set these, check again */ if (SB_JOURNAL(p_s_sb)->j_next_full_flush) { flush = 1 ; } - if (SB_JOURNAL(p_s_sb)->j_next_async_flush) { - commit_now = 1 ; - } + /* ** j must wait means we have to flush the log blocks, and the real blocks for ** this transaction @@ -2938,14 +3401,16 @@ static int do_journal_end(struct reiserf } #ifdef REISERFS_PREALLOCATE + /* quota ops might need to nest, setup the journal_info pointer for them */ + current->journal_info = th ; reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into * the transaction */ + current->journal_info = th->t_handle_save ; #endif - rs = SB_DISK_SUPER_BLOCK(p_s_sb) ; /* setup description block */ d_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_JOURNAL(p_s_sb)->j_start) ; - set_buffer_uptodate(d_bh) ; + set_buffer_uptodate(d_bh); desc = (struct reiserfs_journal_desc *)(d_bh)->b_data ; memset(d_bh->b_data, 0, d_bh->b_size) ; memcpy(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8) ; @@ -2960,28 +3425,33 @@ static int do_journal_end(struct reiserf set_buffer_uptodate(c_bh) ; /* init this journal list */ - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_older_commits_done), 0) ; - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ; - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ; - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_bh = c_bh ; - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_start = SB_JOURNAL(p_s_sb)->j_start ; - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len = SB_JOURNAL(p_s_sb)->j_len ; - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_nonzerolen), SB_JOURNAL(p_s_sb)->j_len) ; - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_left), SB_JOURNAL(p_s_sb)->j_len + 2); - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = NULL ; - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ; - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ; - - /* which is faster, locking/unlocking at the start and end of the for - ** or locking once per iteration around the insert_journal_hash? - ** eitherway, we are write locking insert_journal_hash. The ENTIRE FOR - ** LOOP MUST not cause schedule to occur. 
- */ + jl = SB_JOURNAL(p_s_sb)->j_current_jl; + + /* we lock the commit before doing anything because + * we want to make sure nobody tries to run flush_commit_list until + * the new transaction is fully setup, and we've already flushed the + * ordered bh list + */ + down(&jl->j_commit_lock); + + /* save the transaction id in case we need to commit it later */ + commit_trans_id = jl->j_trans_id; + + atomic_set(&jl->j_older_commits_done, 0) ; + jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ; + jl->j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ; + jl->j_commit_bh = c_bh ; + jl->j_start = SB_JOURNAL(p_s_sb)->j_start ; + jl->j_len = SB_JOURNAL(p_s_sb)->j_len ; + atomic_set(&jl->j_nonzerolen, SB_JOURNAL(p_s_sb)->j_len) ; + atomic_set(&jl->j_commit_left, SB_JOURNAL(p_s_sb)->j_len + 2); + jl->j_realblock = NULL ; - /* for each real block, add it to the journal list hash, + /* The ENTIRE FOR LOOP MUST not cause schedule to occur. + ** for each real block, add it to the journal list hash, ** copy into real block index array in the commit or desc block */ - trans_half = journal_trans_half(p_s_sb->s_blocksize) ; + trans_half = journal_trans_half(p_s_sb->s_blocksize); for (i = 0, cn = SB_JOURNAL(p_s_sb)->j_first ; cn ; cn = cn->next, i++) { if (test_bit(BH_JDirty, &cn->bh->b_state) ) { jl_cn = get_cnode(p_s_sb) ; @@ -2989,7 +3459,7 @@ static int do_journal_end(struct reiserf reiserfs_panic(p_s_sb, "journal-1676, get_cnode returned NULL\n") ; } if (i == 0) { - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = jl_cn ; + jl->j_realblock = jl_cn ; } jl_cn->prev = last_cn ; jl_cn->next = NULL ; @@ -3005,9 +3475,9 @@ static int do_journal_end(struct reiserf } jl_cn->blocknr = cn->bh->b_blocknr ; jl_cn->state = 0 ; - jl_cn->sb = p_s_sb ; + jl_cn->sb = p_s_sb; jl_cn->bh = cn->bh ; - jl_cn->jlist = SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb) ; + jl_cn->jlist = jl; insert_journal_hash(SB_JOURNAL(p_s_sb)->j_list_hash_table, jl_cn) ; if (i < trans_half) { desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr) ; @@ -3018,7 +3488,6 @@ static int do_journal_end(struct reiserf i-- ; } } - set_desc_trans_len(desc, SB_JOURNAL(p_s_sb)->j_len) ; set_desc_mount_id(desc, SB_JOURNAL(p_s_sb)->j_mount_id) ; set_desc_trans_id(desc, SB_JOURNAL(p_s_sb)->j_trans_id) ; @@ -3026,53 +3495,35 @@ static int do_journal_end(struct reiserf /* special check in case all buffers in the journal were marked for not logging */ if (SB_JOURNAL(p_s_sb)->j_len == 0) { - brelse(d_bh) ; - brelse(c_bh) ; - unlock_journal(p_s_sb) ; - printk("journal-2020: do_journal_end: BAD desc->j_len is ZERO\n") ; - atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ; - wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; - return 0 ; + BUG(); } + /* we're about to dirty all the log blocks, mark the description block + * dirty now too. Don't mark the commit block dirty until all the + * others are on disk + */ + mark_buffer_dirty(d_bh); + /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */ cur_write_start = SB_JOURNAL(p_s_sb)->j_start ; - cur_blocks_left = SB_JOURNAL(p_s_sb)->j_len ; cn = SB_JOURNAL(p_s_sb)->j_first ; jindex = 1 ; /* start at one so we don't get the desc again */ - while(cur_blocks_left > 0) { + while(cn) { + clear_bit(BH_JNew, &(cn->bh->b_state)) ; /* copy all the real blocks into log area. 
dirty log blocks */ if (test_bit(BH_JDirty, &cn->bh->b_state)) { struct buffer_head *tmp_bh ; tmp_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ; - set_buffer_uptodate(tmp_bh) ; + set_buffer_uptodate(tmp_bh); memcpy(tmp_bh->b_data, cn->bh->b_data, cn->bh->b_size) ; + mark_buffer_dirty(tmp_bh); jindex++ ; - } else { - /* JDirty cleared sometime during transaction. don't log this one */ - printk("journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!\n") ; - } - cn = cn->next ; - cur_blocks_left-- ; - } - - /* we are done with both the c_bh and d_bh, but - ** c_bh must be written after all other commit blocks, - ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. - */ - - /* now loop through and mark all buffers from this transaction as JDirty_wait - ** clear the JDirty bit, clear BH_JNew too. - ** if they weren't JDirty, they weren't logged, just relse them and move on - */ - cn = SB_JOURNAL(p_s_sb)->j_first ; - while(cn) { - clear_bit(BH_JNew, &(cn->bh->b_state)) ; - if (test_bit(BH_JDirty, &(cn->bh->b_state))) { set_bit(BH_JDirty_wait, &(cn->bh->b_state)) ; clear_bit(BH_JDirty, &(cn->bh->b_state)) ; } else { + /* JDirty cleared sometime during transaction. don't log this one */ + reiserfs_warning("journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!\n") ; brelse(cn->bh) ; } next = cn->next ; @@ -3080,30 +3531,17 @@ static int do_journal_end(struct reiserf cn = next ; } - /* unlock the journal list for committing and flushing */ - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 0) ; - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 0) ; - - orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ; - jindex = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ; - SB_JOURNAL_LIST_INDEX(p_s_sb) = jindex ; - - /* write any buffers that must hit disk before this commit is done */ - fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock), - &(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ; + /* we are done with both the c_bh and d_bh, but + ** c_bh must be written after all other commit blocks, + ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. 
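** Writing the commit block only after every other block of the
** transaction is on disk is what makes the log atomic: journal replay
** accepts a transaction only when its commit block matches the
** description block, so a crash before that final write simply makes
** the whole transaction disappear.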
+ */ - /* honor the flush and async wishes from the caller */ - if (flush) { - - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ; - flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex , 1) ; - } else if (commit_now) { - if (wait_on_commit) { - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ; - } else { - commit_flush_async(p_s_sb, orig_jindex) ; - } - } + SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb); + + /* now it is safe to insert this transaction on the main list */ + list_add_tail(&jl->j_list, &SB_JOURNAL(p_s_sb)->j_journal_list); + list_add_tail(&jl->j_working_list, &SB_JOURNAL(p_s_sb)->j_working_list); + SB_JOURNAL(p_s_sb)->j_num_work_lists++; /* reset journal values for the next transaction */ old_start = SB_JOURNAL(p_s_sb)->j_start ; @@ -3115,57 +3553,108 @@ static int do_journal_end(struct reiserf SB_JOURNAL(p_s_sb)->j_len = 0 ; SB_JOURNAL(p_s_sb)->j_trans_start_time = 0 ; SB_JOURNAL(p_s_sb)->j_trans_id++ ; + SB_JOURNAL(p_s_sb)->j_current_jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id; SB_JOURNAL(p_s_sb)->j_must_wait = 0 ; SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ; SB_JOURNAL(p_s_sb)->j_next_full_flush = 0 ; SB_JOURNAL(p_s_sb)->j_next_async_flush = 0 ; init_journal_hash(p_s_sb) ; + // make sure reiserfs_add_jh sees the new current_jl before we + // write out the tails + smp_mb(); + + /* tail conversion targets have to hit the disk before we end the + * transaction. Otherwise a later transaction might repack the tail + * before this transaction commits, leaving the data block unflushed and + * clean, if we crash before the later transaction commits, the data block + * is lost. + */ + if (!list_empty(&jl->j_tail_bh_list)) { + unlock_kernel(); + write_ordered_buffers(&SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock, + SB_JOURNAL(p_s_sb), jl, &jl->j_tail_bh_list); + lock_kernel(); + } + if (!list_empty(&jl->j_tail_bh_list)) + BUG(); + up(&jl->j_commit_lock); + + /* honor the flush wishes from the caller, simple commits can + ** be done outside the journal lock, they are done below + ** + ** if we don't flush the commit list right now, we put it into + ** the work queue so the people waiting on the async progress work + ** queue don't wait for this proc to flush journal lists and such. + */ + if (flush) { + flush_commit_list(p_s_sb, jl, 1) ; + flush_journal_list(p_s_sb, jl, 1) ; + } else + queue_work(commit_wq, &SB_JOURNAL(p_s_sb)->j_work); + + /* if the next transaction has any chance of wrapping, flush ** transactions that might get overwritten. If any journal lists are very ** old flush them as well. 
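** (as a worked example with assumed numbers: for an 8192-block on-disk
** journal, j_start == 8000 and a trans_max of 1024 give
** (8000 + 1024 + 1) % 8192 == 833, so any list starting in [8000,8191]
** or in the wrapped range [0,833] must be flushed first)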
 */
-  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
-    jindex = i ;
-    if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && SB_JOURNAL(p_s_sb)->j_start <= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
-      if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
-	flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ;
-      }
-    } else if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 &&
-	       (SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
-      if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >=
-	  SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
-	flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ;
+first_jl:
+    list_for_each_safe(entry, safe, &SB_JOURNAL(p_s_sb)->j_journal_list) {
+	temp_jl = JOURNAL_LIST_ENTRY(entry);
+	if (SB_JOURNAL(p_s_sb)->j_start <= temp_jl->j_start) {
+	    if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >=
+	        temp_jl->j_start)
+	    {
+		flush_used_journal_lists(p_s_sb, temp_jl);
+		goto first_jl;
+	    } else if ((SB_JOURNAL(p_s_sb)->j_start +
+			SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) <
+			SB_ONDISK_JOURNAL_SIZE(p_s_sb))
+	    {
+		/* if we don't cross into the next transaction and we don't
+		 * wrap, there is no way we can overlap any later transactions
+		 * break now
+		 */
+		break;
+	    }
+	} else if ((SB_JOURNAL(p_s_sb)->j_start +
+		    SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >
+		    SB_ONDISK_JOURNAL_SIZE(p_s_sb))
+	{
+	    if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) %
+		  SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= temp_jl->j_start)
+	    {
+		flush_used_journal_lists(p_s_sb, temp_jl);
+		goto first_jl;
+	    } else {
+		/* we don't overlap anything from our start to the end of the
+		 * log, and our wrapped portion doesn't overlap anything at
+		 * the start of the log. We can break
+		 */
+		break;
+	    }
-    }
-    /* this check should always be run, to send old lists to disk */
-    if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 &&
-	SB_JOURNAL_LIST(p_s_sb)[jindex].j_timestamp <
-	(get_seconds() - (SB_JOURNAL_MAX_TRANS_AGE(p_s_sb) * 4))) {
-      flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ;
 	}
 }
+    flush_old_journal_lists(p_s_sb);
-  /* if the next journal_list is still in use, flush it */
-  if (SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len != 0) {
-    flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb), 1) ;
-  }
-
-  /* we don't want anyone flushing the new transaction's list */
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb) +
-						 SB_JOURNAL_LIST_INDEX(p_s_sb)) ;
+    SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL(p_s_sb)->j_current_jl) ;
-  if (!(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap)) {
+    if (!(SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap)) {
     reiserfs_panic(p_s_sb, "journal-1996: do_journal_end, could not get a list bitmap\n") ;
   }
-  unlock_journal(p_s_sb) ;
+    atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
+    unlock_journal(p_s_sb) ;	/* wake up any body waiting to join.
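** Note that the wait_on_commit case below must recheck
** journal_list_still_alive(): once the journal lock is dropped, the
** commit work queue may already have flushed and freed jl, which is
** why commit_trans_id was saved while j_commit_lock was still held.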
*/ + clear_bit(WRITERS_QUEUED, &SB_JOURNAL(p_s_sb)->j_state); wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; + + if (!flush && wait_on_commit && + journal_list_still_alive(p_s_sb, commit_trans_id)) { + flush_commit_list(p_s_sb, jl, 1) ; + } +out: + reiserfs_check_lock_depth("journal end2"); + th->t_trans_id = 0; return 0 ; } - - - diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/Makefile linux-2.6.5-rc1-mm2/fs/reiserfs/Makefile --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/Makefile 2004-03-11 03:55:54.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/Makefile 2004-03-19 16:10:24.020964439 +0100 @@ -9,6 +9,18 @@ reiserfs-objs := bitmap.o do_balan.o nam hashes.o tail_conversion.o journal.o resize.o \ item_ops.o ioctl.o procfs.o +ifeq ($(CONFIG_REISERFS_FS_XATTR),y) +reiserfs-objs += xattr.o xattr_user.o xattr_trusted.o +endif + +ifeq ($(CONFIG_REISERFS_FS_SECURITY),y) +reiserfs-objs += xattr_security.o +endif + +ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y) +reiserfs-objs += xattr_acl.o +endif + # gcc -O2 (the kernel default) is overaggressive on ppc32 when many inline # functions are used. This causes the compiler to advance the stack # pointer out of the available stack space, corrupting kernel space, diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/namei.c linux-2.6.5-rc1-mm2/fs/reiserfs/namei.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/namei.c 2004-03-11 03:55:29.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/namei.c 2004-03-19 16:10:24.014965085 +0100 @@ -15,7 +15,10 @@ #include #include #include +#include +#include #include +#include #define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { i->i_nlink++; if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; } #define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) i->i_nlink--; @@ -331,11 +334,24 @@ static struct dentry * reiserfs_lookup ( retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path_to_entry, &de); pathrelse (&path_to_entry); if (retval == NAME_FOUND) { + /* Hide the .reiserfs_priv directory */ + if (reiserfs_xattrs (dir->i_sb) && + !old_format_only(dir->i_sb) && + REISERFS_SB(dir->i_sb)->priv_root && + REISERFS_SB(dir->i_sb)->priv_root->d_inode && + de.de_objectid == le32_to_cpu (INODE_PKEY(REISERFS_SB(dir->i_sb)->priv_root->d_inode)->k_objectid)) { + return ERR_PTR (-EACCES); + } + inode = reiserfs_iget (dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); if (!inode || IS_ERR(inode)) { reiserfs_write_unlock(dir->i_sb); return ERR_PTR(-EACCES); } + + /* Propogate the priv_object flag so we know we're in the priv tree */ + if (is_reiserfs_priv_object (dir)) + REISERFS_I(inode)->i_flags |= i_priv_object; } reiserfs_write_unlock(dir->i_sb); if ( retval == IO_ERROR ) { @@ -504,7 +520,7 @@ static int reiserfs_add_entry (struct re } /* perform the insertion of the entry that we have prepared */ - retval = reiserfs_paste_into_item (th, &path, &entry_key, buffer, paste_size); + retval = reiserfs_paste_into_item (th, &path, &entry_key, dir, buffer, paste_size); if (buffer != small_buf) reiserfs_kfree (buffer, buflen, dir->i_sb); if (retval) { @@ -513,7 +529,6 @@ static int reiserfs_add_entry (struct re } dir->i_size += paste_size; - dir->i_blocks = ((dir->i_size + 511) >> 9); dir->i_mtime = dir->i_ctime = CURRENT_TIME; if (!S_ISDIR (inode->i_mode) && visible) // reiserfs_mkdir or reiserfs_rename will do that by itself @@ -529,7 +544,9 @@ static int reiserfs_add_entry (struct re ** inserted into the tree yet. 
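** DQUOT_DROP() below gives back the inode charge taken by
** DQUOT_ALLOC_INODE() in new_inode_init(), and S_NOQUOTA keeps the
** final iput() from touching quota again for this half-built inode.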
*/ static int drop_new_inode(struct inode *inode) { + DQUOT_DROP(inode); make_bad_inode(inode) ; + inode->i_flags |= S_NOQUOTA; iput(inode) ; return 0 ; } @@ -555,6 +572,11 @@ static int new_inode_init(struct inode * } else { inode->i_gid = current->fsgid; } + DQUOT_INIT(inode); + if (DQUOT_ALLOC_INODE(inode)) { + drop_new_inode(inode); + return -EDQUOT; + } return 0 ; } @@ -565,6 +587,7 @@ static int reiserfs_create (struct inode struct inode * inode; int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 ; struct reiserfs_transaction_handle th ; + int locked; if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM ; @@ -573,10 +596,19 @@ static int reiserfs_create (struct inode if (retval) return retval; + locked = reiserfs_cache_default_acl (dir); + reiserfs_write_lock(dir->i_sb); + + if (locked) + reiserfs_write_lock_xattrs (dir->i_sb); + journal_begin(&th, dir->i_sb, jbegin_count) ; - th.t_caller = "create" ; retval = reiserfs_new_inode (&th, dir, mode, 0, 0/*i_size*/, dentry, inode); + + if (locked) + reiserfs_write_unlock_xattrs (dir->i_sb); + if (retval) { goto out_failed; } @@ -612,6 +644,7 @@ static int reiserfs_mknod (struct inode struct inode * inode; struct reiserfs_transaction_handle th ; int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; + int locked; if (!new_valid_dev(rdev)) return -EINVAL; @@ -623,15 +656,25 @@ static int reiserfs_mknod (struct inode if (retval) return retval; + locked = reiserfs_cache_default_acl (dir); + reiserfs_write_lock(dir->i_sb); + + if (locked) + reiserfs_write_lock_xattrs (dir->i_sb); + journal_begin(&th, dir->i_sb, jbegin_count) ; retval = reiserfs_new_inode (&th, dir, mode, 0, 0/*i_size*/, dentry, inode); + + if (locked) + reiserfs_write_unlock_xattrs (dir->i_sb); + if (retval) { goto out_failed; } - init_special_inode(inode, mode, rdev) ; + init_special_inode(inode, inode->i_mode, rdev) ; //FIXME: needed for block and char devices only reiserfs_update_sd (&th, inode); @@ -664,6 +707,7 @@ static int reiserfs_mkdir (struct inode struct inode * inode; struct reiserfs_transaction_handle th ; int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; + int locked; #ifdef DISPLACE_NEW_PACKING_LOCALITIES /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ @@ -677,7 +721,11 @@ static int reiserfs_mkdir (struct inode if (retval) return retval; + locked = reiserfs_cache_default_acl (dir); + reiserfs_write_lock(dir->i_sb); + if (locked) + reiserfs_write_lock_xattrs (dir->i_sb); journal_begin(&th, dir->i_sb, jbegin_count) ; /* inc the link count now, so another writer doesn't overflow it while @@ -689,6 +737,9 @@ static int reiserfs_mkdir (struct inode old_format_only (dir->i_sb) ? 
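/* old format dirs pack "." and ".." without alignment, so their empty size is smaller */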
EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, dentry, inode); + if (locked) + reiserfs_write_unlock_xattrs (dir->i_sb); + if (retval) { dir->i_nlink-- ; goto out_failed; @@ -738,7 +789,6 @@ static int reiserfs_rmdir (struct inode { int retval; struct inode * inode; - int windex ; struct reiserfs_transaction_handle th ; int jbegin_count; INITIALIZE_PATH (path); @@ -750,7 +800,6 @@ static int reiserfs_rmdir (struct inode reiserfs_write_lock(dir->i_sb); journal_begin(&th, dir->i_sb, jbegin_count) ; - windex = push_journal_writer("reiserfs_rmdir") ; de.de_gen_number_bit_string = 0; if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) { @@ -793,13 +842,11 @@ static int reiserfs_rmdir (struct inode DEC_DIR_INODE_NLINK(dir) dir->i_size -= (DEH_SIZE + de.de_entrylen); - dir->i_blocks = ((dir->i_size + 511) >> 9); reiserfs_update_sd (&th, dir); /* prevent empty directory from getting lost */ add_save_link (&th, inode, 0/* not truncate */); - pop_journal_writer(windex) ; journal_end(&th, dir->i_sb, jbegin_count) ; reiserfs_check_path(&path) ; reiserfs_write_unlock(dir->i_sb); @@ -810,7 +857,6 @@ static int reiserfs_rmdir (struct inode reiserfs_cut_from_item, or reiserfs_cut_from_item does not release path if operation was not complete */ pathrelse (&path); - pop_journal_writer(windex) ; journal_end(&th, dir->i_sb, jbegin_count) ; reiserfs_write_unlock(dir->i_sb); return retval; @@ -822,7 +868,6 @@ static int reiserfs_unlink (struct inode struct inode * inode; struct reiserfs_dir_entry de; INITIALIZE_PATH (path); - int windex ; struct reiserfs_transaction_handle th ; int jbegin_count; unsigned long savelink; @@ -835,7 +880,6 @@ static int reiserfs_unlink (struct inode reiserfs_write_lock(dir->i_sb); journal_begin(&th, dir->i_sb, jbegin_count) ; - windex = push_journal_writer("reiserfs_unlink") ; de.de_gen_number_bit_string = 0; if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) { @@ -880,7 +924,6 @@ static int reiserfs_unlink (struct inode reiserfs_update_sd (&th, inode); dir->i_size -= (de.de_entrylen + DEH_SIZE); - dir->i_blocks = ((dir->i_size + 511) >> 9); dir->i_ctime = dir->i_mtime = CURRENT_TIME; reiserfs_update_sd (&th, dir); @@ -888,7 +931,6 @@ static int reiserfs_unlink (struct inode /* prevent file from getting lost */ add_save_link (&th, inode, 0/* not truncate */); - pop_journal_writer(windex) ; journal_end(&th, dir->i_sb, jbegin_count) ; reiserfs_check_path(&path) ; reiserfs_write_unlock(dir->i_sb); @@ -896,7 +938,6 @@ static int reiserfs_unlink (struct inode end_unlink: pathrelse (&path); - pop_journal_writer(windex) ; journal_end(&th, dir->i_sb, jbegin_count) ; reiserfs_check_path(&path) ; reiserfs_write_unlock(dir->i_sb); @@ -939,6 +980,8 @@ static int reiserfs_symlink (struct inod memcpy (name, symname, strlen (symname)); padd_item (name, item_len, strlen (symname)); + /* We would inherit the default ACL here, but symlinks don't get ACLs */ + journal_begin(&th, parent_dir->i_sb, jbegin_count) ; retval = reiserfs_new_inode (&th, parent_dir, mode, name, strlen (symname), @@ -951,7 +994,7 @@ static int reiserfs_symlink (struct inod reiserfs_update_inode_transaction(inode) ; reiserfs_update_inode_transaction(parent_dir) ; - inode->i_op = &page_symlink_inode_operations; + inode->i_op = &reiserfs_symlink_inode_operations; inode->i_mapping->a_ops = &reiserfs_address_space_operations; // must be sure this inode is written with this transaction @@ -979,7 +1022,6 @@ static int 
reiserfs_link (struct dentry { int retval; struct inode *inode = old_dentry->d_inode; - int windex ; struct reiserfs_transaction_handle th ; int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; @@ -997,7 +1039,6 @@ static int reiserfs_link (struct dentry inode->i_nlink++; journal_begin(&th, dir->i_sb, jbegin_count) ; - windex = push_journal_writer("reiserfs_link") ; /* create new entry */ retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, @@ -1008,7 +1049,6 @@ static int reiserfs_link (struct dentry if (retval) { inode->i_nlink--; - pop_journal_writer(windex) ; journal_end(&th, dir->i_sb, jbegin_count) ; reiserfs_write_unlock(dir->i_sb); return retval; @@ -1019,7 +1059,6 @@ static int reiserfs_link (struct dentry atomic_inc(&inode->i_count) ; d_instantiate(dentry, inode); - pop_journal_writer(windex) ; journal_end(&th, dir->i_sb, jbegin_count) ; reiserfs_write_unlock(dir->i_sb); return 0; @@ -1083,7 +1122,6 @@ static int reiserfs_rename (struct inode struct item_head new_entry_ih, old_entry_ih, dot_dot_ih ; struct reiserfs_dir_entry old_de, new_de, dot_dot_de; struct inode * old_inode, * new_dentry_inode; - int windex ; struct reiserfs_transaction_handle th ; int jbegin_count ; umode_t old_inode_mode; @@ -1151,7 +1189,6 @@ static int reiserfs_rename (struct inode } journal_begin(&th, old_dir->i_sb, jbegin_count) ; - windex = push_journal_writer("reiserfs_rename") ; /* add new entry (or find the existing one) */ retval = reiserfs_add_entry (&th, new_dir, new_dentry->d_name.name, new_dentry->d_name.len, @@ -1162,7 +1199,6 @@ static int reiserfs_rename (struct inode "vs-7050: new entry is found, new inode == 0\n"); } } else if (retval) { - pop_journal_writer(windex) ; journal_end(&th, old_dir->i_sb, jbegin_count) ; reiserfs_write_unlock(old_dir->i_sb); return retval; @@ -1303,7 +1339,6 @@ static int reiserfs_rename (struct inode reiserfs_warning ("vs-7060: reiserfs_rename: couldn't not cut old name. Fsck later?\n"); old_dir->i_size -= DEH_SIZE + old_de.de_entrylen; - old_dir->i_blocks = ((old_dir->i_size + 511) >> 9); reiserfs_update_sd (&th, old_dir); reiserfs_update_sd (&th, new_dir); @@ -1315,14 +1350,11 @@ static int reiserfs_rename (struct inode reiserfs_update_sd (&th, new_dentry_inode); } - pop_journal_writer(windex) ; journal_end(&th, old_dir->i_sb, jbegin_count) ; reiserfs_write_unlock(old_dir->i_sb); return 0; } - - /* * directories can handle most operations... */ @@ -1337,5 +1369,28 @@ struct inode_operations reiserfs_dir_ino .rmdir = reiserfs_rmdir, .mknod = reiserfs_mknod, .rename = reiserfs_rename, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, }; +/* + * symlink operations.. 
same as page_symlink_inode_operations, with xattr + * stuff added + */ +struct inode_operations reiserfs_symlink_inode_operations = { + .readlink = page_readlink, + .follow_link = page_follow_link, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, + +}; + + diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/objectid.c linux-2.6.5-rc1-mm2/fs/reiserfs/objectid.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/objectid.c 2004-03-11 03:55:26.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/objectid.c 2004-03-19 16:10:24.009965624 +0100 @@ -86,7 +86,6 @@ __u32 reiserfs_get_unused_objectid (stru } journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); - s->s_dirt = 1; return unused_objectid; } @@ -105,8 +104,6 @@ void reiserfs_release_objectid (struct r reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); - s->s_dirt = 1; - /* start at the beginning of the objectid map (i = 0) and go to the end of it (i = disk_sb->s_oid_cursize). Linear search is diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/prints.c linux-2.6.5-rc1-mm2/fs/reiserfs/prints.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/prints.c 2004-03-11 03:55:34.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/prints.c 2004-03-19 16:10:24.014965085 +0100 @@ -333,7 +333,6 @@ extern struct tree_balance * cur_tb; void reiserfs_panic (struct super_block * sb, const char * fmt, ...) { - show_reiserfs_locks() ; do_reiserfs_warning(fmt); printk ( KERN_EMERG "%s", error_buf); BUG (); diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/procfs.c linux-2.6.5-rc1-mm2/fs/reiserfs/procfs.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/procfs.c 2004-03-11 03:55:24.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/procfs.c 2004-03-19 16:10:24.008965731 +0100 @@ -87,7 +87,7 @@ static int show_super(struct seq_file *m struct reiserfs_sb_info *r = REISERFS_SB(sb); seq_printf(m, "state: \t%s\n" - "mount options: \t%s%s%s%s%s%s%s%s%s%s%s%s\n" + "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n" "gen. counter: \t%i\n" "s_kmallocs: \t%i\n" "s_disk_reads: \t%i\n" @@ -131,7 +131,6 @@ static int show_super(struct seq_file *m reiserfs_test4( sb ) ? "TEST4 " : "", have_large_tails( sb ) ? "TAILS " : have_small_tails(sb)?"SMALL_TAILS ":"NO_TAILS ", replay_only( sb ) ? "REPLAY_ONLY " : "", - reiserfs_dont_log( sb ) ? "DONT_LOG " : "LOG ", convert_reiserfs( sb ) ? 
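/* mounted -o conv: 3.5 format being upgraded to 3.6 on the fly */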
"CONV " : "", atomic_read( &r -> s_generation_counter ), @@ -370,7 +369,6 @@ static int show_journal(struct seq_file "j_first_unflushed_offset: \t%lu\n" "j_last_flush_trans_id: \t%lu\n" "j_trans_start_time: \t%li\n" - "j_journal_list_index: \t%i\n" "j_list_bitmap_index: \t%i\n" "j_must_wait: \t%i\n" "j_next_full_flush: \t%i\n" @@ -416,7 +414,6 @@ static int show_journal(struct seq_file JF( j_first_unflushed_offset ), JF( j_last_flush_trans_id ), JF( j_trans_start_time ), - JF( j_journal_list_index ), JF( j_list_bitmap_index ), JF( j_must_wait ), JF( j_next_full_flush ), diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/stree.c linux-2.6.5-rc1-mm2/fs/reiserfs/stree.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/stree.c 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/stree.c 2004-03-19 16:10:24.019964547 +0100 @@ -60,6 +60,7 @@ #include #include #include +#include /* Does the buffer contain a disk block which is in the tree. */ inline int B_IS_IN_TREE (const struct buffer_head * p_s_bh) @@ -71,9 +72,6 @@ inline int B_IS_IN_TREE (const struct bu return ( B_LEVEL (p_s_bh) != FREE_LEVEL ); } - - - inline void copy_short_key (void * to, const void * from) { memcpy (to, from, SHORT_KEY_SIZE); @@ -1125,8 +1123,7 @@ static char prepare_for_delete_or_cut( tmp = get_block_num(p_n_unfm_pointer,0); put_block_num(p_n_unfm_pointer, 0, 0); journal_mark_dirty (th, p_s_sb, p_s_bh); - inode->i_blocks -= p_s_sb->s_blocksize / 512; - reiserfs_free_block(th, tmp); + reiserfs_free_block(th, inode, tmp, 1); if ( item_moved (&s_ih, p_s_path) ) { need_research = 1; break ; @@ -1155,8 +1152,7 @@ static char prepare_for_delete_or_cut( } } - -/* Calculate bytes number which will be deleted or cutted in the balance. */ +/* Calculate number of bytes which will be deleted or cut during balance */ int calc_deleted_bytes_number( struct tree_balance * p_s_tb, char c_mode @@ -1167,14 +1163,14 @@ int calc_deleted_bytes_number( if ( is_statdata_le_ih (p_le_ih) ) return 0; + n_del_size = ( c_mode == M_DELETE ) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0]; if ( is_direntry_le_ih (p_le_ih) ) { // return EMPTY_DIR_SIZE; /* We delete emty directoris only. */ // we can't use EMPTY_DIR_SIZE, as old format dirs have a different // empty size. ick. FIXME, is this right? // - return ih_item_len(p_le_ih); + return n_del_size ; } - n_del_size = ( c_mode == M_DELETE ) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0]; if ( is_indirect_le_ih (p_le_ih) ) n_del_size = (n_del_size/UNFM_P_SIZE)* @@ -1208,17 +1204,46 @@ void padd_item (char * item, int total_l item [--i] = 0; } +#ifdef REISERQUOTA_DEBUG +char key2type(struct key *ih) +{ + if (is_direntry_le_key(2, ih)) + return 'd'; + if (is_direct_le_key(2, ih)) + return 'D'; + if (is_indirect_le_key(2, ih)) + return 'i'; + if (is_statdata_le_key(2, ih)) + return 's'; + return 'u'; +} + +char head2type(struct item_head *ih) +{ + if (is_direntry_le_ih(ih)) + return 'd'; + if (is_direct_le_ih(ih)) + return 'D'; + if (is_indirect_le_ih(ih)) + return 'i'; + if (is_statdata_le_ih(ih)) + return 's'; + return 'u'; +} +#endif /* Delete object item. */ int reiserfs_delete_item (struct reiserfs_transaction_handle *th, struct path * p_s_path, /* Path to the deleted item. */ const struct cpu_key * p_s_item_key, /* Key to search for the deleted item. 
*/ - struct inode * p_s_inode,/* inode is here just to update i_blocks */ + struct inode * p_s_inode,/* inode is here just to update i_blocks and quotas */ struct buffer_head * p_s_un_bh) /* NULL or unformatted node pointer. */ { struct super_block * p_s_sb = p_s_inode->i_sb; struct tree_balance s_del_balance; struct item_head s_ih; + struct item_head *q_ih; + int quota_cut_bytes; int n_ret_value, n_del_size, n_removed; @@ -1268,6 +1293,22 @@ int reiserfs_delete_item (struct reiserf // reiserfs_delete_item returns item length when success n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE); + q_ih = get_ih(p_s_path) ; + quota_cut_bytes = ih_item_len(q_ih) ; + + /* hack so the quota code doesn't have to guess if the file + ** has a tail. On tail insert, we allocate quota for 1 unformatted node. + ** We test the offset because the tail might have been + ** split into multiple items, and we only want to decrement for + ** the unfm node once + */ + if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(q_ih)) { + if ((le_ih_k_offset(q_ih) & (p_s_sb->s_blocksize - 1)) == 1) { + quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE; + } else { + quota_cut_bytes = 0 ; + } + } if ( p_s_un_bh ) { int off; @@ -1299,10 +1340,14 @@ int reiserfs_delete_item (struct reiserf B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih), n_ret_value); kunmap_atomic(data, KM_USER0); } - /* Perform balancing after all resources have been collected at once. */ do_balance(&s_del_balance, NULL, NULL, M_DELETE); +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota delete_item(): freeing %u, id=%u type=%c\n", quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih)); +#endif + DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes); + /* Return deleted body length */ return n_ret_value; } @@ -1327,14 +1372,16 @@ int reiserfs_delete_item (struct reiserf /* this deletes item which never gets split */ void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th, + struct inode *inode, struct key * key) { struct tree_balance tb; INITIALIZE_PATH (path); - int item_len; + int item_len = 0; int tb_init = 0 ; struct cpu_key cpu_key; int retval; + int quota_cut_bytes = 0; le_key2cpu_key (&cpu_key, key); @@ -1358,6 +1405,7 @@ void reiserfs_delete_solid_item (struct item_len = ih_item_len( PATH_PITEM_HEAD(&path) ); init_tb_struct (th, &tb, th->t_super, &path, - (IH_SIZE + item_len)); } + quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path)) ; retval = fix_nodes (M_DELETE, &tb, NULL, 0); if (retval == REPEAT_SEARCH) { @@ -1367,6 +1415,12 @@ void reiserfs_delete_solid_item (struct if (retval == CARRY_ON) { do_balance (&tb, 0, 0, M_DELETE); + if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */ +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota delete_solid_item(): freeing %u id=%u type=%c\n", quota_cut_bytes, inode->i_uid, key2type(key)); +#endif + DQUOT_FREE_SPACE_NODIRTY(inode, quota_cut_bytes); + } break; } @@ -1399,7 +1453,7 @@ void reiserfs_delete_object (struct reis } /* USE_INODE_GENERATION_COUNTER */ #endif - reiserfs_delete_solid_item (th, INODE_PKEY (inode)); + reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode)); } @@ -1486,12 +1540,14 @@ int reiserfs_cut_from_item (struct reise structure by using the init_tb_struct and fix_nodes functions. After that we can make tree balancing. */ struct tree_balance s_cut_balance; + struct item_head *p_le_ih; int n_cut_size = 0, /* Amount to be cut. 
*/ n_ret_value = CARRY_ON, n_removed = 0, /* Number of the removed unformatted nodes. */ n_is_inode_locked = 0; char c_mode; /* Mode of the balance. */ int retval2 = -1; + int quota_cut_bytes; init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path, n_cut_size); @@ -1579,23 +1635,27 @@ int reiserfs_cut_from_item (struct reise RFALSE( c_mode == M_PASTE || c_mode == M_INSERT, "invalid mode"); /* Calculate number of bytes that need to be cut from the item. */ + quota_cut_bytes = ( c_mode == M_DELETE ) ? ih_item_len(get_ih(p_s_path)) : -s_cut_balance.insert_size[0]; if (retval2 == -1) n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode); else n_ret_value = retval2; - - if ( c_mode == M_DELETE ) { - struct item_head * p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path); - - if ( is_direct_le_ih (p_le_ih) && (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) { - /* we delete first part of tail which was stored in direct - item(s) */ + + + /* For direct items, we only change the quota when deleting the last + ** item. + */ + p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path); + if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) { + if (c_mode == M_DELETE && + (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) { // FIXME: this is to keep 3.5 happy REISERFS_I(p_s_inode)->i_first_direct_byte = U32_MAX; - p_s_inode->i_blocks -= p_s_sb->s_blocksize / 512; + quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE ; + } else { + quota_cut_bytes = 0 ; } } - #ifdef CONFIG_REISERFS_CHECK if (n_is_inode_locked) { struct item_head * le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path); @@ -1630,10 +1690,13 @@ int reiserfs_cut_from_item (struct reise */ REISERFS_I(p_s_inode)->i_flags &= ~i_pack_on_close_mask ; } +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota cut_from_item(): freeing %u id=%u type=%c\n", quota_cut_bytes, p_s_inode->i_uid, '?'); +#endif + DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes); return n_ret_value; } - static void truncate_directory (struct reiserfs_transaction_handle *th, struct inode * inode) { if (inode->i_nlink) @@ -1641,8 +1704,8 @@ static void truncate_directory (struct r set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), DOT_OFFSET); set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_DIRENTRY); - reiserfs_delete_solid_item (th, INODE_PKEY (inode)); - + reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode)); + reiserfs_update_sd(th, inode) ; set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), SD_OFFSET); set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_STAT_DATA); } @@ -1809,18 +1872,37 @@ static void check_research_for_paste (st int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th, struct path * p_s_search_path, /* Path to the pasted item. */ const struct cpu_key * p_s_key, /* Key to search for the needed item.*/ + struct inode * inode, /* Inode item belongs to */ const char * p_c_body, /* Pointer to the bytes to paste. */ int n_pasted_size) /* Size of pasted bytes. 
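** (this many bytes are charged against the inode's quota with
** DQUOT_ALLOC_SPACE_NODIRTY before the tree is touched, and are
** released again on the error_out path if the paste fails)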
*/ { struct tree_balance s_paste_balance; int retval; + int fs_gen; + + fs_gen = get_generation(inode->i_sb) ; +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota paste_into_item(): allocating %u id=%u type=%c\n", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key))); +#endif + + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) { + pathrelse(p_s_search_path); + return -EDQUOT; + } init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path, n_pasted_size); #ifdef DISPLACE_NEW_PACKING_LOCALITIES s_paste_balance.key = p_s_key->on_disk_key; #endif - - while ( (retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) == REPEAT_SEARCH ) { + + /* DQUOT_* can schedule, must check before the fix_nodes */ + if (fs_changed(fs_gen, inode->i_sb)) { + goto search_again; + } + + while ((retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) == +REPEAT_SEARCH ) { +search_again: /* file system changed while we were in the fix_nodes */ PROC_INFO_INC( th -> t_super, paste_into_item_restarted ); retval = search_for_position_by_key (th->t_super, p_s_key, p_s_search_path); @@ -1849,6 +1931,10 @@ int reiserfs_paste_into_item (struct rei error_out: /* this also releases the path */ unfix_nodes(&s_paste_balance); +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota paste_into_item(): freeing %u id=%u type=%c\n", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key))); +#endif + DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size); return retval ; } @@ -1858,23 +1944,45 @@ int reiserfs_insert_item(struct reiserfs struct path * p_s_path, /* Path to the inserteded item. */ const struct cpu_key * key, struct item_head * p_s_ih, /* Pointer to the item header to insert.*/ + struct inode * inode, const char * p_c_body) /* Pointer to the bytes to insert. */ { struct tree_balance s_ins_balance; int retval; + int fs_gen = 0 ; + int quota_bytes = 0 ; + if (inode) { /* Do we count quotas for item? */ + fs_gen = get_generation(inode->i_sb); + quota_bytes = ih_item_len(p_s_ih); + + /* hack so the quota code doesn't have to guess if the file has + ** a tail, links are always tails, so there's no guessing needed + */ + if (!S_ISLNK (inode->i_mode) && is_direct_le_ih(p_s_ih)) { + quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE ; + } +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota insert_item(): allocating %u id=%u type=%c\n", quota_bytes, inode->i_uid, head2type(p_s_ih)); +#endif + /* We can't dirty inode here. It would be immediately written but + * appropriate stat item isn't inserted yet... 
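* The _NODIRTY quota variants exist for exactly this case: charge the
* space without marking the inode dirty.  And since the quota calls may
* schedule, the pattern used here (same names as the code below) is:
*
*	fs_gen = get_generation(inode->i_sb);
*	if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes))
*		return -EDQUOT;
*	if (fs_changed(fs_gen, inode->i_sb))
*		goto search_again;	// research before fix_nodes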
*/ + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) { + pathrelse(p_s_path); + return -EDQUOT; + } + } init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path, IH_SIZE + ih_item_len(p_s_ih)); #ifdef DISPLACE_NEW_PACKING_LOCALITIES s_ins_balance.key = key->on_disk_key; #endif - - /* - if (p_c_body == 0) - n_zeros_num = ih_item_len(p_s_ih); - */ - // le_key2cpu_key (&key, &(p_s_ih->ih_key)); + /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */ + if (inode && fs_changed(fs_gen, inode->i_sb)) { + goto search_again; + } while ( (retval = fix_nodes(M_INSERT, &s_ins_balance, p_s_ih, p_c_body)) == REPEAT_SEARCH) { +search_again: /* file system changed while we were in the fix_nodes */ PROC_INFO_INC( th -> t_super, insert_item_restarted ); retval = search_item (th->t_super, key, p_s_path); @@ -1889,7 +1997,7 @@ int reiserfs_insert_item(struct reiserfs goto error_out; } } - + /* make balancing after all resources will be collected at a time */ if ( retval == CARRY_ON ) { do_balance (&s_ins_balance, p_s_ih, p_c_body, M_INSERT); @@ -1900,6 +2008,11 @@ int reiserfs_insert_item(struct reiserfs error_out: /* also releases the path */ unfix_nodes(&s_ins_balance); +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota insert_item(): freeing %u id=%u type=%c\n", quota_bytes, inode->i_uid, head2type(p_s_ih)); +#endif + if (inode) + DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes) ; return retval; } diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/super.c linux-2.6.5-rc1-mm2/fs/reiserfs/super.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/super.c 2004-03-11 03:55:26.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/super.c 2004-03-19 16:30:24.719646136 +0100 @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -59,22 +61,26 @@ static int is_any_reiserfs_magic_string static int reiserfs_remount (struct super_block * s, int * flags, char * data); static int reiserfs_statfs (struct super_block * s, struct kstatfs * buf); -static void reiserfs_write_super (struct super_block * s) +static void reiserfs_sync_fs (struct super_block * s) { + if (!(s->s_flags & MS_RDONLY)) { + struct reiserfs_transaction_handle th; + reiserfs_write_lock(s); + journal_begin(&th, s, 1); + journal_end_sync(&th, s, 1); + reiserfs_flush_old_commits(s); + s->s_dirt = 0; + reiserfs_write_unlock(s); + } +} - int dirty = 0 ; - reiserfs_write_lock(s); - if (!(s->s_flags & MS_RDONLY)) { - dirty = flush_old_commits(s, 1) ; - } - s->s_dirt = dirty; - reiserfs_write_unlock(s); +static void reiserfs_write_super(struct super_block *s) +{ + reiserfs_sync_fs(s); } static void reiserfs_write_super_lockfs (struct super_block * s) { - - int dirty = 0 ; struct reiserfs_transaction_handle th ; reiserfs_write_lock(s); if (!(s->s_flags & MS_RDONLY)) { @@ -84,7 +90,7 @@ static void reiserfs_write_super_lockfs reiserfs_block_writes(&th) ; journal_end(&th, s, 1) ; } - s->s_dirt = dirty; + s->s_dirt = 0; reiserfs_write_unlock(s); } @@ -109,7 +115,7 @@ static void remove_save_link_only (struc /* we are going to do one balancing */ journal_begin (&th, s, JOURNAL_PER_BALANCE_CNT); - reiserfs_delete_solid_item (&th, key); + reiserfs_delete_solid_item (&th, NULL, key); if (oid_free) /* removals are protected by direct items */ reiserfs_release_objectid (&th, le32_to_cpu (key->k_objectid)); @@ -295,8 +301,8 @@ void add_save_link (struct reiserfs_tran /* body of "save" link */ link = INODE_PKEY (inode)->k_dir_id; - /* put "save" link inot tree */ - retval = 
reiserfs_insert_item (th, &path, &key, &ih, (char *)&link); + /* put "save" link inot tree, don't charge quota to anyone */ + retval = reiserfs_insert_item (th, &path, &key, &ih, NULL, (char *)&link); if (retval) { if (retval != -ENOSPC) reiserfs_warning ("vs-2120: add_save_link: insert_item returned %d\n", @@ -338,7 +344,8 @@ void remove_save_link (struct inode * in ( REISERFS_I(inode) -> i_flags & i_link_saved_truncate_mask ) ) || ( !truncate && ( REISERFS_I(inode) -> i_flags & i_link_saved_unlink_mask ) ) ) - reiserfs_delete_solid_item (&th, &key); + /* don't take quota bytes from anywhere */ + reiserfs_delete_solid_item (&th, NULL, &key); if (!truncate) { reiserfs_release_objectid (&th, inode->i_ino); REISERFS_I(inode) -> i_flags &= ~i_link_saved_unlink_mask; @@ -353,7 +360,17 @@ static void reiserfs_put_super (struct s { int i; struct reiserfs_transaction_handle th ; + + if (REISERFS_SB(s)->xattr_root) { + d_invalidate (REISERFS_SB(s)->xattr_root); + dput (REISERFS_SB(s)->xattr_root); + } + if (REISERFS_SB(s)->priv_root) { + d_invalidate (REISERFS_SB(s)->priv_root); + dput (REISERFS_SB(s)->priv_root); + } + /* change file system state to current state if it was mounted with read-write permissions */ if (!(s->s_flags & MS_RDONLY)) { journal_begin(&th, s, 10) ; @@ -418,6 +435,8 @@ static void init_once(void * foo, kmem_c SLAB_CTOR_CONSTRUCTOR) { INIT_LIST_HEAD(&ei->i_prealloc_list) ; inode_init_once(&ei->vfs_inode); + ei->i_acl_access = NULL; + ei->i_acl_default = NULL; } } @@ -458,6 +477,22 @@ static void reiserfs_dirty_inode (struct reiserfs_write_unlock(inode->i_sb); } +static void reiserfs_clear_inode (struct inode *inode) +{ + struct posix_acl *acl; + + acl = REISERFS_I(inode)->i_acl_access; + if (acl && !IS_ERR (acl)) + posix_acl_release (acl); + REISERFS_I(inode)->i_acl_access = NULL; + + acl = REISERFS_I(inode)->i_acl_default; + if (acl && !IS_ERR (acl)) + posix_acl_release (acl); + REISERFS_I(inode)->i_acl_default = NULL; +} + + struct super_operations reiserfs_sops = { .alloc_inode = reiserfs_alloc_inode, @@ -465,6 +500,7 @@ struct super_operations reiserfs_sops = .write_inode = reiserfs_write_inode, .dirty_inode = reiserfs_dirty_inode, .delete_inode = reiserfs_delete_inode, + .clear_inode = reiserfs_clear_inode, .put_super = reiserfs_put_super, .write_super = reiserfs_write_super, .write_super_lockfs = reiserfs_write_super_lockfs, @@ -506,6 +542,21 @@ typedef struct { applied BEFORE setmask */ } opt_desc_t; +/* possible values for -o data= */ +static const arg_desc_t logging_mode[] = { + {"ordered", 1<s_mount_opt &= ~((1 << REISERFS_DATA_LOG) | + (1 << REISERFS_DATA_ORDERED) | + (1 << REISERFS_DATA_WRITEBACK)); + REISERFS_SB(s)->s_mount_opt |= (1 << mode); +} + +static void handle_data_mode(struct super_block *s, unsigned long mount_options) +{ + if (mount_options & (1 << REISERFS_DATA_LOG)) { + if (!reiserfs_data_log(s)) { + switch_data_mode(s, REISERFS_DATA_LOG); + printk("reiserfs: switching to journaled data mode\n"); + } + } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) { + if (!reiserfs_data_ordered(s)) { + switch_data_mode(s, REISERFS_DATA_ORDERED); + printk("reiserfs: switching to ordered data mode\n"); + } + } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) { + if (!reiserfs_data_writeback(s)) { + switch_data_mode(s, REISERFS_DATA_WRITEBACK); + printk("reiserfs: switching to writeback data mode\n"); + } + } +} + +static void handle_barrier_mode(struct super_block *s, unsigned long bits) { + int flush = (1 << REISERFS_BARRIER_FLUSH); + int none = (1 
<< REISERFS_BARRIER_NONE); + int all_barrier = flush | none; + + if (bits & all_barrier) { + REISERFS_SB(s)->s_mount_opt &= ~all_barrier; + if (bits & flush) { + REISERFS_SB(s)->s_mount_opt |= flush; + printk("reiserfs: enabling write barrier flush mode\n"); + } else if (bits & none) { + REISERFS_SB(s)->s_mount_opt |= none; + printk("reiserfs: write barriers turned off\n"); + } + } +} + static void handle_attrs( struct super_block *s ) { struct reiserfs_super_block * rs; @@ -775,6 +880,10 @@ static int reiserfs_remount (struct supe safe_mask |= 1 << REISERFS_HASHED_RELOCATION; safe_mask |= 1 << REISERFS_TEST4; safe_mask |= 1 << REISERFS_ATTRS; + safe_mask |= 1 << REISERFS_XATTRS_USER; + safe_mask |= 1 << REISERFS_POSIXACL; + safe_mask |= 1 << REISERFS_BARRIER_FLUSH; + safe_mask |= 1 << REISERFS_BARRIER_NONE; /* Update the bitmask, taking care to keep * the bits we're not allowed to change here */ @@ -791,6 +900,7 @@ static int reiserfs_remount (struct supe } if (*mount_flags & MS_RDONLY) { + reiserfs_xattr_init (s, *mount_flags); /* remount read-only */ if (s->s_flags & MS_RDONLY) /* it is read-only already */ @@ -805,12 +915,15 @@ static int reiserfs_remount (struct supe reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; set_sb_umount_state( rs, REISERFS_SB(s)->s_mount_state ); journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); - s->s_dirt = 0; } else { /* remount read-write */ - if (!(s->s_flags & MS_RDONLY)) + if (!(s->s_flags & MS_RDONLY)) { + reiserfs_xattr_init (s, *mount_flags); return 0; /* We are read-write already */ + } + handle_data_mode(s, mount_options); + handle_barrier_mode(s, mount_options); REISERFS_SB(s)->s_mount_state = sb_umount_state(rs) ; s->s_flags &= ~MS_RDONLY ; /* now it is safe to call journal_begin */ journal_begin(&th, s, 10) ; @@ -822,15 +935,17 @@ static int reiserfs_remount (struct supe set_sb_umount_state( rs, REISERFS_ERROR_FS ); /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); - s->s_dirt = 0; REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS ; } /* this will force a full flush of all journal lists */ SB_JOURNAL(s)->j_must_wait = 1 ; journal_end(&th, s, 10) ; + s->s_dirt = 0; - if (!( *mount_flags & MS_RDONLY ) ) + if (!( *mount_flags & MS_RDONLY ) ) { finish_unfinished( s ); + reiserfs_xattr_init (s, *mount_flags); + } return 0; } @@ -1258,8 +1373,10 @@ static int reiserfs_fill_super (struct s REISERFS_SB(s)->s_alloc_options.bits = ( 1 << 5); /* If file grew past 4 blocks, start preallocation blocks for it. 
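** (the option is stored as n+1, so the preallocsize of 17 set below
** preallocates 16 blocks at a time, up from the old 8)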
*/ REISERFS_SB(s)->s_alloc_options.preallocmin = 4; - /* Preallocate by 8 blocks (9-1) at once */ - REISERFS_SB(s)->s_alloc_options.preallocsize = 9; + /* Preallocate by 16 blocks (17-1) at once */ + REISERFS_SB(s)->s_alloc_options.preallocsize = 17; + /* Initialize the rwsem for xattr dir */ + init_rwsem(&REISERFS_SB(s)->xattr_dir_sem); jdev_name = NULL; if (reiserfs_parse_options (s, (char *) data, &(sbi->s_mount_opt), &blocks, &jdev_name, &commit_max_age) == 0) { @@ -1303,6 +1420,24 @@ static int reiserfs_fill_super (struct s SPRINTK(silent, "reiserfs:warning: - it is slow mode for debugging.\n"); #endif + /* make data=ordered the default */ + if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) && + !reiserfs_data_writeback(s)) + { + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED); + } + + if (reiserfs_data_log(s)) { + printk("reiserfs: using journaled data mode\n"); + } else if (reiserfs_data_ordered(s)) { + printk("reiserfs: using ordered data mode\n"); + } else { + printk("reiserfs: using writeback data mode\n"); + } + if (reiserfs_barrier_flush(s)) { + printk("reiserfs: using flush barriers\n"); + } + // set_device_ro(s->s_dev, 1) ; if( journal_init(s, jdev_name, old_format, commit_max_age) ) { SPRINTK(silent, "sh-2022: reiserfs_fill_super: unable to initialize journal space\n") ; @@ -1389,15 +1524,25 @@ static int reiserfs_fill_super (struct s journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); journal_end(&th, s, 1) ; - + + if (reiserfs_xattr_init (s, s->s_flags)) { + dput (s->s_root); + s->s_root = NULL; + goto error; + } + /* look for files which were to be removed in previous session */ finish_unfinished (s); - - s->s_dirt = 0; } else { if ( old_format_only(s) && !silent) { reiserfs_warning("reiserfs: using 3.5.x disk format\n") ; } + + if (reiserfs_xattr_init (s, s->s_flags)) { + dput (s->s_root); + s->s_root = NULL; + goto error; + } } // mark hash in super block: it could be unset. 
overwrite should be ok set_sb_hash_function_code( rs, function2code(sbi->s_hash_function ) ); @@ -1465,6 +1610,9 @@ init_reiserfs_fs ( void ) return ret; } + if ((ret = reiserfs_xattr_register_handlers ())) + goto failed_reiserfs_xattr_register_handlers; + reiserfs_proc_info_global_init (); reiserfs_proc_register_global ("version", reiserfs_global_version_in_proc); @@ -1474,6 +1622,9 @@ init_reiserfs_fs ( void ) return 0; } + reiserfs_xattr_unregister_handlers (); + +failed_reiserfs_xattr_register_handlers: reiserfs_proc_unregister_global ("version"); reiserfs_proc_info_global_done (); destroy_inodecache (); @@ -1484,6 +1635,7 @@ init_reiserfs_fs ( void ) static void __exit exit_reiserfs_fs ( void ) { + reiserfs_xattr_unregister_handlers (); reiserfs_proc_unregister_global ("version"); reiserfs_proc_info_global_done (); unregister_filesystem (& reiserfs_fs_type); diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/tail_conversion.c linux-2.6.5-rc1-mm2/fs/reiserfs/tail_conversion.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/tail_conversion.c 2004-03-11 03:55:56.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/tail_conversion.c 2004-03-19 16:10:24.020964439 +0100 @@ -66,11 +66,11 @@ int direct2indirect (struct reiserfs_tra set_ih_free_space (&ind_ih, 0); /* delete at nearest future */ put_ih_item_len( &ind_ih, UNFM_P_SIZE ); PATH_LAST_POSITION (path)++; - n_retval = reiserfs_insert_item (th, path, &end_key, &ind_ih, + n_retval = reiserfs_insert_item (th, path, &end_key, &ind_ih, inode, (char *)&unfm_ptr); } else { /* Paste into last indirect item of an object. */ - n_retval = reiserfs_paste_into_item(th, path, &end_key, + n_retval = reiserfs_paste_into_item(th, path, &end_key, inode, (char *)&unfm_ptr, UNFM_P_SIZE); } if ( n_retval ) { @@ -143,16 +143,17 @@ void reiserfs_unmap_buffer(struct buffer if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { BUG() ; } - clear_buffer_dirty(bh) ; lock_buffer(bh) ; + clear_buffer_dirty(bh) ; /* Remove the buffer from whatever list it belongs to. We are mostly interested in removing it from per-sb j_dirty_buffers list, to avoid BUG() on attempt to write not mapped buffer */ - if ( !list_empty(&bh->b_assoc_buffers) && bh->b_page) { + if ( (!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { struct inode *inode = bh->b_page->mapping->host; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); spin_lock(&j->j_dirty_buffers_lock); list_del_init(&bh->b_assoc_buffers); + reiserfs_free_jh(bh); spin_unlock(&j->j_dirty_buffers_lock); } clear_buffer_mapped(bh) ; @@ -275,7 +276,7 @@ int indirect2direct (struct reiserfs_tra set_cpu_key_k_type (&key, TYPE_DIRECT); key.key_length = 4; /* Insert tail as new direct item in the tree */ - if ( reiserfs_insert_item(th, p_s_path, &key, &s_ih, + if ( reiserfs_insert_item(th, p_s_path, &key, &s_ih, p_s_inode, tail ? tail : NULL) < 0 ) { /* No disk memory. So we can not convert last unformatted node to the direct item. In this case we used to adjust @@ -293,13 +294,15 @@ int indirect2direct (struct reiserfs_tra */ unmap_buffers(page, pos1) ; + /* make sure to get the i_blocks changes from reiserfs_insert_item */ + reiserfs_update_sd(th, p_s_inode); + // note: we have now the same as in above direct2indirect // conversion: there are two keys which have matching first three // key components. They only differ by the fouhth one. /* We have inserted new direct item and must remove last unformatted node. 
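** (the removal itself is the caller's job: *p_c_mode is set to M_CUT
** below, and the manual i_blocks adjustment is gone now that
** reiserfs_update_sd() and the quota code account for the space)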
*/ - p_s_inode->i_blocks += (p_s_sb->s_blocksize / 512); *p_c_mode = M_CUT; /* we store position of first direct item in the in-core inode */ diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_acl.c linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_acl.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_acl.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_acl.c 2004-03-19 16:10:24.027963686 +0100 @@ -0,0 +1,563 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int +xattr_set_acl(struct inode *inode, int type, const void *value, size_t size) +{ + struct posix_acl *acl; + int error; + + if (!reiserfs_posixacl(inode->i_sb)) + return -EOPNOTSUPP; + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) { + return PTR_ERR(acl); + } else if (acl) { + error = posix_acl_valid(acl); + if (error) + goto release_and_out; + } + } else + acl = NULL; + + error = reiserfs_set_acl (inode, type, acl); + +release_and_out: + posix_acl_release(acl); + return error; +} + + +static int +xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +{ + struct posix_acl *acl; + int error; + + if (!reiserfs_posixacl(inode->i_sb)) + return -EOPNOTSUPP; + + acl = reiserfs_get_acl (inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +posix_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(reiserfs_acl_header)) + return ERR_PTR(-EINVAL); + if (((reiserfs_acl_header *)value)->a_version != + cpu_to_le32(REISERFS_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(reiserfs_acl_header); + count = reiserfs_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n=0; n < count; n++) { + reiserfs_acl_entry *entry = + (reiserfs_acl_entry *)value; + if ((char *)value + sizeof(reiserfs_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(reiserfs_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(reiserfs_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_id = + le32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. 
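* The disk format is a little-endian version header followed by one
* entry per ACE: ACL_USER and ACL_GROUP entries carry an e_id, while
* the USER_OBJ/GROUP_OBJ/MASK/OTHER tags use the short entry with the
* id omitted.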
+ */ +static void * +posix_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + reiserfs_acl_header *ext_acl; + char *e; + int n; + + *size = reiserfs_acl_size(acl->a_count); + ext_acl = (reiserfs_acl_header *)kmalloc(sizeof(reiserfs_acl_header) + + acl->a_count * sizeof(reiserfs_acl_entry), GFP_NOFS); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION); + e = (char *)ext_acl + sizeof(reiserfs_acl_header); + for (n=0; n < acl->a_count; n++) { + reiserfs_acl_entry *entry = (reiserfs_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = + cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(reiserfs_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(reiserfs_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * Inode operation get_posix_acl(). + * + * inode->i_sem: down + * BKL held [before 2.5.x] + */ +struct posix_acl * +reiserfs_get_acl(struct inode *inode, int type) +{ + char *name, *value; + struct posix_acl *acl, **p_acl; + size_t size; + int retval; + struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_ACL_ACCESS; + p_acl = &reiserfs_i->i_acl_access; + break; + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_ACL_DEFAULT; + p_acl = &reiserfs_i->i_acl_default; + break; + default: + return ERR_PTR (-EINVAL); + } + + if (IS_ERR (*p_acl)) { + if (PTR_ERR (*p_acl) == -ENODATA) + return NULL; + } else if (*p_acl != NULL) + return posix_acl_dup (*p_acl); + + size = reiserfs_xattr_get (inode, name, NULL, 0); + if ((int)size < 0) { + if (size == -ENODATA || size == -ENOSYS) { + *p_acl = ERR_PTR (-ENODATA); + return NULL; + } + return ERR_PTR (size); + } + + value = kmalloc (size, GFP_NOFS); + if (!value) + return ERR_PTR (-ENOMEM); + + retval = reiserfs_xattr_get(inode, name, value, size); + if (retval == -ENODATA || retval == -ENOSYS) { + /* This shouldn't actually happen as it should have + been caught above.. but just in case */ + acl = NULL; + *p_acl = ERR_PTR (-ENODATA); + } else if (retval < 0) { + acl = ERR_PTR(retval); + } else { + acl = posix_acl_from_disk(value, retval); + *p_acl = posix_acl_dup (acl); + } + + kfree(value); + return acl; +} + +/* + * Inode operation set_posix_acl(). + * + * inode->i_sem: down + * BKL held [before 2.5.x] + */ +int +reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + char *name; + void *value = NULL; + struct posix_acl **p_acl; + size_t size; + int error; + struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_ACL_ACCESS; + p_acl = &reiserfs_i->i_acl_access; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode (acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + if (error == 0) + acl = NULL; + } + } + break; + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_ACL_DEFAULT; + p_acl = &reiserfs_i->i_acl_default; + if (!S_ISDIR (inode->i_mode)) + return acl ? 
-EACCES : 0; + break; + default: + return -EINVAL; + } + + if (acl) { + value = posix_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + error = reiserfs_xattr_set(inode, name, value, size, 0); + } else { + error = reiserfs_xattr_del (inode, name); + if (error == -ENODATA) + error = 0; + } + + if (value) + kfree(value); + + if (!error) { + /* Release the old one */ + if (!IS_ERR (*p_acl) && *p_acl) + posix_acl_release (*p_acl); + + if (acl == NULL) + *p_acl = ERR_PTR (-ENODATA); + else + *p_acl = posix_acl_dup (acl); + } + + return error; +} + +/* dir->i_sem: down, + * inode is new and not released into the wild yet */ +int +reiserfs_inherit_default_acl (struct inode *dir, struct dentry *dentry, struct inode *inode) +{ + struct posix_acl *acl; + int err = 0; + + /* ACLs only get applied to files and directories */ + if (S_ISLNK (inode->i_mode)) + return 0; + + /* ACLs can only be used on "new" objects, so if it's an old object + * there is nothing to inherit from */ + if (get_inode_sd_version (dir) == STAT_DATA_V1) + goto apply_umask; + + /* Don't apply ACLs to objects in the .reiserfs_priv tree.. This + * would be useless since permissions are ignored, and a pain because + * it introduces locking cycles */ + if (is_reiserfs_priv_object (dir)) { + REISERFS_I(inode)->i_flags |= i_priv_object; + goto apply_umask; + } + + acl = reiserfs_get_acl (dir, ACL_TYPE_DEFAULT); + if (IS_ERR (acl)) { + if (PTR_ERR (acl) == -ENODATA) + goto apply_umask; + return PTR_ERR (acl); + } + + if (acl) { + struct posix_acl *acl_copy; + mode_t mode = inode->i_mode; + int need_acl; + + /* Copy the default ACL to the default ACL of a new directory */ + if (S_ISDIR (inode->i_mode)) { + err = reiserfs_set_acl (inode, ACL_TYPE_DEFAULT, acl); + if (err) + goto cleanup; + } + + /* Now we reconcile the new ACL and the mode, + potentially modifying both */ + acl_copy = posix_acl_clone (acl, GFP_NOFS); + if (!acl_copy) { + err = -ENOMEM; + goto cleanup; + } + + + need_acl = posix_acl_create_masq (acl_copy, &mode); + if (need_acl >= 0) { + if (mode != inode->i_mode) { + inode->i_mode = mode; + } + + /* If we need an ACL.. */ + if (need_acl > 0) { + err = reiserfs_set_acl (inode, ACL_TYPE_ACCESS, acl_copy); + if (err) + goto cleanup_copy; + } + } +cleanup_copy: + posix_acl_release (acl_copy); +cleanup: + posix_acl_release (acl); + } else { +apply_umask: + /* no ACL, apply umask */ + inode->i_mode &= ~current->fs->umask; + } + + return err; +} + +/* Looks up and caches the result of the default ACL. + * We do this so that we don't need to carry the xattr_sem into + * reiserfs_new_inode if we don't need to */ +int +reiserfs_cache_default_acl (struct inode *inode) +{ + int ret = 0; + if (reiserfs_posixacl (inode->i_sb) && + !is_reiserfs_priv_object (inode)) { + struct posix_acl *acl; + reiserfs_read_lock_xattr_i (inode); + reiserfs_read_lock_xattrs (inode->i_sb); + acl = reiserfs_get_acl (inode, ACL_TYPE_DEFAULT); + reiserfs_read_unlock_xattrs (inode->i_sb); + reiserfs_read_unlock_xattr_i (inode); + ret = acl ? 
1 : 0; + posix_acl_release (acl); + } + + return ret; +} + +int +reiserfs_acl_chmod (struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (get_inode_sd_version (inode) == STAT_DATA_V1 || + !reiserfs_posixacl(inode->i_sb)) + { + return 0; + } + + reiserfs_read_lock_xattrs (inode->i_sb); + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); + reiserfs_read_unlock_xattrs (inode->i_sb); + if (!acl) + return 0; + if (IS_ERR(acl)) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_NOFS); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) { + int lock = !has_xattr_dir (inode); + reiserfs_write_lock_xattr_i (inode); + if (lock) + reiserfs_write_lock_xattrs (inode->i_sb); + else + reiserfs_read_lock_xattrs (inode->i_sb); + error = reiserfs_set_acl(inode, ACL_TYPE_ACCESS, clone); + if (lock) + reiserfs_write_unlock_xattrs (inode->i_sb); + else + reiserfs_read_unlock_xattrs (inode->i_sb); + reiserfs_write_unlock_xattr_i (inode); + } + posix_acl_release(clone); + return error; +} + +static int +posix_acl_access_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) + return -EINVAL; + return xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); +} + +static int +posix_acl_access_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) + return -EINVAL; + return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int +posix_acl_access_del (struct inode *inode, const char *name) +{ + struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); + struct posix_acl **acl = &reiserfs_i->i_acl_access; + if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) + return -EINVAL; + if (!IS_ERR (*acl) && *acl) { + posix_acl_release (*acl); + *acl = ERR_PTR (-ENODATA); + } + + return 0; +} + +static int +posix_acl_access_list (struct inode *inode, const char *name, int namelen, char *out) +{ + int len = namelen; + if (!reiserfs_posixacl (inode->i_sb)) + return 0; + if (out) + memcpy (out, name, len); + + return len; +} + +struct reiserfs_xattr_handler posix_acl_access_handler = { + prefix: XATTR_NAME_ACL_ACCESS, + get: posix_acl_access_get, + set: posix_acl_access_set, + del: posix_acl_access_del, + list: posix_acl_access_list, +}; + +static int +posix_acl_default_get (struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) + return -EINVAL; + return xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); +} + +static int +posix_acl_default_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) + return -EINVAL; + return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +static int +posix_acl_default_del (struct inode *inode, const char *name) +{ + struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); + struct posix_acl **acl = &reiserfs_i->i_acl_default; + if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) + return -EINVAL; + if (!IS_ERR (*acl) && *acl) { + posix_acl_release (*acl); + *acl = ERR_PTR (-ENODATA); + } + + return 0; +} + +static int +posix_acl_default_list (struct inode *inode, const char *name, int namelen, char *out) +{ + int len = namelen; + if (!reiserfs_posixacl (inode->i_sb)) + return 0; + if (out) 
+ memcpy (out, name, len); + + return len; +} + +struct reiserfs_xattr_handler posix_acl_default_handler = { + prefix: XATTR_NAME_ACL_DEFAULT, + get: posix_acl_default_get, + set: posix_acl_default_set, + del: posix_acl_default_del, + list: posix_acl_default_list, +}; diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr.c linux-2.6.5-rc1-mm2/fs/reiserfs/xattr.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/xattr.c 2004-03-19 16:10:24.023964116 +0100 @@ -0,0 +1,1440 @@ +/* + * linux/fs/reiserfs/xattr.c + * + * Copyright (c) 2002 by Jeff Mahoney, + * + */ + +/* + * In order to implement EA/ACLs in a clean, backwards compatible manner, + * they are implemented as files in a "private" directory. + * Each EA is in its own file, with the directory layout like so (/ is assumed + * to be relative to fs root). Inside the /.reiserfs_priv/xattrs directory, + * directories named using the capital-hex form of the objectid and + * generation number are used. Inside each directory are individual files + * named with the name of the extended attribute. + * + * So, for objectid 12648430, we could have: + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_access + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_default + * /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type + * .. or similar. + * + * The file contents are the text of the EA. The size is known based on the + * stat data describing the file. + * + * In the case of system.posix_acl_access and system.posix_acl_default, since + * these are special cases for filesystem ACLs, they are interpreted by the + * kernel; in addition, they are negatively and positively cached and attached + * to the inode so that unnecessary lookups are avoided. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define FL_READONLY 128 +#define FL_DIR_SEM_HELD 256 +#define PRIVROOT_NAME ".reiserfs_priv" +#define XAROOT_NAME "xattrs" + +static struct reiserfs_xattr_handler *find_xattr_handler_prefix (const char *prefix); + +static struct dentry * +create_xa_root (struct super_block *sb) +{ + struct dentry *privroot = dget (REISERFS_SB(sb)->priv_root); + struct dentry *xaroot; + + /* This needs to be created at mount-time */ + if (!privroot) + return ERR_PTR(-EOPNOTSUPP); + + xaroot = lookup_one_len (XAROOT_NAME, privroot, strlen (XAROOT_NAME)); + if (IS_ERR (xaroot)) { + goto out; + } else if (!xaroot->d_inode) { + int err; + down (&privroot->d_inode->i_sem); + err = privroot->d_inode->i_op->mkdir (privroot->d_inode, xaroot, 0700); + up (&privroot->d_inode->i_sem); + + if (err) { + dput (xaroot); + dput (privroot); + return ERR_PTR (err); + } + REISERFS_SB(sb)->xattr_root = dget (xaroot); + } + +out: + dput (privroot); + return xaroot; +} + +/* This will return a dentry, or error, referring to the xa root directory. + * If the xa root doesn't exist yet, the dentry will be returned without + * an associated inode. This dentry can be used with ->mkdir to create + * the xa directory.
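The per-inode directory name is just the objectid and generation rendered in capital hex, which open_xa_dir() below builds with snprintf(). A standalone sketch reproducing the example from the comment above (12648430 == 0xC0FFEE, generation 0):

    #include <stdio.h>

    int main(void)
    {
        unsigned int objectid = 12648430;   /* 0xC0FFEE */
        unsigned int generation = 0;
        char namebuf[17];                   /* same size the kernel code uses */

        snprintf(namebuf, sizeof(namebuf), "%X.%X", objectid, generation);
        printf("/.reiserfs_priv/xattrs/%s/user.Content-Type\n", namebuf);
        return 0;   /* .../C0FFEE.0/user.Content-Type */
    }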
*/ +static struct dentry * +__get_xa_root (struct super_block *s) +{ + struct dentry *privroot = dget (REISERFS_SB(s)->priv_root); + struct dentry *xaroot = NULL; + + if (IS_ERR (privroot) || !privroot) + return privroot; + + xaroot = lookup_one_len (XAROOT_NAME, privroot, strlen (XAROOT_NAME)); + if (IS_ERR (xaroot)) { + goto out; + } else if (!xaroot->d_inode) { + dput (xaroot); + xaroot = NULL; + goto out; + } + + REISERFS_SB(s)->xattr_root = dget (xaroot); + +out: + dput (privroot); + return xaroot; +} + +/* Returns the dentry (or NULL) referring to the root of the extended + * attribute directory tree. If it has already been retrieved, it is used. + * Otherwise, we attempt to retrieve it from disk. It may also return + * a pointer-encoded error. + */ +static inline struct dentry * +get_xa_root (struct super_block *s) +{ + struct dentry *dentry = dget (REISERFS_SB(s)->xattr_root); + + if (!dentry) + dentry = __get_xa_root (s); + + return dentry; +} + +/* Opens the directory corresponding to the inode's extended attribute store. + * If flags allow, the tree to the directory may be created. If creation is + * prohibited, -ENODATA is returned. */ +static struct dentry * +open_xa_dir (const struct inode *inode, int flags) +{ + struct dentry *xaroot, *xadir; + char namebuf[17]; + + xaroot = get_xa_root (inode->i_sb); + if (IS_ERR (xaroot)) { + return xaroot; + } else if (!xaroot) { + if (flags == 0 || flags & XATTR_CREATE) { + xaroot = create_xa_root (inode->i_sb); + if (IS_ERR (xaroot)) + return xaroot; + } + if (!xaroot) + return ERR_PTR (-ENODATA); + } + + /* ok, we have xaroot open */ + + snprintf (namebuf, sizeof (namebuf), "%X.%X", + le32_to_cpu (INODE_PKEY (inode)->k_objectid), + inode->i_generation); + xadir = lookup_one_len (namebuf, xaroot, strlen (namebuf)); + if (IS_ERR (xadir)) { + dput (xaroot); + return xadir; + } + + if (!xadir->d_inode) { + int err; + if (flags == 0 || flags & XATTR_CREATE) { + /* Although there is nothing else trying to create this directory, + * another directory with the same hash may be created, so we need + * to protect against that */ + err = xaroot->d_inode->i_op->mkdir (xaroot->d_inode, xadir, 0700); + if (err) { + dput (xaroot); + dput (xadir); + return ERR_PTR (err); + } + } + if (!xadir->d_inode) { + dput (xaroot); + dput (xadir); + return ERR_PTR (-ENODATA); + } + /* Newly created object.. Need to mark it private */ + REISERFS_I(xadir->d_inode)->i_flags |= i_priv_object; + } + + dput (xaroot); + return xadir; +} + +/* Returns a dentry corresponding to a specific extended attribute file + * for the inode. If flags allow, the file is created. Otherwise, a + * valid or negative dentry, or an error is returned.
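The flag handling in get_xa_file_dentry(), which follows, reduces to a small decision table. A userspace model of just that logic (XATTR_CREATE and XATTR_REPLACE values mirrored from <linux/xattr.h>; FL_READONLY is this patch's private lookup-only flag):

    #include <stdio.h>
    #include <errno.h>

    #define XATTR_CREATE   0x1   /* fail if the attribute already exists */
    #define XATTR_REPLACE  0x2   /* fail if the attribute does not exist */
    #define FL_READONLY    128   /* patch-private: lookup only, never create */

    /* Returns 0 if the lookup outcome is acceptable, -errno otherwise;
     * *create is set when a missing backing file should be created. */
    static int resolve(int exists, int flags, int *create)
    {
        *create = 0;
        if (exists)
            return (flags & XATTR_CREATE) ? -EEXIST : 0;
        if (flags & (XATTR_REPLACE | FL_READONLY))
            return 0;   /* caller just sees a negative dentry */
        *create = 1;
        return 0;
    }

    int main(void)
    {
        int create;

        printf("exists + XATTR_CREATE -> %d\n",
               resolve(1, XATTR_CREATE, &create));        /* -EEXIST */
        printf("missing + no flags    -> %d, create=%d\n",
               resolve(0, 0, &create), create);           /* 0, 1 */
        return 0;
    }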
*/ +static struct dentry * +get_xa_file_dentry (const struct inode *inode, const char *name, int flags) +{ + struct dentry *xadir, *xafile; + int err = 0; + + xadir = open_xa_dir (inode, flags); + if (IS_ERR (xadir)) { + return ERR_PTR (PTR_ERR (xadir)); + } else if (xadir && !xadir->d_inode) { + dput (xadir); + return ERR_PTR (-ENODATA); + } + + xafile = lookup_one_len (name, xadir, strlen (name)); + if (IS_ERR (xafile)) { + dput (xadir); + return ERR_PTR (PTR_ERR (xafile)); + } + + if (xafile->d_inode) { /* file exists */ + if (flags & XATTR_CREATE) { + err = -EEXIST; + dput (xafile); + goto out; + } + } else if (flags & XATTR_REPLACE || flags & FL_READONLY) { + goto out; + } else { + /* inode->i_sem is down, so nothing else can try to create + * the same xattr */ + err = xadir->d_inode->i_op->create (xadir->d_inode, xafile, + 0700|S_IFREG, NULL); + + if (err) { + dput (xafile); + goto out; + } + /* Newly created object.. Need to mark it private */ + REISERFS_I(xafile->d_inode)->i_flags |= i_priv_object; + } + +out: + dput (xadir); + if (err) + xafile = ERR_PTR (err); + return xafile; +} + + +/* Opens a file pointer to the attribute associated with inode */ +static struct file * +open_xa_file (const struct inode *inode, const char *name, int flags) +{ + struct dentry *xafile; + struct file *fp; + + xafile = get_xa_file_dentry (inode, name, flags); + if (IS_ERR (xafile)) + return ERR_PTR (PTR_ERR (xafile)); + else if (!xafile->d_inode) { + dput (xafile); + return ERR_PTR (-ENODATA); + } + + fp = dentry_open (xafile, NULL, O_RDWR); + /* dentry_open dputs the dentry if it fails */ + + return fp; +} + + +/* + * this is very similar to fs/reiserfs/dir.c:reiserfs_readdir, but + * we need to drop the path before calling the filldir struct. That + * would be a big performance hit to the non-xattr case, so I've copied + * the whole thing for now. --clm + * + * the big difference is that I go backwards through the directory, + * and don't mess with f->f_pos, but the idea is the same. Do some + * action on each and every entry in the directory. + * + * we're called with i_sem held, so there are no worries about the directory + * changing underneath us. 
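The walk below starts at max_reiserfs_offset() and moves backwards; every time the tree path is dropped for filldir, it resumes by re-searching for the last directory offset instead of trusting a saved position. A purely illustrative standalone sketch of that resume-by-key idea over a flat array:

    #include <stdio.h>

    /* entry "offsets", sorted; the real code searches a tree by key */
    static const int offsets[] = { 2, 5, 8, 11 };
    static const int noffsets = sizeof(offsets) / sizeof(offsets[0]);

    /* the re-search step: largest offset strictly below pos */
    static int search_below(int pos)
    {
        int i, best = -1;

        for (i = 0; i < noffsets; i++)
            if (offsets[i] < pos && offsets[i] > best)
                best = offsets[i];
        return best;
    }

    int main(void)
    {
        int pos = 1000;   /* start past any possible offset */
        int k;

        while ((k = search_below(pos)) >= 0) {
            printf("visit %d\n", k);   /* filldir runs with the path dropped */
            pos = k;                   /* resume strictly below this entry */
        }
        return 0;   /* visits 11, 8, 5, 2 */
    }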
+ */ +static int __xattr_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ + INITIALIZE_PATH (path_to_entry); + struct buffer_head * bh; + int entry_num; + struct item_head * ih, tmp_ih; + int search_res; + char * local_buf; + loff_t next_pos; + char small_buf[32] ; /* avoid kmalloc if we can */ + struct reiserfs_de_head *deh; + int d_reclen; + char * d_name; + off_t d_off; + ino_t d_ino; + struct reiserfs_dir_entry de; + + + /* form key for search the next directory entry using f_pos field of + file structure */ + next_pos = max_reiserfs_offset(inode); + + while (1) { +research: + if (next_pos <= DOT_DOT_OFFSET) + break; + make_cpu_key (&pos_key, inode, next_pos, TYPE_DIRENTRY, 3); + + search_res = search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, &de); + if (search_res == IO_ERROR) { + // FIXME: we could just skip part of directory which could + // not be read + pathrelse(&path_to_entry); + return -EIO; + } + + if (search_res == NAME_NOT_FOUND) + de.de_entry_num--; + + set_de_name_and_namelen(&de); + entry_num = de.de_entry_num; + deh = &(de.de_deh[entry_num]); + + bh = de.de_bh; + ih = de.de_ih; + + if (!is_direntry_le_ih(ih)) { + reiserfs_warning("not direntry %h\n", ih); + break; + } + copy_item_head(&tmp_ih, ih); + + /* we must have found item, that is item of this directory, */ + RFALSE( COMP_SHORT_KEYS (&(ih->ih_key), &pos_key), + "vs-9000: found item %h does not match to dir we readdir %K", + ih, &pos_key); + + if (deh_offset(deh) <= DOT_DOT_OFFSET) { + break; + } + + /* look for the previous entry in the directory */ + next_pos = deh_offset (deh) - 1; + + if (!de_visible (deh)) + /* it is hidden entry */ + continue; + + d_reclen = entry_length(bh, ih, entry_num); + d_name = B_I_DEH_ENTRY_FILE_NAME (bh, ih, deh); + d_off = deh_offset (deh); + d_ino = deh_objectid (deh); + + if (!d_name[d_reclen - 1]) + d_reclen = strlen (d_name); + + if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)){ + /* too big to send back to VFS */ + continue ; + } + + /* Ignore the .reiserfs_priv entry */ + if (reiserfs_xattrs (inode->i_sb) && + !old_format_only(inode->i_sb) && + deh_objectid (deh) == le32_to_cpu (INODE_PKEY(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->k_objectid)) + continue; + + if (d_reclen <= 32) { + local_buf = small_buf ; + } else { + local_buf = reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb) ; + if (!local_buf) { + pathrelse (&path_to_entry); + return -ENOMEM ; + } + if (item_moved (&tmp_ih, &path_to_entry)) { + reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; + + /* sigh, must retry. Do this same offset again */ + next_pos = d_off; + goto research; + } + } + + // Note, that we copy name to user space via temporary + // buffer (local_buf) because filldir will block if + // user space buffer is swapped out. At that time + // entry can move to somewhere else + memcpy (local_buf, d_name, d_reclen); + + /* the filldir function might need to start transactions, + * or do who knows what. 
Release the path now that we've + * copied all the important stuff out of the deh + */ + pathrelse (&path_to_entry); + + if (filldir (dirent, local_buf, d_reclen, d_off, d_ino, + DT_UNKNOWN) < 0) { + if (local_buf != small_buf) { + reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; + } + goto end; + } + if (local_buf != small_buf) { + reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; + } + } /* while */ + +end: + pathrelse (&path_to_entry); + return 0; +} + +/* + * this could be done with dedicated readdir ops for the xattr files, + * but I want to get something working asap + * this is stolen from vfs_readdir + * + */ +static +int xattr_readdir(struct file *file, filldir_t filler, void *buf) +{ + struct inode *inode = file->f_dentry->d_inode; + int res = -ENOTDIR; + if (!file->f_op || !file->f_op->readdir) + goto out; + down(&inode->i_sem); +// down(&inode->i_zombie); + res = -ENOENT; + if (!IS_DEADDIR(inode)) { + lock_kernel(); + res = __xattr_readdir(file, buf, filler); + unlock_kernel(); + } +// up(&inode->i_zombie); + up(&inode->i_sem); +out: + return res; +} + + +/* Internal operations on file data */ +static inline void +reiserfs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static struct page * +reiserfs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page; + /* We can deadlock if we try to free dentries, + and an unlink/rmdir has just occurred - GFP_NOFS avoids this */ + mapping->flags = (mapping->flags & ~__GFP_BITS_MASK) | GFP_NOFS; + page = read_cache_page (mapping, n, + (filler_t*)mapping->a_ops->readpage, NULL); + if (!IS_ERR(page)) { + wait_on_page_locked(page); + kmap(page); + if (!PageUptodate(page)) + goto fail; + + if (PageError(page)) + goto fail; + } + return page; + +fail: + reiserfs_put_page(page); + return ERR_PTR(-EIO); +} + +static inline __u32 +xattr_hash (const char *msg, int len) +{ + return csum_partial (msg, len, 0); +} + +/* Generic extended attribute operations that can be used by xa plugins */ + +/* + * inode->i_sem: down + */ +int +reiserfs_xattr_set (struct inode *inode, const char *name, const void *buffer, + size_t buffer_size, int flags) +{ + int err = 0; + struct file *fp; + struct page *page; + char *data; + struct address_space *mapping; + size_t file_pos = 0; + size_t buffer_pos = 0; + struct inode *xinode; + struct iattr newattrs; + __u32 xahash = 0; + + if (IS_RDONLY (inode)) + return -EROFS; + + if (IS_IMMUTABLE (inode) || IS_APPEND (inode)) + return -EPERM; + + if (get_inode_sd_version (inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + /* Empty xattrs are ok, they're just empty files, no hash */ + if (buffer && buffer_size) + xahash = xattr_hash (buffer, buffer_size); + +open_file: + fp = open_xa_file (inode, name, flags); + if (IS_ERR (fp)) { + err = PTR_ERR (fp); + goto out; + } + + xinode = fp->f_dentry->d_inode; + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + + /* we need to copy it off..
*/ + if (xinode->i_nlink > 1) { + fput(fp); + err = reiserfs_xattr_del (inode, name); + if (err < 0) + goto out; + /* We just killed the old one, we're not replacing anymore */ + if (flags & XATTR_REPLACE) + flags &= ~XATTR_REPLACE; + goto open_file; + } + + /* Resize it so we're ok to write there */ + newattrs.ia_size = buffer_size; + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + down (&xinode->i_sem); + err = notify_change(fp->f_dentry, &newattrs); + if (err) + goto out_filp; + + mapping = xinode->i_mapping; + while (buffer_pos < buffer_size || buffer_pos == 0) { + size_t chunk; + size_t skip = 0; + size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1)); + if (buffer_size - buffer_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = buffer_size - buffer_pos; + + page = reiserfs_get_page (xinode, file_pos >> PAGE_CACHE_SHIFT); + if (IS_ERR (page)) { + err = PTR_ERR (page); + goto out_filp; + } + + lock_page (page); + data = page_address (page); + + if (file_pos == 0) { + struct reiserfs_xattr_header *rxh; + skip = file_pos = sizeof (struct reiserfs_xattr_header); + if (chunk + skip > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE - skip; + rxh = (struct reiserfs_xattr_header *)data; + rxh->h_magic = cpu_to_le32 (REISERFS_XATTR_MAGIC); + rxh->h_hash = cpu_to_le32 (xahash); + } + + err = mapping->a_ops->prepare_write (fp, page, page_offset, + page_offset + chunk + skip); + if (!err) { + if (buffer) + memcpy (data + skip, buffer + buffer_pos, chunk); + err = mapping->a_ops->commit_write (fp, page, page_offset, + page_offset + chunk + skip); + } + unlock_page (page); + reiserfs_put_page (page); + buffer_pos += chunk; + file_pos += chunk; + skip = 0; + if (err || buffer_size == 0 || !buffer) + break; + } + + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty (inode); + +out_filp: + up (&xinode->i_sem); + fput(fp); + +out: + return err; +} + +/* + * inode->i_sem: down + */ +int +reiserfs_xattr_get (const struct inode *inode, const char *name, void *buffer, + size_t buffer_size) +{ + ssize_t err = 0; + struct file *fp; + size_t isize; + size_t file_pos = 0; + size_t buffer_pos = 0; + struct page *page; + struct inode *xinode; + __u32 hash = 0; + + if (name == NULL) + return -EINVAL; + + /* We can't have xattrs attached to v1 items since they don't have + * generation numbers */ + if (get_inode_sd_version (inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + fp = open_xa_file (inode, name, FL_READONLY); + if (IS_ERR (fp)) { + err = PTR_ERR (fp); + goto out; + } + + xinode = fp->f_dentry->d_inode; + isize = xinode->i_size; + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + + /* Just return the size needed */ + if (buffer == NULL) { + err = isize - sizeof (struct reiserfs_xattr_header); + goto out_dput; + } + + if (buffer_size < isize - sizeof (struct reiserfs_xattr_header)) { + err = -ERANGE; + goto out_dput; + } + + while (file_pos < isize) { + size_t chunk; + char *data; + size_t skip = 0; + if (isize - file_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = isize - file_pos; + + page = reiserfs_get_page (xinode, file_pos >> PAGE_CACHE_SHIFT); + if (IS_ERR (page)) { + err = PTR_ERR (page); + goto out_dput; + } + + lock_page (page); + data = page_address (page); + if (file_pos == 0) { + struct reiserfs_xattr_header *rxh = + (struct reiserfs_xattr_header *)data; + skip = file_pos = sizeof (struct reiserfs_xattr_header); + chunk -= skip; + /* Magic doesn't match up.. 
*/ + if (rxh->h_magic != cpu_to_le32 (REISERFS_XATTR_MAGIC)) { + unlock_page (page); + reiserfs_put_page (page); + reiserfs_warning ("reiserfs: Invalid magic for xattr (%s) " + "associated with %s %k\n", name, + reiserfs_bdevname (inode->i_sb), + INODE_PKEY (inode)); + err = -EIO; + goto out_dput; + } + hash = le32_to_cpu (rxh->h_hash); + } + memcpy (buffer + buffer_pos, data + skip, chunk); + unlock_page (page); + reiserfs_put_page (page); + file_pos += chunk; + buffer_pos += chunk; + skip = 0; + } + err = isize - sizeof (struct reiserfs_xattr_header); + + if (xattr_hash (buffer, isize - sizeof (struct reiserfs_xattr_header)) != hash) { + reiserfs_warning ("reiserfs: Invalid hash for xattr (%s) associated " + "with %s %k\n", name, + reiserfs_bdevname (inode->i_sb), INODE_PKEY (inode)); + err = -EIO; + } + +out_dput: + fput(fp); + +out: + return err; +} + +static int +__reiserfs_xattr_del (struct dentry *xadir, const char *name, int namelen) +{ + struct dentry *dentry; + struct inode *dir = xadir->d_inode; + int err = 0; + + dentry = lookup_one_len (name, xadir, namelen); + if (IS_ERR (dentry)) { + err = PTR_ERR (dentry); + goto out; + } else if (!dentry->d_inode) { + err = -ENODATA; + goto out_file; + } + + /* Skip directories.. */ + if (S_ISDIR (dentry->d_inode->i_mode)) + goto out_file; + + if (!is_reiserfs_priv_object (dentry->d_inode)) { + reiserfs_warning ("OID %08x [%.*s/%.*s] doesn't have priv flag set [parent is %sset].\n", + le32_to_cpu (INODE_PKEY (dentry->d_inode)->k_objectid), + xadir->d_name.len, xadir->d_name.name, namelen, name, + is_reiserfs_priv_object (xadir->d_inode) ? "" : "not "); + dput (dentry); + return -EIO; + } + + err = dir->i_op->unlink (dir, dentry); + if (!err) + d_delete (dentry); + +out_file: + dput (dentry); + +out: + return err; +} + + +int +reiserfs_xattr_del (struct inode *inode, const char *name) +{ + struct dentry *dir; + int err; + + if (IS_RDONLY (inode)) + return -EROFS; + + dir = open_xa_dir (inode, FL_READONLY); + if (IS_ERR (dir)) { + err = PTR_ERR (dir); + goto out; + } + + err = __reiserfs_xattr_del (dir, name, strlen (name)); + dput (dir); + +out: + return err; +} + +/* The following are side effects of other operations that aren't explicitly + * modifying extended attributes. This includes operations such as permissions + * or ownership changes, object deletions, etc. */ + +static int +reiserfs_delete_xattrs_filler (void *buf, const char *name, int namelen, + loff_t offset, ino_t ino, unsigned int d_type) +{ + struct dentry *xadir = (struct dentry *)buf; + + return __reiserfs_xattr_del (xadir, name, namelen); + +} + +/* This is called w/ inode->i_sem downed */ +int +reiserfs_delete_xattrs (struct inode *inode) +{ + struct file *fp; + struct dentry *dir, *root; + int err = 0; + + /* Skip out, an xattr has no xattrs associated with it */ + if (is_reiserfs_priv_object (inode) || + get_inode_sd_version (inode) == STAT_DATA_V1 || + !reiserfs_xattrs(inode->i_sb)) + { + return 0; + } + reiserfs_read_lock_xattrs (inode->i_sb); + dir = open_xa_dir (inode, FL_READONLY); + reiserfs_read_unlock_xattrs (inode->i_sb); + if (IS_ERR (dir)) { + err = PTR_ERR (dir); + goto out; + } else if (!dir->d_inode) { + dput (dir); + return 0; + } + + fp = dentry_open (dir, NULL, O_RDWR); + if (IS_ERR (fp)) { + err = PTR_ERR (fp); + /* dentry_open dputs the dentry if it fails */ + goto out; + } + + lock_kernel (); + err = xattr_readdir (fp, reiserfs_delete_xattrs_filler, dir); + if (err) { + unlock_kernel (); + goto out_dir; + } + + /* Leftovers besides . and .. 
-- that's not good. */ + if (dir->d_inode->i_nlink <= 2) { + root = get_xa_root (inode->i_sb); + reiserfs_write_lock_xattrs (inode->i_sb); + err = vfs_rmdir (root->d_inode, dir); + reiserfs_write_unlock_xattrs (inode->i_sb); + dput (root); + } else { + reiserfs_warning ("Couldn't remove all entries in directory\n"); + } + unlock_kernel (); + +out_dir: + fput(fp); + +out: + if (!err) + REISERFS_I(inode)->i_flags = REISERFS_I(inode)->i_flags & ~i_has_xattr_dir; + return err; +} + +struct reiserfs_chown_buf { + struct inode *inode; + struct dentry *xadir; + struct iattr *attrs; +}; + +/* XXX: If there is a better way to do this, I'd love to hear about it */ +static int +reiserfs_chown_xattrs_filler (void *buf, const char *name, int namelen, + loff_t offset, ino_t ino, unsigned int d_type) +{ + struct reiserfs_chown_buf *chown_buf = (struct reiserfs_chown_buf *)buf; + struct dentry *xafile, *xadir = chown_buf->xadir; + struct iattr *attrs = chown_buf->attrs; + int err = 0; + + xafile = lookup_one_len (name, xadir, namelen); + if (IS_ERR (xafile)) + return PTR_ERR (xafile); + else if (!xafile->d_inode) { + dput (xafile); + return -ENODATA; + } + + if (!S_ISDIR (xafile->d_inode->i_mode)) + err = notify_change (xafile, attrs); + dput (xafile); + + return err; +} + +int +reiserfs_chown_xattrs (struct inode *inode, struct iattr *attrs) +{ + struct file *fp; + struct dentry *dir; + int err = 0; + struct reiserfs_chown_buf buf; + unsigned int ia_valid = attrs->ia_valid; + + /* Skip out, an xattr has no xattrs associated with it */ + if (is_reiserfs_priv_object (inode) || + get_inode_sd_version (inode) == STAT_DATA_V1 || + !reiserfs_xattrs(inode->i_sb)) + { + return 0; + } + reiserfs_read_lock_xattrs (inode->i_sb); + dir = open_xa_dir (inode, FL_READONLY); + reiserfs_read_unlock_xattrs (inode->i_sb); + if (IS_ERR (dir)) { + if (PTR_ERR (dir) != -ENODATA) + err = PTR_ERR (dir); + goto out; + } else if (!dir->d_inode) { + dput (dir); + goto out; + } + + fp = dentry_open (dir, NULL, O_RDWR); + if (IS_ERR (fp)) { + err = PTR_ERR (fp); + /* dentry_open dputs the dentry if it fails */ + goto out; + } + + lock_kernel (); + + attrs->ia_valid &= (ATTR_UID | ATTR_GID | ATTR_CTIME); + buf.xadir = dir; + buf.attrs = attrs; + buf.inode = inode; + + err = xattr_readdir (fp, reiserfs_chown_xattrs_filler, &buf); + if (err) { + unlock_kernel (); + goto out_dir; + } + + err = notify_change (dir, attrs); + unlock_kernel (); + +out_dir: + fput(fp); + +out: + attrs->ia_valid = ia_valid; + return err; +} + + +/* Actual operations that are exported to VFS-land */ + +/* + * Inode operation getxattr() + * Preliminary locking: we down dentry->d_inode->i_sem + */ +ssize_t +reiserfs_getxattr (struct dentry *dentry, const char *name, void *buffer, + size_t size) +{ + struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); + int err; + + if (!xah || !reiserfs_xattrs(dentry->d_sb) || + get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + reiserfs_read_lock_xattr_i (dentry->d_inode); + reiserfs_read_lock_xattrs (dentry->d_sb); + err = xah->get (dentry->d_inode, name, buffer, size); + reiserfs_read_unlock_xattrs (dentry->d_sb); + reiserfs_read_unlock_xattr_i (dentry->d_inode); + return err; +} + + +/* + * Inode operation setxattr() + * + * dentry->d_inode->i_sem down + */ +int +reiserfs_setxattr (struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); + int err; + int lock; + + if (!xah 
|| !reiserfs_xattrs(dentry->d_sb) || + get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + if (IS_RDONLY (dentry->d_inode)) + return -EROFS; + + if (IS_IMMUTABLE (dentry->d_inode) || IS_APPEND (dentry->d_inode)) + return -EROFS; + + reiserfs_write_lock_xattr_i (dentry->d_inode); + lock = !has_xattr_dir (dentry->d_inode); + if (lock) + reiserfs_write_lock_xattrs (dentry->d_sb); + else + reiserfs_read_lock_xattrs (dentry->d_sb); + err = xah->set (dentry->d_inode, name, value, size, flags); + if (lock) + reiserfs_write_unlock_xattrs (dentry->d_sb); + else + reiserfs_read_unlock_xattrs (dentry->d_sb); + reiserfs_write_unlock_xattr_i (dentry->d_inode); + return err; +} + +/* + * Inode operation removexattr() + * + * dentry->d_inode->i_sem down + */ +int +reiserfs_removexattr (struct dentry *dentry, const char *name) +{ + int err; + struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); + int lock; + + if (!xah || !reiserfs_xattrs(dentry->d_sb) || + get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + if (IS_RDONLY (dentry->d_inode)) + return -EROFS; + + if (IS_IMMUTABLE (dentry->d_inode) || IS_APPEND (dentry->d_inode)) + return -EPERM; + + reiserfs_write_lock_xattr_i (dentry->d_inode); + reiserfs_read_lock_xattrs (dentry->d_sb); + + /* Deletion pre-operation */ + if (xah->del) { + err = xah->del (dentry->d_inode, name); + if (err) + goto out; + } + + err = reiserfs_xattr_del (dentry->d_inode, name); + + dentry->d_inode->i_ctime = CURRENT_TIME; + mark_inode_dirty (dentry->d_inode); + +out: + reiserfs_read_unlock_xattrs (dentry->d_sb); + reiserfs_write_unlock_xattr_i (dentry->d_inode); + return err; +} + + +/* This is what filldir will use: + * r_pos will always contain the amount of space required for the entire + * list. If r_pos becomes larger than r_size, we need more space and we + * return an error indicating this. If r_pos is less than r_size, then we've + * filled the buffer successfully and we return success */ +struct reiserfs_listxattr_buf { + int r_pos; + int r_size; + char *r_buf; + struct inode *r_inode; +}; + +static int +reiserfs_listxattr_filler (void *buf, const char *name, int namelen, + loff_t offset, ino_t ino, unsigned int d_type) +{ + struct reiserfs_listxattr_buf *b = (struct reiserfs_listxattr_buf *)buf; + int len = 0; + if (name[0] != '.' || (namelen != 1 && (name[1] != '.' 
|| namelen != 2))) { + struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); + if (!xah) return 0; /* Unsupported xattr name, skip it */ + + /* We call ->list() twice because the operation isn't required to just + * return the name back - we want to make sure we have enough space */ + len += xah->list (b->r_inode, name, namelen, NULL); + + if (len) { + if (b->r_pos + len + 1 <= b->r_size) { + char *p = b->r_buf + b->r_pos; + p += xah->list (b->r_inode, name, namelen, p); + *p++ = '\0'; + } + b->r_pos += len + 1; + } + } + + return 0; +} +/* + * Inode operation listxattr() + * + * Preliminary locking: we down dentry->d_inode->i_sem + */ +ssize_t +reiserfs_listxattr (struct dentry *dentry, char *buffer, size_t size) +{ + struct file *fp; + struct dentry *dir; + int err = 0; + struct reiserfs_listxattr_buf buf; + + if (!dentry->d_inode) + return -EINVAL; + + if (!reiserfs_xattrs(dentry->d_sb) || + get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + reiserfs_read_lock_xattr_i (dentry->d_inode); + reiserfs_read_lock_xattrs (dentry->d_sb); + dir = open_xa_dir (dentry->d_inode, FL_READONLY); + reiserfs_read_unlock_xattrs (dentry->d_sb); + if (IS_ERR (dir)) { + err = PTR_ERR (dir); + if (err == -ENODATA) + err = 0; /* Not an error if there aren't any xattrs */ + goto out; + } + + fp = dentry_open (dir, NULL, O_RDWR); + if (IS_ERR (fp)) { + err = PTR_ERR (fp); + /* dentry_open dputs the dentry if it fails */ + goto out; + } + + buf.r_buf = buffer; + buf.r_size = buffer ? size : 0; + buf.r_pos = 0; + buf.r_inode = dentry->d_inode; + + REISERFS_I(dentry->d_inode)->i_flags |= i_has_xattr_dir; + + err = xattr_readdir (fp, reiserfs_listxattr_filler, &buf); + if (err) + goto out_dir; + + if (buf.r_pos > buf.r_size && buffer != NULL) + err = -ERANGE; + else + err = buf.r_pos; + +out_dir: + fput(fp); + +out: + reiserfs_read_unlock_xattr_i (dentry->d_inode); + return err; +} + +/* This is the implementation for the xattr plugin infrastructure */ +static struct list_head xattr_handlers = LIST_HEAD_INIT (xattr_handlers); +static rwlock_t handler_lock = RW_LOCK_UNLOCKED; + +static struct reiserfs_xattr_handler * +find_xattr_handler_prefix (const char *prefix) +{ + struct reiserfs_xattr_handler *xah = NULL; + struct list_head *p; + + read_lock (&handler_lock); + list_for_each (p, &xattr_handlers) { + xah = list_entry (p, struct reiserfs_xattr_handler, handlers); + if (strncmp (xah->prefix, prefix, strlen (xah->prefix)) == 0) + break; + xah = NULL; + } + + read_unlock (&handler_lock); + return xah; +} + +static void +__unregister_handlers (void) +{ + struct reiserfs_xattr_handler *xah; + struct list_head *p, *tmp; + + list_for_each_safe (p, tmp, &xattr_handlers) { + xah = list_entry (p, struct reiserfs_xattr_handler, handlers); + if (xah->exit) + xah->exit(); + + list_del_init (p); + } + INIT_LIST_HEAD (&xattr_handlers); +} + +int __init +reiserfs_xattr_register_handlers (void) +{ + int err = 0; + struct reiserfs_xattr_handler *xah; + struct list_head *p; + + write_lock (&handler_lock); + + /* If we're already initialized, nothing to do */ + if (!list_empty (&xattr_handlers)) { + write_unlock (&handler_lock); + return 0; + } + + /* Add the handlers */ + list_add_tail (&user_handler.handlers, &xattr_handlers); + list_add_tail (&trusted_handler.handlers, &xattr_handlers); +#ifdef CONFIG_REISERFS_FS_SECURITY + list_add_tail (&security_handler.handlers, &xattr_handlers); +#endif +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + list_add_tail (&posix_acl_access_handler.handlers, 
&xattr_handlers); + list_add_tail (&posix_acl_default_handler.handlers, &xattr_handlers); +#endif + + /* Run initializers, if available */ + list_for_each (p, &xattr_handlers) { + xah = list_entry (p, struct reiserfs_xattr_handler, handlers); + if (xah->init) { + err = xah->init (); + if (err) { + list_del_init (p); + break; + } + } + } + + /* Clean up other handlers, if any failed */ + if (err) + __unregister_handlers (); + + write_unlock (&handler_lock); + return err; +} + +void +reiserfs_xattr_unregister_handlers (void) +{ + write_lock (&handler_lock); + __unregister_handlers (); + write_unlock (&handler_lock); +} + +/* This will catch lookups from the fs root to .reiserfs_priv */ +static int +xattr_lookup_poison (struct dentry *dentry, struct qstr *q1, struct qstr *name) +{ + struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root; + if (name->len == priv_root->d_name.len && + name->hash == priv_root->d_name.hash && + !memcmp (name->name, priv_root->d_name.name, name->len)) { + return -ENOENT; + } + return 0; +} + +static struct dentry_operations xattr_lookup_poison_ops = { + .d_compare = xattr_lookup_poison, +}; + + +/* We need to take a copy of the mount flags since things like + * MS_RDONLY don't get set until *after* we're called. + * mount_flags != mount_options */ +int +reiserfs_xattr_init (struct super_block *s, int mount_flags) +{ + int err = 0; + + /* We need generation numbers to ensure that the oid mapping is correct + * v3.5 filesystems don't have them. */ + if (!old_format_only (s)) { + set_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); + } else if (reiserfs_xattrs_optional (s)) { + /* Old format filesystem, but optional xattrs have been enabled + * at mount time. Error out. */ + reiserfs_warning ("reiserfs: xattrs/ACLs not supported on pre v3.6 " + "format filesystem. Failing mount.\n"); + err = -EOPNOTSUPP; + goto error; + } else { + /* Old format filesystem, but no optional xattrs have been enabled. This + * means we silently disable xattrs on the filesystem. */ + clear_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); + } + + /* If we don't have the privroot located yet - go find it */ + if (reiserfs_xattrs (s) && !REISERFS_SB(s)->priv_root) { + struct dentry *dentry; + dentry = lookup_one_len (PRIVROOT_NAME, s->s_root, + strlen (PRIVROOT_NAME)); + if (!IS_ERR (dentry)) { + if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) { + struct inode *inode = dentry->d_parent->d_inode; + down (&inode->i_sem); + err = inode->i_op->mkdir (inode, dentry, 0700); + up (&inode->i_sem); + if (err) { + dput (dentry); + dentry = NULL; + } + + if (dentry && dentry->d_inode) + reiserfs_warning ("reiserfs: Created %s on %s - reserved for " + "xattr storage.\n", PRIVROOT_NAME, + reiserfs_bdevname (inode->i_sb)); + } else if (!dentry->d_inode) { + dput (dentry); + dentry = NULL; + } + } else + err = PTR_ERR (dentry); + + if (!err && dentry) { + s->s_root->d_op = &xattr_lookup_poison_ops; + REISERFS_I(dentry->d_inode)->i_flags |= i_priv_object; + REISERFS_SB(s)->priv_root = dentry; + } else if (!(mount_flags & MS_RDONLY)) { /* xattrs are unavailable */ + /* If we're read-only it just means that the dir hasn't been + * created. Not an error -- just no xattrs on the fs. We'll + * check again if we go read-write */ + reiserfs_warning ("reiserfs: xattrs/ACLs enabled and couldn't " + "find/create .reiserfs_priv. 
Failing mount.\n"); + err = -EOPNOTSUPP; + } + } + +error: + /* This is only nonzero if there was an error initializing the xattr + * directory or if there is a condition where we don't support them. */ + if (err) { + clear_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); + clear_bit (REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); + clear_bit (REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); + } + + /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ + s->s_flags = s->s_flags & ~MS_POSIXACL; + if (reiserfs_posixacl (s)) + s->s_flags |= MS_POSIXACL; + + return err; +} + +static int +__reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd, + int need_lock) +{ + umode_t mode = inode->i_mode; + + if (mask & MAY_WRITE) { + /* + * Nobody gets write access to a read-only fs. + */ + if (IS_RDONLY(inode) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + return -EROFS; + + /* + * Nobody gets write access to an immutable file. + */ + if (IS_IMMUTABLE(inode)) + return -EACCES; + } + + /* We don't do permission checks on the internal objects. + * Permissions are determined by the "owning" object. */ + if (is_reiserfs_priv_object (inode)) + return 0; + + if (current->fsuid == inode->i_uid) { + mode >>= 6; +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + } else if (reiserfs_posixacl(inode->i_sb) && + get_inode_sd_version (inode) != STAT_DATA_V1) { + struct posix_acl *acl; + + /* ACL can't contain additional permissions if + the ACL_MASK entry is 0 */ + if (!(mode & S_IRWXG)) + goto check_groups; + + reiserfs_read_lock_xattr_i (inode); + if (need_lock) + reiserfs_read_lock_xattrs (inode->i_sb); + acl = reiserfs_get_acl (inode, ACL_TYPE_ACCESS); + if (need_lock) + reiserfs_read_unlock_xattrs (inode->i_sb); + reiserfs_read_unlock_xattr_i (inode); + if (IS_ERR (acl)) { + if (PTR_ERR (acl) == -ENODATA) + goto check_groups; + return PTR_ERR (acl); + } + + if (acl) { + int err = posix_acl_permission (inode, acl, mask); + posix_acl_release (acl); + if (err == -EACCES) { + goto check_capabilities; + } + return err; + } else { + goto check_groups; + } +#endif + } else { +check_groups: + if (in_group_p(inode->i_gid)) + mode >>= 3; + } + + /* + * If the DACs are ok we don't need any capability check. + */ + if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask)) + return 0; + +check_capabilities: + /* + * Read/write DACs are always overridable. + * Executable DACs are overridable if at least one exec bit is set. + */ + if (!(mask & MAY_EXEC) || + (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode)) + if (capable(CAP_DAC_OVERRIDE)) + return 0; + + /* + * Searching includes executable on directories, else just read. 
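Aside from the ACL branch, __reiserfs_permission() above is the classic owner/group/other check: shift the relevant mode triplet into the low three bits and compare it against the request mask. A standalone sketch of just that selection (MAY_* values as in the kernel; no ACLs or capability overrides):

    #include <stdio.h>
    #include <sys/stat.h>

    #define MAY_EXEC   1
    #define MAY_WRITE  2
    #define MAY_READ   4

    /* is_owner/in_group: fsuid matches i_uid / fsgid matches i_gid */
    static int basic_permission(mode_t mode, int is_owner, int in_group, int mask)
    {
        if (is_owner)
            mode >>= 6;
        else if (in_group)
            mode >>= 3;

        if ((mode & mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == mask)
            return 0;
        return -1;   /* the kernel would fall through to capability checks */
    }

    int main(void)
    {
        /* 0754: owner rwx, group r-x, other r-- */
        printf("owner write: %d\n", basic_permission(0754, 1, 0, MAY_WRITE));
        printf("group write: %d\n", basic_permission(0754, 0, 1, MAY_WRITE));
        return 0;   /* 0 and -1 */
    }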
+ */ + if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) + if (capable(CAP_DAC_READ_SEARCH)) + return 0; + + return -EACCES; +} + +int +reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd) +{ + return __reiserfs_permission (inode, mask, nd, 1); +} + +int +reiserfs_permission_locked (struct inode *inode, int mask, struct nameidata *nd) +{ + return __reiserfs_permission (inode, mask, nd, 0); +} diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_security.c linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_security.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_security.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_security.c 2004-03-19 16:10:24.038962502 +0100 @@ -0,0 +1,69 @@ +#include +#include +#include +#include +#include +#include +#include + +#define XATTR_SECURITY_PREFIX "security." + +static int +security_get (struct inode *inode, const char *name, void *buffer, size_t size) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (is_reiserfs_priv_object(inode)) + return -EPERM; + + return reiserfs_xattr_get (inode, name, buffer, size); +} + +static int +security_set (struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (is_reiserfs_priv_object(inode)) + return -EPERM; + + return reiserfs_xattr_set (inode, name, buffer, size, flags); +} + +static int +security_del (struct inode *inode, const char *name) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (is_reiserfs_priv_object(inode)) + return -EPERM; + + return 0; +} + +static int +security_list (struct inode *inode, const char *name, int namelen, char *out) +{ + int len = namelen; + + if (is_reiserfs_priv_object(inode)) + return 0; + + if (out) + memcpy (out, name, len); + + return len; +} + + +struct reiserfs_xattr_handler security_handler = { + prefix: XATTR_SECURITY_PREFIX, + get: security_get, + set: security_set, + del: security_del, + list: security_list, +}; diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_trusted.c linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_trusted.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_trusted.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_trusted.c 2004-03-19 16:10:24.034962932 +0100 @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include +#include +#include + +#define XATTR_TRUSTED_PREFIX "trusted." 
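Each of these per-namespace files only supplies a handler; selection happens in find_xattr_handler_prefix() above, a strncmp() against each registered prefix. A userspace sketch of the same dispatch, with a static table standing in for the kernel's lock-protected list (table contents illustrative):

    #include <stdio.h>
    #include <string.h>

    struct handler {
        const char *prefix;
    };

    static const struct handler handlers[] = {
        { "user." }, { "trusted." }, { "security." },
        { "system.posix_acl_access" }, { "system.posix_acl_default" },
    };

    static const struct handler *find_handler(const char *name)
    {
        size_t i;

        for (i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++)
            if (!strncmp(handlers[i].prefix, name, strlen(handlers[i].prefix)))
                return &handlers[i];
        return NULL;   /* unsupported namespace -> -EOPNOTSUPP in the kernel */
    }

    int main(void)
    {
        const struct handler *h = find_handler("user.Content-Type");

        printf("%s\n", h ? h->prefix : "(none)");   /* "user." */
        return 0;
    }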
+ +static int +trusted_get (struct inode *inode, const char *name, void *buffer, size_t size) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs (inode->i_sb)) + return -EOPNOTSUPP; + + if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + return -EPERM; + + return reiserfs_xattr_get (inode, name, buffer, size); +} + +static int +trusted_set (struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs (inode->i_sb)) + return -EOPNOTSUPP; + + if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + return -EPERM; + + return reiserfs_xattr_set (inode, name, buffer, size, flags); +} + +static int +trusted_del (struct inode *inode, const char *name) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs (inode->i_sb)) + return -EOPNOTSUPP; + + if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + return -EPERM; + + return 0; +} + +static int +trusted_list (struct inode *inode, const char *name, int namelen, char *out) +{ + int len = namelen; + + if (!reiserfs_xattrs (inode->i_sb)) + return 0; + + if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + return 0; + + if (out) + memcpy (out, name, len); + + return len; +} + + +struct reiserfs_xattr_handler trusted_handler = { + prefix: XATTR_TRUSTED_PREFIX, + get: trusted_get, + set: trusted_set, + del: trusted_del, + list: trusted_list, +}; diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_user.c linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_user.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_user.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_user.c 2004-03-19 16:10:24.027963686 +0100 @@ -0,0 +1,99 @@ +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_REISERFS_FS_POSIX_ACL +# include +#endif + +#define XATTR_USER_PREFIX "user." 
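The strlen(name) < sizeof(PREFIX) guard that opens every handler is a compact way of rejecting a bare prefix: sizeof() counts the terminating NUL, so the name has to run at least one character past the prefix itself. A quick standalone check of that arithmetic:

    #include <stdio.h>
    #include <string.h>

    #define XATTR_USER_PREFIX "user."

    int main(void)
    {
        /* sizeof includes the NUL: 6 versus strlen's 5 */
        printf("sizeof=%zu strlen=%zu\n",
               sizeof(XATTR_USER_PREFIX), strlen(XATTR_USER_PREFIX));

        /* a bare "user." fails the guard, "user.x" passes it */
        printf("\"user.\"  accepted=%d\n",
               !(strlen("user.") < sizeof(XATTR_USER_PREFIX)));
        printf("\"user.x\" accepted=%d\n",
               !(strlen("user.x") < sizeof(XATTR_USER_PREFIX)));
        return 0;
    }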
+ +static int +user_get (struct inode *inode, const char *name, void *buffer, size_t size) +{ + + int error; + + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs_user (inode->i_sb)) + return -EOPNOTSUPP; + + error = reiserfs_permission_locked (inode, MAY_READ, NULL); + if (error) + return error; + + return reiserfs_xattr_get (inode, name, buffer, size); +} + +static int +user_set (struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) +{ + + int error; + + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs_user (inode->i_sb)) + return -EOPNOTSUPP; + + if (!S_ISREG (inode->i_mode) && + (!S_ISDIR (inode->i_mode) || inode->i_mode & S_ISVTX)) + return -EPERM; + + error = reiserfs_permission_locked (inode, MAY_WRITE, NULL); + if (error) + return error; + + return reiserfs_xattr_set (inode, name, buffer, size, flags); +} + +static int +user_del (struct inode *inode, const char *name) +{ + int error; + + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs_user (inode->i_sb)) + return -EOPNOTSUPP; + + if (!S_ISREG (inode->i_mode) && + (!S_ISDIR (inode->i_mode) || inode->i_mode & S_ISVTX)) + return -EPERM; + + error = reiserfs_permission_locked (inode, MAY_WRITE, NULL); + if (error) + return error; + + return 0; +} + +static int +user_list (struct inode *inode, const char *name, int namelen, char *out) +{ + int len = namelen; + if (!reiserfs_xattrs_user (inode->i_sb)) + return 0; + + if (out) + memcpy (out, name, len); + + return len; +} + +struct reiserfs_xattr_handler user_handler = { + prefix: XATTR_USER_PREFIX, + get: user_get, + set: user_set, + del: user_del, + list: user_list, +}; diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/bio.h linux-2.6.5-rc1-mm2/include/linux/bio.h --- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/bio.h 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/include/linux/bio.h 2004-03-19 16:10:28.737456662 +0100 @@ -140,6 +140,8 @@ struct bio { #define bio_cur_sectors(bio) (bio_iovec(bio)->bv_len >> 9) #define bio_data(bio) (page_address(bio_page((bio))) + bio_offset((bio))) #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) +#define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST)) +#define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) #define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC)) /* diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/blkdev.h linux-2.6.5-rc1-mm2/include/linux/blkdev.h --- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/blkdev.h 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/include/linux/blkdev.h 2004-03-19 16:10:28.667464197 +0100 @@ -195,6 +195,8 @@ enum rq_flag_bits { __REQ_PM_SUSPEND, /* suspend request */ __REQ_PM_RESUME, /* resume request */ __REQ_PM_SHUTDOWN, /* shutdown request */ + __REQ_BAR_PREFLUSH, /* barrier pre-flush done */ + __REQ_BAR_POSTFLUSH, /* barrier post-flush */ __REQ_NR_BITS, /* stops here */ }; @@ -220,6 +222,8 @@ enum rq_flag_bits { #define REQ_PM_SUSPEND (1 << __REQ_PM_SUSPEND) #define REQ_PM_RESUME (1 << __REQ_PM_RESUME) #define REQ_PM_SHUTDOWN (1 << __REQ_PM_SHUTDOWN) +#define REQ_BAR_PREFLUSH (1 << __REQ_BAR_PREFLUSH) +#define REQ_BAR_POSTFLUSH (1 << __REQ_BAR_POSTFLUSH) /* * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME @@ -371,6 +375,7 @@ struct request_queue #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ #define 
QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */ #define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */ +#define QUEUE_FLAG_ORDERED 8 /* supports ordered writes */ #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) @@ -386,6 +391,10 @@ struct request_queue #define blk_pm_request(rq) \ ((rq)->flags & (REQ_PM_SUSPEND | REQ_PM_RESUME)) +#define blk_barrier_rq(rq) ((rq)->flags & REQ_HARDBARRIER) +#define blk_barrier_preflush(rq) ((rq)->flags & REQ_BAR_PREFLUSH) +#define blk_barrier_postflush(rq) ((rq)->flags & REQ_BAR_POSTFLUSH) + #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) #define rq_data_dir(rq) ((rq)->flags & 1) @@ -583,6 +592,7 @@ extern void blk_queue_prep_rq(request_qu extern void blk_queue_merge_bvec(request_queue_t *, merge_bvec_fn *); extern void blk_queue_dma_alignment(request_queue_t *, int); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); +extern void blk_queue_ordered(request_queue_t *, int); extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); @@ -610,6 +620,7 @@ extern long blk_congestion_wait(int rw, extern void blk_rq_bio_prep(request_queue_t *, struct request *, struct bio *); extern void blk_rq_prep_restart(struct request *); +extern int blkdev_issue_flush(struct block_device *); #define MAX_PHYS_SEGMENTS 128 #define MAX_HW_SEGMENTS 128 diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/buffer_head.h linux-2.6.5-rc1-mm2/include/linux/buffer_head.h --- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/buffer_head.h 2004-03-11 03:55:21.000000000 +0100 +++ linux-2.6.5-rc1-mm2/include/linux/buffer_head.h 2004-03-19 16:10:27.727565387 +0100 @@ -26,6 +26,7 @@ enum bh_state_bits { BH_Delay, /* Buffer is not yet allocated on disk */ BH_Boundary, /* Block is followed by a discontiguity */ BH_Write_EIO, /* I/O error on write */ + BH_Ordered, /* ordered write */ BH_PrivateStart,/* not a state bit, but the first bit available * for private allocation by other entities @@ -117,7 +118,8 @@ BUFFER_FNS(Async_Read, async_read) BUFFER_FNS(Async_Write, async_write) BUFFER_FNS(Delay, delay) BUFFER_FNS(Boundary, boundary) -BUFFER_FNS(Write_EIO,write_io_error) +BUFFER_FNS(Write_EIO, write_io_error) +BUFFER_FNS(Ordered, ordered) #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) #define touch_buffer(bh) mark_page_accessed(bh->b_page) diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/ext3_fs.h linux-2.6.5-rc1-mm2/include/linux/ext3_fs.h --- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/ext3_fs.h 2004-03-11 03:55:33.000000000 +0100 +++ linux-2.6.5-rc1-mm2/include/linux/ext3_fs.h 2004-03-19 16:10:28.716458922 +0100 @@ -324,6 +324,7 @@ struct ext3_inode { #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ +#define EXT3_MOUNT_BARRIER 0x10000 /* Use block barriers */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/fs.h linux-2.6.5-rc1-mm2/include/linux/fs.h --- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/fs.h 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/include/linux/fs.h 
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/buffer_head.h linux-2.6.5-rc1-mm2/include/linux/buffer_head.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/buffer_head.h	2004-03-11 03:55:21.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/buffer_head.h	2004-03-19 16:10:27.727565387 +0100
@@ -26,6 +26,7 @@ enum bh_state_bits {
 	BH_Delay,	/* Buffer is not yet allocated on disk */
 	BH_Boundary,	/* Block is followed by a discontiguity */
 	BH_Write_EIO,	/* I/O error on write */
+	BH_Ordered,	/* ordered write */
 
 	BH_PrivateStart,/* not a state bit, but the first bit available
 			 * for private allocation by other entities
@@ -117,7 +118,8 @@ BUFFER_FNS(Async_Read, async_read)
 BUFFER_FNS(Async_Write, async_write)
 BUFFER_FNS(Delay, delay)
 BUFFER_FNS(Boundary, boundary)
-BUFFER_FNS(Write_EIO,write_io_error)
+BUFFER_FNS(Write_EIO, write_io_error)
+BUFFER_FNS(Ordered, ordered)
 
 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
 #define touch_buffer(bh)	mark_page_accessed(bh->b_page)
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/ext3_fs.h linux-2.6.5-rc1-mm2/include/linux/ext3_fs.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/ext3_fs.h	2004-03-11 03:55:33.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/ext3_fs.h	2004-03-19 16:10:28.716458922 +0100
@@ -324,6 +324,7 @@ struct ext3_inode {
 #define EXT3_MOUNT_NO_UID32		0x2000  /* Disable 32-bit UIDs */
 #define EXT3_MOUNT_XATTR_USER		0x4000	/* Extended user attributes */
 #define EXT3_MOUNT_POSIX_ACL		0x8000	/* POSIX Access Control Lists */
+#define EXT3_MOUNT_BARRIER		0x10000	/* Use block barriers */
 
 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/fs.h linux-2.6.5-rc1-mm2/include/linux/fs.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/fs.h	2004-03-19 15:13:58.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/fs.h	2004-03-19 16:10:28.669463982 +0100
@@ -85,6 +85,7 @@ extern int leases_enable, dir_notify_ena
 #define SPECIAL 4	/* For non-blockdevice requests in request queue */
 #define READ_SYNC	(READ | (1 << BIO_RW_SYNC))
 #define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))
+#define WRITE_BARRIER	((1 << BIO_RW) | (1 << BIO_RW_BARRIER))
 
 #define SEL_IN		1
 #define SEL_OUT		2
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/ide.h linux-2.6.5-rc1-mm2/include/linux/ide.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/ide.h	2004-03-19 15:13:58.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/ide.h	2004-03-19 16:10:28.682462582 +0100
@@ -732,6 +732,8 @@ typedef struct ide_drive_s {
 	u8	bios_head;	/* BIOS/fdisk/LILO number of heads */
 	u8	bios_sect;	/* BIOS/fdisk/LILO sectors per track */
 	u8	queue_depth;	/* max queue depth */
+	u8	doing_barrier;	/* state, 1=currently doing flush */
+	u8	last_rq_flush;	/* last rq was a flush */
 
 	unsigned int	bios_cyl;	/* BIOS/fdisk/LILO number of cyls */
 	unsigned int	cyl;		/* "real" number of cyls */
@@ -744,6 +746,7 @@ typedef struct ide_drive_s {
 	int	lun;			/* logical unit */
 	int	crc_count;		/* crc counter to reduce drive speed */
+	char	special_buf[8];		/* private command buffer */
 	struct list_head list;
 	struct device	gendev;
 	struct semaphore gendev_rel_sem;	/* to deal with device release() */
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/jbd.h linux-2.6.5-rc1-mm2/include/linux/jbd.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/jbd.h	2004-03-11 03:55:43.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/jbd.h	2004-03-19 16:10:28.739456446 +0100
@@ -825,6 +825,7 @@ struct journal_s
 #define JFS_ACK_ERR	0x004	/* The errno in the sb has been acked */
 #define JFS_FLUSHED	0x008	/* The journal superblock has been flushed */
 #define JFS_LOADED	0x010	/* The journal superblock has been loaded */
+#define JFS_BARRIER	0x020	/* Use IDE barriers */
 
 /*
  * Function declarations for the journaling transaction and buffer
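On the filesystem side, BH_Ordered plus WRITE_BARRIER give the journalling code a cheap commit-block protocol: tag the commit buffer ordered, write it synchronously, and fall back to a plain write if the queue lacks QUEUE_FLAG_ORDERED. A sketch of that pattern, assuming (as the buffer-layer half of this series implies) that submit_bh() upgrades an ordered WRITE to WRITE_BARRIER and that the rejection surfaces as -EOPNOTSUPP:

static int write_commit_block (journal_t *journal, struct buffer_head *bh)
{
	int ret;

	set_buffer_ordered (bh);	/* BH_Ordered from this patch */
	ret = sync_dirty_buffer (bh);

	if (ret == -EOPNOTSUPP) {
		/* device cannot do barriers: disable them for future
		 * commits and retry as an ordinary synchronous write */
		journal->j_flags &= ~JFS_BARRIER;
		clear_buffer_ordered (bh);
		set_buffer_uptodate (bh);
		set_buffer_dirty (bh);
		ret = sync_dirty_buffer (bh);
	}
	return ret;
}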
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_acl.h linux-2.6.5-rc1-mm2/include/linux/reiserfs_acl.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_acl.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/reiserfs_acl.h	2004-03-19 16:10:28.821447619 +0100
@@ -0,0 +1,91 @@
+#include
+#include
+#include
+
+#define REISERFS_ACL_VERSION	0x0001
+
+typedef struct {
+	__u16		e_tag;
+	__u16		e_perm;
+	__u32		e_id;
+} reiserfs_acl_entry;
+
+typedef struct {
+	__u16		e_tag;
+	__u16		e_perm;
+} reiserfs_acl_entry_short;
+
+typedef struct {
+	__u32		a_version;
+} reiserfs_acl_header;
+
+static inline size_t reiserfs_acl_size(int count)
+{
+	if (count <= 4) {
+		return sizeof(reiserfs_acl_header) +
+		       count * sizeof(reiserfs_acl_entry_short);
+	} else {
+		return sizeof(reiserfs_acl_header) +
+		       4 * sizeof(reiserfs_acl_entry_short) +
+		       (count - 4) * sizeof(reiserfs_acl_entry);
+	}
+}
+
+static inline int reiserfs_acl_count(size_t size)
+{
+	ssize_t s;
+	size -= sizeof(reiserfs_acl_header);
+	s = size - 4 * sizeof(reiserfs_acl_entry_short);
+	if (s < 0) {
+		if (size % sizeof(reiserfs_acl_entry_short))
+			return -1;
+		return size / sizeof(reiserfs_acl_entry_short);
+	} else {
+		if (s % sizeof(reiserfs_acl_entry))
+			return -1;
+		return s / sizeof(reiserfs_acl_entry) + 4;
+	}
+}
+
+#ifdef CONFIG_REISERFS_FS_POSIX_ACL
+struct posix_acl * reiserfs_get_acl(struct inode *inode, int type);
+int reiserfs_set_acl(struct inode *inode, int type,
+		     struct posix_acl *acl);
+int reiserfs_acl_chmod (struct inode *inode);
+int reiserfs_inherit_default_acl (struct inode *dir, struct dentry *dentry, struct inode *inode);
+int reiserfs_cache_default_acl (struct inode *dir);
+extern int reiserfs_xattr_posix_acl_init (void) __init;
+extern int reiserfs_xattr_posix_acl_exit (void);
+extern struct reiserfs_xattr_handler posix_acl_default_handler;
+extern struct reiserfs_xattr_handler posix_acl_access_handler;
+#else
+
+#define reiserfs_set_acl NULL
+#define reiserfs_get_acl NULL
+#define reiserfs_cache_default_acl(inode) 0
+
+static inline int
+reiserfs_xattr_posix_acl_init (void)
+{
+	return 0;
+}
+
+static inline int
+reiserfs_xattr_posix_acl_exit (void)
+{
+	return 0;
+}
+
+static inline int
+reiserfs_acl_chmod (struct inode *inode)
+{
+	return 0;
+}
+
+static inline int
+reiserfs_inherit_default_acl (const struct inode *dir, struct dentry *dentry, struct inode *inode)
+{
+	return 0;
+}
+
+#endif
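The two inline helpers are exact inverses. With the layout above and no structure padding, the header is 4 bytes, a short entry 4 bytes, and a full entry 8 bytes, so 3 entries encode to 16 bytes and 6 entries to 36, and reiserfs_acl_count() recovers the counts (returning -1 for a size that cannot be a valid encoding). A quick illustrative self-check:

static inline void reiserfs_acl_layout_check (void)
{
	/* count <= 4: size = 4 + 4*count;  count > 4: size = 20 + 8*(count-4) */
	BUG_ON (reiserfs_acl_count (reiserfs_acl_size (3)) != 3);	/* 16 bytes */
	BUG_ON (reiserfs_acl_count (reiserfs_acl_size (6)) != 6);	/* 36 bytes */
}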
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs.h linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs.h	2004-03-19 15:13:58.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs.h	2004-03-19 16:30:24.730644952 +0100
@@ -268,6 +268,7 @@ int is_reiserfs_jr (struct reiserfs_supe
 #define NO_DISK_SPACE -3
 #define NO_BALANCING_NEEDED  (-4)
 #define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5)
+#define QUOTA_EXCEEDED -6
 
 typedef __u32 b_blocknr_t;
 typedef __u32 unp_t;
@@ -287,7 +288,7 @@ struct unfm_nodeinfo {
 #define STAT_DATA_V2 1
 
-static inline struct reiserfs_inode_info *REISERFS_I(struct inode *inode)
+static inline struct reiserfs_inode_info *REISERFS_I(const struct inode *inode)
 {
 	return container_of(inode, struct reiserfs_inode_info, vfs_inode);
 }
@@ -1238,7 +1239,6 @@ excessive effort to avoid disturbing the
 gods only know how we are going to SMP the code that uses them.  znodes
 are the way! */
-
 struct path {
     int path_length;	/* Length of the array above. */
     struct path_element path_elements[EXTENDED_MAX_HEIGHT];	/* Array of the path elements. */
@@ -1702,45 +1702,91 @@ struct reiserfs_journal_header {
 	(((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12))))
 #define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
 
-/* finds n'th buffer with 0 being the start of this commit.  Needs to go away, j_ap_blocks has changed
-** since I created this.  One chunk of code in journal.c needs changing before deleting it
-*/
-#define JOURNAL_BUFFER(j,n) ((j)->j_ap_blocks[((j)->j_start + (n)) % JOURNAL_BLOCK_COUNT])
-
 // We need these to make journal.c code more readable
 #define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 #define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 #define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 
-void reiserfs_commit_for_inode(struct inode *) ;
+enum reiserfs_bh_state_bits {
+    BH_JDirty = BH_PrivateStart,
+    BH_JDirty_wait,
+    BH_JNew,
+    BH_JPrepared,
+    BH_JRestore_dirty,
+    BH_JTest,	// debugging only, will go away
+};
+
+/*
+** transaction handle which is passed around for all journal calls
+*/
+struct reiserfs_transaction_handle {
+	struct super_block *t_super ;	/* super for this FS when journal_begin was
+					   called; saves calls to reiserfs_get_super.
+					   also used by nested transactions to make
+					   sure they are nesting on the right FS
+					   _must_ be first in the handle
+					*/
+	int t_refcount;
+	int t_blocks_logged ;		/* number of blocks this writer has logged */
+	int t_blocks_allocated ;	/* number of blocks this writer allocated */
+	unsigned long t_trans_id ;	/* sanity check, equals the current trans id */
+	void *t_handle_save ;		/* save existing current->journal_info */
+	int displace_new_blocks:1;	/* if new block allocation occurs, that block
+					   should be displaced from others */
+} ;
+
+/* used to keep track of ordered and tail writes, attached to the buffer
+ * head through b_journal_head.
+ */
+struct reiserfs_jh {
+	struct reiserfs_journal_list *jl;
+	struct buffer_head *bh;
+	struct list_head list;
+};
+
+void reiserfs_free_jh(struct buffer_head *bh);
+int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh);
+int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh);
+int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
+
+static inline int reiserfs_transaction_running(struct super_block *s) {
+    struct reiserfs_transaction_handle *th = current->journal_info ;
+    if (th && th->t_super == s)
+        return 1 ;
+    if (th && th->t_super == NULL)
+        BUG();
+    return 0 ;
+}
+
+int reiserfs_async_progress_wait(struct super_block *s);
+
+struct reiserfs_transaction_handle *
+reiserfs_persistent_transaction(struct super_block *, int count);
+int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
+int reiserfs_commit_page(struct inode *inode, struct page *page,
+		unsigned from, unsigned to);
+int reiserfs_flush_old_commits(struct super_block *);
+int reiserfs_commit_for_inode(struct inode *) ;
+int reiserfs_inode_needs_commit(struct inode *) ;
 void reiserfs_update_inode_transaction(struct inode *) ;
 void reiserfs_wait_on_write_block(struct super_block *s) ;
 void reiserfs_block_writes(struct reiserfs_transaction_handle *th) ;
 void reiserfs_allow_writes(struct super_block *s) ;
 void reiserfs_check_lock_depth(char *caller) ;
-void reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ;
+int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ;
 void reiserfs_restore_prepared_buffer(struct super_block *, struct buffer_head *bh) ;
 int journal_init(struct super_block *, const char * j_dev_name, int old_format, unsigned int) ;
 int journal_release(struct reiserfs_transaction_handle*, struct super_block *) ;
 int journal_release_error(struct reiserfs_transaction_handle*, struct super_block *) ;
 int journal_end(struct reiserfs_transaction_handle *, struct super_block *, unsigned long) ;
 int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *, unsigned long) ;
-int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
 int journal_mark_freed(struct reiserfs_transaction_handle *, struct super_block *, b_blocknr_t blocknr) ;
-int push_journal_writer(char *w) ;
-int pop_journal_writer(int windex) ;
 int journal_transaction_should_end(struct reiserfs_transaction_handle *, int) ;
 int reiserfs_in_journal(struct super_block *p_s_sb, int bmap_nr, int bit_nr, int searchall, b_blocknr_t *next) ;
 int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long) ;
-void flush_async_commits(struct super_block *p_s_sb) ;
 int buffer_journaled(const struct buffer_head *bh) ;
 int mark_buffer_journal_new(struct buffer_head *bh) ;
-int reiserfs_add_page_to_flush_list(struct reiserfs_transaction_handle *,
-                                    struct inode *, struct buffer_head *) ;
-int reiserfs_remove_page_from_flush_list(struct reiserfs_transaction_handle *,
-                                         struct inode *) ;
-
 int reiserfs_allocate_list_bitmaps(struct super_block *s, struct reiserfs_list_bitmap *, int) ;
 
 /* why is this kerplunked right here? */
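The persistent handle pair above decouples a transaction's lifetime from the stack frame that opened it, which matters when a caller may or may not already be inside a transaction (t_refcount lets handles nest). A hedged sketch of the intended calling convention (helper body illustrative; note that reiserfs_prepare_for_journal() now returns int):

static int log_one_buffer (struct super_block *s, struct buffer_head *bh)
{
	struct reiserfs_transaction_handle *th;

	th = reiserfs_persistent_transaction (s, 1);	/* reserve 1 block */
	if (!th)
		return -ENOMEM;

	reiserfs_prepare_for_journal (s, bh, 1);
	journal_mark_dirty (th, s, bh);

	return reiserfs_end_persistent_transaction (th);
}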
@@ -1844,11 +1890,13 @@ void pathrelse_and_restore (struct super
 
 int reiserfs_insert_item (struct reiserfs_transaction_handle *th,
 			  struct path * path,
 			  const struct cpu_key * key,
-			  struct item_head * ih, const char * body);
+			  struct item_head * ih,
+			  struct inode *inode, const char * body);
 
 int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th,
 			      struct path * path,
 			      const struct cpu_key * key,
+			      struct inode *inode,
 			      const char * body, int paste_size);
 
 int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th,
@@ -1865,7 +1913,7 @@ int reiserfs_delete_item (struct reiserf
 			  struct buffer_head  * p_s_un_bh);
 
 void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th,
-				 struct key * key);
+				 struct inode *inode, struct key * key);
 void reiserfs_delete_object (struct reiserfs_transaction_handle *th, struct inode * p_s_inode);
 void reiserfs_do_truncate (struct reiserfs_transaction_handle *th, struct inode * p_s_inode, struct page *,
@@ -1910,11 +1958,22 @@ int reiserfs_new_inode (struct reiserfs_
 			struct inode * dir, int mode,
 			const char * symname, loff_t i_size,
 			struct dentry *dentry, struct inode *inode);
-int reiserfs_sync_inode (struct reiserfs_transaction_handle *th, struct inode * inode);
-void reiserfs_update_sd (struct reiserfs_transaction_handle *th, struct inode * inode);
+
+int reiserfs_sync_inode (struct reiserfs_transaction_handle *th,
+			 struct inode * inode);
+
+void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th,
+			      struct inode * inode, loff_t size);
+
+static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
+				      struct inode *inode)
+{
+    reiserfs_update_sd_size(th, inode, inode->i_size) ;
+}
 
 void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode );
 void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs );
+int reiserfs_setattr(struct dentry *dentry, struct iattr *attr);
 
 /* namei.c */
 void set_de_name_and_namelen (struct reiserfs_dir_entry * de);
@@ -1965,6 +2024,7 @@ int reiserfs_global_version_in_proc( cha
 
 /* dir.c */
 extern struct inode_operations reiserfs_dir_inode_operations;
+extern struct inode_operations reiserfs_symlink_inode_operations;
 extern struct file_operations reiserfs_dir_operations;
 
 /* tail_conversion.c */
@@ -2082,7 +2142,7 @@ typedef struct __reiserfs_blocknr_hint r
 
 int reiserfs_parse_alloc_options (struct super_block *, char *);
 int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value);
-void reiserfs_free_block (struct reiserfs_transaction_handle *th, b_blocknr_t);
+void reiserfs_free_block (struct reiserfs_transaction_handle *th, struct inode *, b_blocknr_t, int for_unformatted);
 int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t * , int, int);
 extern inline int reiserfs_new_form_blocknrs (struct tree_balance * tb, b_blocknr_t *new_blocknrs, int amount_needed)
@@ -2183,6 +2243,9 @@ int reiserfs_unpack (struct inode * inod
 #define reiserfs_write_lock( sb ) lock_kernel()
 #define reiserfs_write_unlock( sb ) unlock_kernel()
 
+/* xattr stuff */
+#define REISERFS_XATTR_DIR_SEM(s) (REISERFS_SB(s)->xattr_dir_sem)
+
 #endif /* _LINUX_REISER_FS_H */
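Several tree-level entry points above (reiserfs_insert_item, reiserfs_paste_into_item, reiserfs_delete_solid_item, reiserfs_free_block) gain a struct inode argument, giving the balancing code an owner to charge, consistent with the new QUOTA_EXCEEDED error. Call sites update mechanically; an illustrative before/after for reiserfs_free_block (values hypothetical):

	/* before: no owner context available at the call site */
	reiserfs_free_block (th, block);

	/* after: pass the owning inode, and say whether the freed block
	 * held unformatted data rather than tree nodes */
	reiserfs_free_block (th, inode, block, 1 /* for_unformatted */);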
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs_i.h linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs_i.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs_i.h	2004-03-11 03:55:43.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs_i.h	2004-03-19 16:10:28.740456339 +0100
@@ -3,6 +3,8 @@
 
 #include
 
+struct reiserfs_journal_list;
+
 /** bitmasks for i_flags field in reiserfs-specific part of inode */
 typedef enum {
     /** this says what format of key do all items (but stat data) of
@@ -20,7 +22,9 @@ typedef enum {
 	truncate or unlink. Safe link is used to avoid leakage of disk
 	space on crash with some files open, but unlinked. */
     i_link_saved_unlink_mask   =  0x0010,
-    i_link_saved_truncate_mask =  0x0020
+    i_link_saved_truncate_mask =  0x0020,
+    i_priv_object = 0x0080,
+    i_has_xattr_dir = 0x0100,
 } reiserfs_inode_flags;
 
@@ -48,7 +52,11 @@ struct reiserfs_inode_info {
     ** needs to be committed in order for this inode to be properly
     ** flushed */
     unsigned long i_trans_id ;
-    unsigned long i_trans_index ;
+    struct reiserfs_journal_list *i_jl;
+
+    struct posix_acl *i_acl_access;
+    struct posix_acl *i_acl_default;
+    struct rw_semaphore xattr_sem;
 
     struct inode vfs_inode;
 };
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs_sb.h linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs_sb.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs_sb.h	2004-03-11 03:55:27.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs_sb.h	2004-03-19 16:30:24.728645167 +0100
@@ -6,6 +6,7 @@
 
 #ifdef __KERNEL__
 #include
+#include
 #endif
 
 typedef enum {
@@ -106,21 +107,6 @@ typedef enum {
 #define JOURNAL_MAX_CNODE   1500 /* max cnodes to allocate. */
 #define JOURNAL_HASH_SIZE 8192
 #define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating.  Must be >= 2 */
-#define JOURNAL_LIST_COUNT 64
-
-/* these are bh_state bit flag offset numbers, for use in the buffer head */
-
-#define BH_JDirty       16      /* journal data needs to be written before buffer can be marked dirty */
-#define BH_JDirty_wait 18	/* commit is done, buffer marked dirty */
-#define BH_JNew 19		/* buffer allocated during this transaction, no need to write if freed during this trans too */
-
-/* ugly.  metadata blocks must be prepared before they can be logged.
-** prepared means unlocked and cleaned.  If the block is prepared, but not
-** logged for some reason, any bits cleared while preparing it must be
-** set again.
-*/
-#define BH_JPrepared 20		/* block has been prepared for the log */
-#define BH_JRestore_dirty 22	/* restore the dirty bit later */
 
 /* One of these for every block in every transaction
 ** Each one is in two hash tables.  First, a hash of the current transaction, and after journal_end, a
@@ -154,22 +140,6 @@ struct reiserfs_list_bitmap {
 } ;
 
 /*
-** transaction handle which is passed around for all journal calls
-*/
-struct reiserfs_transaction_handle {
-  /* ifdef it. -Hans */
-  char *t_caller ;              /* debugging use */
-  int t_blocks_logged ;         /* number of blocks this writer has logged */
-  int t_blocks_allocated ;      /* number of blocks this writer allocated */
-  unsigned long t_trans_id ;    /* sanity check, equals the current trans id */
-  struct super_block *t_super ; /* super for this FS when journal_begin was
-                                   called. saves calls to reiserfs_get_super */
-  int displace_new_blocks:1;    /* if new block allocation occurres, that block
-                                   should be displaced from others */
-
-} ;
-
-/*
 ** one of these for each transaction.  The most important part here is the j_realblock.
 ** this list of cnodes is used to hash all the blocks in all the commits, to mark all the
 ** real buffer heads dirty once all the commits hit the disk,
 ** and to make sure every real block in a transaction is on disk before allowing the log area
 ** to be overwritten */
 struct reiserfs_journal_list {
   unsigned long j_start ;
+  unsigned long j_state;
   unsigned long j_len ;
   atomic_t j_nonzerolen ;
   atomic_t j_commit_left ;
-  atomic_t j_flushing ;
-  atomic_t j_commit_flushing ;
   atomic_t j_older_commits_done ;      /* all commits older than this on disk*/
+  struct semaphore j_commit_lock;
   unsigned long j_trans_id ;
   time_t j_timestamp ;
   struct reiserfs_list_bitmap *j_list_bitmap ;
   struct buffer_head *j_commit_bh ;    /* commit buffer head */
   struct reiserfs_journal_cnode *j_realblock ;
   struct reiserfs_journal_cnode *j_freedlist ; /* list of buffers that were freed during this trans.  free each of these on flush */
-  wait_queue_head_t j_commit_wait ;    /* wait for all the commit blocks to be flushed */
-  wait_queue_head_t j_flush_wait ;     /* wait for all the real blocks to be flushed */
-} ;
+  /* time ordered list of all active transactions */
+  struct list_head j_list;
 
-struct reiserfs_page_list ; /* defined in reiserfs_fs.h */
+  /* time ordered list of all transactions we haven't tried to flush yet */
+  struct list_head j_working_list;
+
+  /* list of tail conversion targets in need of flush before commit */
+  struct list_head j_tail_bh_list;
+  /* list of data=ordered buffers in need of flush before commit */
+  struct list_head j_bh_list;
+  int j_refcount;
+} ;
 
 struct reiserfs_journal {
   struct buffer_head ** j_ap_blocks ; /* journal blocks on disk */
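The per-list j_commit_lock replaces the old j_flushing/j_commit_flushing atomics and their wait queues: instead of open-coded sleep and wakeup loops, a committer simply holds the semaphore for the duration. A hedged sketch of the resulting shape (helper name illustrative):

static void commit_one_list (struct reiserfs_journal_list *jl)
{
	down (&jl->j_commit_lock);	/* waits out any commit in flight */
	if (atomic_read (&jl->j_commit_left) > 0)
		write_and_wait_on_commit_block (jl);	/* illustrative */
	up (&jl->j_commit_lock);
}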
@@ -216,16 +193,11 @@ struct reiserfs_journal {
   unsigned long j_last_flush_trans_id ;    /* last fully flushed journal timestamp */
   struct buffer_head *j_header_bh ;
 
-  /* j_flush_pages must be flushed before the current transaction can
-  ** commit
-  */
-  struct reiserfs_page_list *j_flush_pages ;
   time_t j_trans_start_time ;         /* time this transaction started */
-  wait_queue_head_t j_wait ;          /* wait journal_end to finish I/O */
-  atomic_t j_wlock ;                  /* lock for j_wait */
+  struct semaphore j_lock;
+  struct semaphore j_flush_sem;
   wait_queue_head_t j_join_wait ;     /* wait for current transaction to finish before starting new one */
   atomic_t j_jlock ;                  /* lock for j_join_wait */
-  int j_journal_list_index ;	      /* journal list number of the current trans */
   int j_list_bitmap_index ;	      /* number of next list bitmap to use */
   int j_must_wait ;		      /* no more journal begins allowed. MUST sleep on j_join_wait */
   int j_next_full_flush ;             /* next journal_end will flush all journal list */
@@ -242,24 +214,43 @@ struct reiserfs_journal {
   struct reiserfs_journal_cnode *j_cnode_free_list ;
   struct reiserfs_journal_cnode *j_cnode_free_orig ;             /* orig pointer returned from vmalloc */
 
+  struct reiserfs_journal_list *j_current_jl;
   int j_free_bitmap_nodes ;
   int j_used_bitmap_nodes ;
+
+  int j_num_lists;      /* total number of active transactions */
+  int j_num_work_lists; /* number that need attention from kreiserfsd */
+
+  /* debugging to make sure things are flushed in order */
+  int j_last_flush_id;
+
+  /* debugging to make sure things are committed in order */
+  int j_last_commit_id;
+
   struct list_head j_bitmap_nodes ;
   struct list_head j_dirty_buffers ;
   spinlock_t j_dirty_buffers_lock ; /* protects j_dirty_buffers */
+
+  /* list of all active transactions */
+  struct list_head j_journal_list;
+  /* lists that haven't been touched by writeback attempts */
+  struct list_head j_working_list;
+
   struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS] ;	/* array of bitmaps to record the deleted blocks */
-  struct reiserfs_journal_list j_journal_list[JOURNAL_LIST_COUNT] ;	/* array of all the journal lists */
   struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE] ;	/* hash table for real buffer heads in current trans */
   struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for all the real buffer heads in all the transactions */
   struct list_head j_prealloc_list;     /* list of inodes which have preallocated blocks */
   unsigned long j_max_trans_size ;
   unsigned long j_max_batch_size ;
+
+  /* when flushing ordered buffers, throttle new ordered writers */
+  struct work_struct j_work;
+  atomic_t j_async_throttle;
 };
 
 #define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick.  magic string to find desc blocks in the journal */
 
-
 typedef __u32 (*hashf_t) (const signed char *, int);
 
 struct reiserfs_bitmap_info
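j_work and j_async_throttle exist so that flushing data=ordered buffers can be pushed off to a worker while new ordered writers are held back; reiserfs_async_progress_wait(), declared earlier in this patch, would be the blocking side writers use. A sketch of the kick side, under that assumption:

static void kick_async_flush (struct reiserfs_journal *journal)
{
	atomic_inc (&journal->j_async_throttle);	/* writers back off */
	schedule_work (&journal->j_work);	/* worker flushes, then decrements */
}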
@@ -403,18 +394,22 @@ struct reiserfs_sb_info
     struct proc_dir_entry *procdir;
     int reserved_blocks; /* amount of blocks reserved for further allocations */
     spinlock_t bitmap_lock; /* this lock on now only used to protect reserved_blocks variable */
+    struct dentry *priv_root; /* root of /.reiserfs_priv */
+    struct dentry *xattr_root; /* root of /.reiserfs_priv/.xa */
+    struct rw_semaphore xattr_dir_sem;
+
 };
 
 /* Definitions of reiserfs on-disk properties: */
 #define REISERFS_3_5 0
 #define REISERFS_3_6 1
 
+enum reiserfs_mount_options {
 /* Mount options */
-#define REISERFS_LARGETAIL 0  /* large tails will be created in a session */
-#define REISERFS_SMALLTAIL 17 /* small (for files less than block size) tails will be created in a session */
-#define REPLAYONLY 3 /* replay journal and return 0. Use by fsck */
-#define REISERFS_NOLOG 4      /* -o nolog: turn journalling off */
-#define REISERFS_CONVERT 5    /* -o conv: causes conversion of old
+    REISERFS_LARGETAIL,  /* large tails will be created in a session */
+    REISERFS_SMALLTAIL,  /* small (for files less than block size) tails will be created in a session */
+    REPLAYONLY,          /* replay journal and return 0. Used by fsck */
+    REISERFS_CONVERT,    /* -o conv: causes conversion of old
                              format super block to the new
                              format. If not specified - old
                              partition will be dealt with in a
@@ -428,26 +423,34 @@ struct reiserfs_sb_info
 ** the existing hash on the FS, so if you have a tea hash disk, and mount
 ** with -o hash=rupasov, the mount will fail.
 */
-#define FORCE_TEA_HASH 6      /* try to force tea hash on mount */
-#define FORCE_RUPASOV_HASH 7  /* try to force rupasov hash on mount */
-#define FORCE_R5_HASH 8       /* try to force rupasov hash on mount */
-#define FORCE_HASH_DETECT 9   /* try to detect hash function on mount */
-
+    FORCE_TEA_HASH,      /* try to force tea hash on mount */
+    FORCE_RUPASOV_HASH,  /* try to force rupasov hash on mount */
+    FORCE_R5_HASH,       /* try to force r5 hash on mount */
+    FORCE_HASH_DETECT,   /* try to detect hash function on mount */
+
+    REISERFS_DATA_LOG,
+    REISERFS_DATA_ORDERED,
+    REISERFS_DATA_WRITEBACK,
 
 /* used for testing experimental features, makes benchmarking new
    features with and without more convenient, should never be used by
   users in any code shipped to users (ideally) */
 
-#define REISERFS_NO_BORDER 11
-#define REISERFS_NO_UNHASHED_RELOCATION 12
-#define REISERFS_HASHED_RELOCATION 13
-
-#define REISERFS_ATTRS 15
-
-#define REISERFS_TEST1 11
-#define REISERFS_TEST2 12
-#define REISERFS_TEST3 13
-#define REISERFS_TEST4 14
+    REISERFS_NO_BORDER,
+    REISERFS_NO_UNHASHED_RELOCATION,
+    REISERFS_HASHED_RELOCATION,
+    REISERFS_ATTRS,
+    REISERFS_XATTRS,
+    REISERFS_XATTRS_USER,
+    REISERFS_POSIXACL,
+    REISERFS_BARRIER_NONE,
+    REISERFS_BARRIER_FLUSH,
+
+    REISERFS_TEST1,
+    REISERFS_TEST2,
+    REISERFS_TEST3,
+    REISERFS_TEST4,
+};
 
 #define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH))
 #define reiserfs_rupasov_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_RUPASOV_HASH))
@@ -461,17 +464,21 @@ struct reiserfs_sb_info
 #define have_large_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_LARGETAIL))
 #define have_small_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_SMALLTAIL))
 #define replay_only(s) (REISERFS_SB(s)->s_mount_opt & (1 << REPLAYONLY))
-#define reiserfs_dont_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NOLOG))
 #define reiserfs_attrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ATTRS))
 #define old_format_only(s) (REISERFS_SB(s)->s_properties & (1 << REISERFS_3_5))
 #define convert_reiserfs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_CONVERT))
-
+#define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG))
+#define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED))
+#define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
+#define reiserfs_xattrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS))
+#define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER))
+#define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL))
+#define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s))
+#define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE))
+#define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH))
 
 void reiserfs_file_buffer (struct buffer_head * bh, int list);
 extern struct file_system_type reiserfs_fs_type;
-int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
-int flush_old_commits(struct super_block *s, int) ;
-int show_reiserfs_locks(void) ;
 int reiserfs_resize(struct super_block *, unsigned long) ;
 
 #define CARRY_ON                0
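Turning the option numbers into an enum removes hand-assigned bit values that had already collided (REISERFS_NO_BORDER and REISERFS_TEST1 both claimed bit 11 above); s_mount_opt remains a plain bitmask. An illustrative fragment of option handling under the new names (the barrier= spellings are assumed from the rest of this series):

static void apply_barrier_option (struct super_block *s, const char *arg)
{
	if (!strcmp (arg, "none"))
		REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_BARRIER_NONE);
	else if (!strcmp (arg, "flush"))
		REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
	/* later, reiserfs_barrier_flush(s) selects the flush-based commit */
}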
@@ -481,8 +488,6 @@ int reiserfs_resize(struct super_block *
 #define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh)
 #define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal)
 #define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block)
-#define SB_JOURNAL_LIST(s) (SB_JOURNAL(s)->j_journal_list)
-#define SB_JOURNAL_LIST_INDEX(s) (SB_JOURNAL(s)->j_journal_list_index)
 #define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free)
 #define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap)
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_xattr.h linux-2.6.5-rc1-mm2/include/linux/reiserfs_xattr.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_xattr.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/reiserfs_xattr.h	2004-03-19 16:10:28.814448373 +0100
@@ -0,0 +1,132 @@
+/*
+  File: linux/reiserfs_xattr.h
+*/
+
+#include
+#include
+#include
+
+/* Magic value in header */
+#define REISERFS_XATTR_MAGIC 0x52465841	/* "RFXA" */
+
+struct reiserfs_xattr_header {
+	__u32 h_magic;	/* magic number for identification */
+	__u32 h_hash;	/* hash of the value */
+};
+
+#ifdef __KERNEL__
+
+struct reiserfs_xattr_handler {
+	char *prefix;
+	int (*init)(void);
+	void (*exit)(void);
+	int (*get)(struct inode *inode, const char *name, void *buffer,
+		   size_t size);
+	int (*set)(struct inode *inode, const char *name, const void *buffer,
+		   size_t size, int flags);
+	int (*del)(struct inode *inode, const char *name);
+	int (*list)(struct inode *inode, const char *name, int namelen, char *out);
+	struct list_head handlers;
+};
+
+#ifdef CONFIG_REISERFS_FS_XATTR
+#define is_reiserfs_priv_object(inode) (REISERFS_I(inode)->i_flags & i_priv_object)
+#define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir)
+ssize_t reiserfs_getxattr (struct dentry *dentry, const char *name,
+			   void *buffer, size_t size);
+int reiserfs_setxattr (struct dentry *dentry, const char *name,
+		       const void *value, size_t size, int flags);
+ssize_t reiserfs_listxattr (struct dentry *dentry, char *buffer, size_t size);
+int reiserfs_removexattr (struct dentry *dentry, const char *name);
+int reiserfs_delete_xattrs (struct inode *inode);
+int reiserfs_chown_xattrs (struct inode *inode, struct iattr *attrs);
+int reiserfs_xattr_init (struct super_block *sb, int mount_flags);
+int reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd);
+int reiserfs_permission_locked (struct inode *inode, int mask, struct nameidata *nd);
+
+int reiserfs_xattr_del (struct inode *, const char *);
+int reiserfs_xattr_get (const struct inode *, const char *, void *, size_t);
+int reiserfs_xattr_set (struct inode *, const char *, const void *,
+			size_t, int);
+
+extern struct reiserfs_xattr_handler user_handler;
+extern struct reiserfs_xattr_handler trusted_handler;
+#ifdef CONFIG_REISERFS_FS_SECURITY
+extern struct reiserfs_xattr_handler security_handler;
+#endif
+
+int reiserfs_xattr_register_handlers (void) __init;
+void reiserfs_xattr_unregister_handlers (void);
+
+static inline void
+reiserfs_write_lock_xattrs(struct super_block *sb)
+{
+	down_write (&REISERFS_XATTR_DIR_SEM(sb));
+}
+static inline void
+reiserfs_write_unlock_xattrs(struct super_block *sb)
+{
+	up_write (&REISERFS_XATTR_DIR_SEM(sb));
+}
+static inline void
+reiserfs_read_lock_xattrs(struct super_block *sb)
+{
+	down_read (&REISERFS_XATTR_DIR_SEM(sb));
+}
+
+static inline void
+reiserfs_read_unlock_xattrs(struct super_block *sb)
+{
+	up_read (&REISERFS_XATTR_DIR_SEM(sb));
+}
+
+static inline void
+reiserfs_write_lock_xattr_i(struct inode *inode)
+{
+	down_write (&REISERFS_I(inode)->xattr_sem);
+}
+static inline void
+reiserfs_write_unlock_xattr_i(struct inode *inode)
+{
+	up_write (&REISERFS_I(inode)->xattr_sem);
+}
+static inline void
+reiserfs_read_lock_xattr_i(struct inode *inode)
+{
+	down_read (&REISERFS_I(inode)->xattr_sem);
+}
+
+static inline void
+reiserfs_read_unlock_xattr_i(struct inode *inode)
+{
+	up_read (&REISERFS_I(inode)->xattr_sem);
+}
+
+#else
+
+#define is_reiserfs_priv_object(inode) 0
+#define reiserfs_getxattr NULL
+#define reiserfs_setxattr NULL
+#define reiserfs_listxattr NULL
+#define reiserfs_removexattr NULL
+#define reiserfs_write_lock_xattrs(sb)
+#define reiserfs_write_unlock_xattrs(sb)
+#define reiserfs_read_lock_xattrs(sb)
+#define reiserfs_read_unlock_xattrs(sb)
+
+#define reiserfs_permission NULL
+
+#define reiserfs_xattr_register_handlers() 0
+#define reiserfs_xattr_unregister_handlers()
+
+static inline int reiserfs_delete_xattrs (struct inode *inode) { return 0; };
+static inline int reiserfs_chown_xattrs (struct inode *inode, struct iattr *attrs) { return 0; };
+static inline int reiserfs_xattr_init (struct super_block *sb, int mount_flags)
+{
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL); /* to be sure */
+	return 0;
+};
+#endif
+
+#endif /* __KERNEL__ */
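The two semaphore pairs protect different things: xattr_dir_sem serializes changes to the hidden /.reiserfs_priv/.xa directory tree per super block, while xattr_sem covers one inode's attribute bodies; whichever nesting order the xattr code in this series establishes has to be used consistently everywhere. An illustrative read-side caller, nesting the directory lock inside the per-inode lock:

static int get_xattr_locked (struct inode *inode, const char *name,
			     void *buf, size_t len)
{
	int err;

	reiserfs_read_lock_xattr_i (inode);
	reiserfs_read_lock_xattrs (inode->i_sb);
	err = reiserfs_xattr_get (inode, name, buf, len);
	reiserfs_read_unlock_xattrs (inode->i_sb);
	reiserfs_read_unlock_xattr_i (inode);
	return err;
}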